{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4483, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 2.3125102519989014, "eval_runtime": 636.0298, "eval_samples_per_second": 99.225, "eval_steps_per_second": 1.552, "step": 0 }, { "epoch": 0.00022306491188935982, "grad_norm": 447.7063293457031, "learning_rate": 0.0, "loss": 2.6996, "step": 1 }, { "epoch": 0.00044612982377871963, "grad_norm": 410.13433837890625, "learning_rate": 2.0000000000000002e-07, "loss": 2.5839, "step": 2 }, { "epoch": 0.0006691947356680794, "grad_norm": 514.7770385742188, "learning_rate": 4.0000000000000003e-07, "loss": 2.538, "step": 3 }, { "epoch": 0.0008922596475574393, "grad_norm": 73.62993621826172, "learning_rate": 6.000000000000001e-07, "loss": 2.5648, "step": 4 }, { "epoch": 0.001115324559446799, "grad_norm": 35.405982971191406, "learning_rate": 8.000000000000001e-07, "loss": 2.5318, "step": 5 }, { "epoch": 0.0013383894713361588, "grad_norm": 31.43743133544922, "learning_rate": 1.0000000000000002e-06, "loss": 2.4902, "step": 6 }, { "epoch": 0.0015614543832255187, "grad_norm": 32.07968521118164, "learning_rate": 1.2000000000000002e-06, "loss": 2.4386, "step": 7 }, { "epoch": 0.0017845192951148785, "grad_norm": 19.430936813354492, "learning_rate": 1.4000000000000001e-06, "loss": 2.0901, "step": 8 }, { "epoch": 0.002007584207004238, "grad_norm": 20.800960540771484, "learning_rate": 1.6000000000000001e-06, "loss": 2.1737, "step": 9 }, { "epoch": 0.002230649118893598, "grad_norm": 20.56642723083496, "learning_rate": 1.8000000000000001e-06, "loss": 2.1823, "step": 10 }, { "epoch": 0.002453714030782958, "grad_norm": 20.692852020263672, "learning_rate": 2.0000000000000003e-06, "loss": 2.1278, "step": 11 }, { "epoch": 0.0026767789426723177, "grad_norm": 14.481375694274902, "learning_rate": 2.2e-06, "loss": 1.9787, "step": 12 }, { "epoch": 0.0028998438545616775, "grad_norm": 13.366188049316406, "learning_rate": 2.4000000000000003e-06, "loss": 1.9635, "step": 13 }, { "epoch": 0.0031229087664510374, "grad_norm": 14.910027503967285, "learning_rate": 2.6e-06, "loss": 1.8838, "step": 14 }, { "epoch": 0.0033459736783403972, "grad_norm": 13.10051441192627, "learning_rate": 2.8000000000000003e-06, "loss": 1.8825, "step": 15 }, { "epoch": 0.003569038590229757, "grad_norm": 14.625020027160645, "learning_rate": 3e-06, "loss": 1.6607, "step": 16 }, { "epoch": 0.0037921035021191165, "grad_norm": 9.08655834197998, "learning_rate": 3.2000000000000003e-06, "loss": 1.5911, "step": 17 }, { "epoch": 0.004015168414008476, "grad_norm": 8.972038269042969, "learning_rate": 3.4000000000000005e-06, "loss": 1.5538, "step": 18 }, { "epoch": 0.004238233325897837, "grad_norm": 7.949400901794434, "learning_rate": 3.6000000000000003e-06, "loss": 1.5478, "step": 19 }, { "epoch": 0.004461298237787196, "grad_norm": 7.048157215118408, "learning_rate": 3.8000000000000005e-06, "loss": 1.3381, "step": 20 }, { "epoch": 0.004684363149676556, "grad_norm": 7.008754730224609, "learning_rate": 4.000000000000001e-06, "loss": 1.3275, "step": 21 }, { "epoch": 0.004907428061565916, "grad_norm": 6.20928955078125, "learning_rate": 4.2000000000000004e-06, "loss": 1.2469, "step": 22 }, { "epoch": 0.005130492973455275, "grad_norm": 4.8463029861450195, "learning_rate": 4.4e-06, "loss": 1.1167, "step": 23 }, { "epoch": 0.005353557885344635, "grad_norm": 20.94325065612793, "learning_rate": 4.600000000000001e-06, "loss": 1.1391, "step": 24 }, { "epoch": 0.005576622797233995, "grad_norm": 3.783233404159546, "learning_rate": 4.800000000000001e-06, "loss": 1.0692, "step": 25 }, { "epoch": 0.005799687709123355, "grad_norm": 3.780400276184082, "learning_rate": 5e-06, "loss": 1.0715, "step": 26 }, { "epoch": 0.0060227526210127145, "grad_norm": 3.059216260910034, "learning_rate": 5.2e-06, "loss": 1.0057, "step": 27 }, { "epoch": 0.006245817532902075, "grad_norm": 2.7420809268951416, "learning_rate": 5.400000000000001e-06, "loss": 0.9421, "step": 28 }, { "epoch": 0.006468882444791434, "grad_norm": 2.8878915309906006, "learning_rate": 5.600000000000001e-06, "loss": 0.9625, "step": 29 }, { "epoch": 0.0066919473566807944, "grad_norm": 2.9568190574645996, "learning_rate": 5.8e-06, "loss": 0.924, "step": 30 }, { "epoch": 0.006915012268570154, "grad_norm": 2.239628314971924, "learning_rate": 6e-06, "loss": 0.8886, "step": 31 }, { "epoch": 0.007138077180459514, "grad_norm": 2.2312073707580566, "learning_rate": 6.200000000000001e-06, "loss": 0.9105, "step": 32 }, { "epoch": 0.0073611420923488735, "grad_norm": 1.7348324060440063, "learning_rate": 6.4000000000000006e-06, "loss": 0.8685, "step": 33 }, { "epoch": 0.007584207004238233, "grad_norm": 1.3611754179000854, "learning_rate": 6.600000000000001e-06, "loss": 0.8088, "step": 34 }, { "epoch": 0.007807271916127593, "grad_norm": 1.2034807205200195, "learning_rate": 6.800000000000001e-06, "loss": 0.8101, "step": 35 }, { "epoch": 0.008030336828016953, "grad_norm": 1.3573353290557861, "learning_rate": 7e-06, "loss": 0.8048, "step": 36 }, { "epoch": 0.008253401739906312, "grad_norm": 1.4660226106643677, "learning_rate": 7.2000000000000005e-06, "loss": 0.7663, "step": 37 }, { "epoch": 0.008476466651795673, "grad_norm": 1.4917473793029785, "learning_rate": 7.4e-06, "loss": 0.7847, "step": 38 }, { "epoch": 0.008699531563685033, "grad_norm": 0.9017640352249146, "learning_rate": 7.600000000000001e-06, "loss": 0.7472, "step": 39 }, { "epoch": 0.008922596475574392, "grad_norm": 0.9448667764663696, "learning_rate": 7.800000000000002e-06, "loss": 0.7384, "step": 40 }, { "epoch": 0.009145661387463751, "grad_norm": 1.0112793445587158, "learning_rate": 8.000000000000001e-06, "loss": 0.735, "step": 41 }, { "epoch": 0.009368726299353113, "grad_norm": 0.8147673606872559, "learning_rate": 8.2e-06, "loss": 0.7463, "step": 42 }, { "epoch": 0.009591791211242472, "grad_norm": 0.8109986782073975, "learning_rate": 8.400000000000001e-06, "loss": 0.7275, "step": 43 }, { "epoch": 0.009814856123131831, "grad_norm": 0.6843619346618652, "learning_rate": 8.6e-06, "loss": 0.7413, "step": 44 }, { "epoch": 0.01003792103502119, "grad_norm": 0.5661330223083496, "learning_rate": 8.8e-06, "loss": 0.7226, "step": 45 }, { "epoch": 0.01026098594691055, "grad_norm": 0.8250797986984253, "learning_rate": 9e-06, "loss": 0.7099, "step": 46 }, { "epoch": 0.010484050858799911, "grad_norm": 0.5656530857086182, "learning_rate": 9.200000000000002e-06, "loss": 0.7145, "step": 47 }, { "epoch": 0.01070711577068927, "grad_norm": 0.7327658534049988, "learning_rate": 9.4e-06, "loss": 0.716, "step": 48 }, { "epoch": 0.01093018068257863, "grad_norm": 0.5881125330924988, "learning_rate": 9.600000000000001e-06, "loss": 0.6747, "step": 49 }, { "epoch": 0.01115324559446799, "grad_norm": 0.5523539185523987, "learning_rate": 9.800000000000001e-06, "loss": 0.6521, "step": 50 }, { "epoch": 0.01137631050635735, "grad_norm": 0.5446374416351318, "learning_rate": 1e-05, "loss": 0.6752, "step": 51 }, { "epoch": 0.01159937541824671, "grad_norm": 0.475178599357605, "learning_rate": 1.02e-05, "loss": 0.7108, "step": 52 }, { "epoch": 0.01182244033013607, "grad_norm": 0.7163774967193604, "learning_rate": 1.04e-05, "loss": 0.6717, "step": 53 }, { "epoch": 0.012045505242025429, "grad_norm": 0.4185207784175873, "learning_rate": 1.0600000000000002e-05, "loss": 0.6645, "step": 54 }, { "epoch": 0.012268570153914788, "grad_norm": 0.43035364151000977, "learning_rate": 1.0800000000000002e-05, "loss": 0.6814, "step": 55 }, { "epoch": 0.01249163506580415, "grad_norm": 0.37698522210121155, "learning_rate": 1.1000000000000001e-05, "loss": 0.7271, "step": 56 }, { "epoch": 0.012714699977693509, "grad_norm": 0.41325655579566956, "learning_rate": 1.1200000000000001e-05, "loss": 0.6718, "step": 57 }, { "epoch": 0.012937764889582868, "grad_norm": 0.4455505907535553, "learning_rate": 1.14e-05, "loss": 0.6461, "step": 58 }, { "epoch": 0.013160829801472228, "grad_norm": 0.39668968319892883, "learning_rate": 1.16e-05, "loss": 0.6744, "step": 59 }, { "epoch": 0.013383894713361589, "grad_norm": 0.43382027745246887, "learning_rate": 1.18e-05, "loss": 0.686, "step": 60 }, { "epoch": 0.013606959625250948, "grad_norm": 0.3178131878376007, "learning_rate": 1.2e-05, "loss": 0.6461, "step": 61 }, { "epoch": 0.013830024537140308, "grad_norm": 0.36169150471687317, "learning_rate": 1.22e-05, "loss": 0.6521, "step": 62 }, { "epoch": 0.014053089449029667, "grad_norm": 0.3932245671749115, "learning_rate": 1.2400000000000002e-05, "loss": 0.7118, "step": 63 }, { "epoch": 0.014276154360919028, "grad_norm": 0.3924449384212494, "learning_rate": 1.2600000000000001e-05, "loss": 0.6723, "step": 64 }, { "epoch": 0.014499219272808388, "grad_norm": 0.32354405522346497, "learning_rate": 1.2800000000000001e-05, "loss": 0.6589, "step": 65 }, { "epoch": 0.014722284184697747, "grad_norm": 0.3272797465324402, "learning_rate": 1.3000000000000001e-05, "loss": 0.6413, "step": 66 }, { "epoch": 0.014945349096587107, "grad_norm": 0.36967551708221436, "learning_rate": 1.3200000000000002e-05, "loss": 0.6432, "step": 67 }, { "epoch": 0.015168414008476466, "grad_norm": 0.2998528480529785, "learning_rate": 1.3400000000000002e-05, "loss": 0.6543, "step": 68 }, { "epoch": 0.015391478920365827, "grad_norm": 0.30827096104621887, "learning_rate": 1.3600000000000002e-05, "loss": 0.6715, "step": 69 }, { "epoch": 0.015614543832255186, "grad_norm": 0.3031168580055237, "learning_rate": 1.38e-05, "loss": 0.6504, "step": 70 }, { "epoch": 0.015837608744144548, "grad_norm": 0.3063329756259918, "learning_rate": 1.4e-05, "loss": 0.6214, "step": 71 }, { "epoch": 0.016060673656033905, "grad_norm": 0.2707573175430298, "learning_rate": 1.4200000000000001e-05, "loss": 0.6347, "step": 72 }, { "epoch": 0.016283738567923266, "grad_norm": 0.3051629066467285, "learning_rate": 1.4400000000000001e-05, "loss": 0.6318, "step": 73 }, { "epoch": 0.016506803479812624, "grad_norm": 0.3260219991207123, "learning_rate": 1.46e-05, "loss": 0.6219, "step": 74 }, { "epoch": 0.016729868391701985, "grad_norm": 0.2733941376209259, "learning_rate": 1.48e-05, "loss": 0.6384, "step": 75 }, { "epoch": 0.016952933303591346, "grad_norm": 0.35662564635276794, "learning_rate": 1.5000000000000002e-05, "loss": 0.622, "step": 76 }, { "epoch": 0.017175998215480704, "grad_norm": 0.31423047184944153, "learning_rate": 1.5200000000000002e-05, "loss": 0.6344, "step": 77 }, { "epoch": 0.017399063127370065, "grad_norm": 0.3120841681957245, "learning_rate": 1.54e-05, "loss": 0.6566, "step": 78 }, { "epoch": 0.017622128039259423, "grad_norm": 0.30338573455810547, "learning_rate": 1.5600000000000003e-05, "loss": 0.6537, "step": 79 }, { "epoch": 0.017845192951148784, "grad_norm": 0.3202762007713318, "learning_rate": 1.58e-05, "loss": 0.6705, "step": 80 }, { "epoch": 0.018068257863038145, "grad_norm": 0.3163682520389557, "learning_rate": 1.6000000000000003e-05, "loss": 0.6175, "step": 81 }, { "epoch": 0.018291322774927503, "grad_norm": 0.2831353545188904, "learning_rate": 1.62e-05, "loss": 0.6301, "step": 82 }, { "epoch": 0.018514387686816864, "grad_norm": 0.267448365688324, "learning_rate": 1.64e-05, "loss": 0.6224, "step": 83 }, { "epoch": 0.018737452598706225, "grad_norm": 0.2999647259712219, "learning_rate": 1.66e-05, "loss": 0.6294, "step": 84 }, { "epoch": 0.018960517510595583, "grad_norm": 0.27424749732017517, "learning_rate": 1.6800000000000002e-05, "loss": 0.5998, "step": 85 }, { "epoch": 0.019183582422484944, "grad_norm": 0.2582665979862213, "learning_rate": 1.7e-05, "loss": 0.6043, "step": 86 }, { "epoch": 0.0194066473343743, "grad_norm": 0.2448103129863739, "learning_rate": 1.72e-05, "loss": 0.631, "step": 87 }, { "epoch": 0.019629712246263663, "grad_norm": 0.25323718786239624, "learning_rate": 1.7400000000000003e-05, "loss": 0.6248, "step": 88 }, { "epoch": 0.019852777158153024, "grad_norm": 0.32291799783706665, "learning_rate": 1.76e-05, "loss": 0.5797, "step": 89 }, { "epoch": 0.02007584207004238, "grad_norm": 0.3439197242259979, "learning_rate": 1.7800000000000002e-05, "loss": 0.6141, "step": 90 }, { "epoch": 0.020298906981931743, "grad_norm": 0.30653074383735657, "learning_rate": 1.8e-05, "loss": 0.6226, "step": 91 }, { "epoch": 0.0205219718938211, "grad_norm": 0.5994735956192017, "learning_rate": 1.8200000000000002e-05, "loss": 0.614, "step": 92 }, { "epoch": 0.02074503680571046, "grad_norm": 0.30260932445526123, "learning_rate": 1.8400000000000003e-05, "loss": 0.6224, "step": 93 }, { "epoch": 0.020968101717599823, "grad_norm": 0.2914034426212311, "learning_rate": 1.86e-05, "loss": 0.6201, "step": 94 }, { "epoch": 0.02119116662948918, "grad_norm": 0.28529268503189087, "learning_rate": 1.88e-05, "loss": 0.6046, "step": 95 }, { "epoch": 0.02141423154137854, "grad_norm": 0.26552823185920715, "learning_rate": 1.9e-05, "loss": 0.6058, "step": 96 }, { "epoch": 0.021637296453267903, "grad_norm": 0.4168465733528137, "learning_rate": 1.9200000000000003e-05, "loss": 0.6289, "step": 97 }, { "epoch": 0.02186036136515726, "grad_norm": 0.40763556957244873, "learning_rate": 1.94e-05, "loss": 0.6113, "step": 98 }, { "epoch": 0.02208342627704662, "grad_norm": 0.29686489701271057, "learning_rate": 1.9600000000000002e-05, "loss": 0.6288, "step": 99 }, { "epoch": 0.02230649118893598, "grad_norm": 0.27713605761528015, "learning_rate": 1.98e-05, "loss": 0.6404, "step": 100 }, { "epoch": 0.02252955610082534, "grad_norm": 0.2593631148338318, "learning_rate": 2e-05, "loss": 0.5652, "step": 101 }, { "epoch": 0.0227526210127147, "grad_norm": 0.2575121819972992, "learning_rate": 1.999999972306855e-05, "loss": 0.6062, "step": 102 }, { "epoch": 0.02297568592460406, "grad_norm": 0.24929626286029816, "learning_rate": 1.999999889227421e-05, "loss": 0.6336, "step": 103 }, { "epoch": 0.02319875083649342, "grad_norm": 0.2693648040294647, "learning_rate": 1.9999997507617033e-05, "loss": 0.6611, "step": 104 }, { "epoch": 0.023421815748382778, "grad_norm": 0.5212802290916443, "learning_rate": 1.9999995569097088e-05, "loss": 0.6162, "step": 105 }, { "epoch": 0.02364488066027214, "grad_norm": 0.25279495120048523, "learning_rate": 1.999999307671449e-05, "loss": 0.6062, "step": 106 }, { "epoch": 0.0238679455721615, "grad_norm": 0.2529296278953552, "learning_rate": 1.999999003046937e-05, "loss": 0.6164, "step": 107 }, { "epoch": 0.024091010484050858, "grad_norm": 0.3054715096950531, "learning_rate": 1.9999986430361896e-05, "loss": 0.5984, "step": 108 }, { "epoch": 0.02431407539594022, "grad_norm": 0.26101136207580566, "learning_rate": 1.9999982276392274e-05, "loss": 0.5809, "step": 109 }, { "epoch": 0.024537140307829577, "grad_norm": 0.26034069061279297, "learning_rate": 1.9999977568560734e-05, "loss": 0.6026, "step": 110 }, { "epoch": 0.024760205219718938, "grad_norm": 0.2570051848888397, "learning_rate": 1.999997230686753e-05, "loss": 0.6299, "step": 111 }, { "epoch": 0.0249832701316083, "grad_norm": 0.2621450424194336, "learning_rate": 1.999996649131296e-05, "loss": 0.6141, "step": 112 }, { "epoch": 0.025206335043497657, "grad_norm": 0.2425510734319687, "learning_rate": 1.999996012189734e-05, "loss": 0.6184, "step": 113 }, { "epoch": 0.025429399955387018, "grad_norm": 0.28203898668289185, "learning_rate": 1.999995319862103e-05, "loss": 0.5922, "step": 114 }, { "epoch": 0.02565246486727638, "grad_norm": 0.2976488173007965, "learning_rate": 1.9999945721484407e-05, "loss": 0.586, "step": 115 }, { "epoch": 0.025875529779165737, "grad_norm": 0.27894967794418335, "learning_rate": 1.999993769048789e-05, "loss": 0.6099, "step": 116 }, { "epoch": 0.026098594691055098, "grad_norm": 0.2924571931362152, "learning_rate": 1.999992910563192e-05, "loss": 0.5646, "step": 117 }, { "epoch": 0.026321659602944456, "grad_norm": 0.2446456104516983, "learning_rate": 1.9999919966916976e-05, "loss": 0.5722, "step": 118 }, { "epoch": 0.026544724514833817, "grad_norm": 0.23295633494853973, "learning_rate": 1.9999910274343562e-05, "loss": 0.5806, "step": 119 }, { "epoch": 0.026767789426723178, "grad_norm": 0.24767954647541046, "learning_rate": 1.999990002791221e-05, "loss": 0.6003, "step": 120 }, { "epoch": 0.026990854338612535, "grad_norm": 0.25390875339508057, "learning_rate": 1.99998892276235e-05, "loss": 0.5798, "step": 121 }, { "epoch": 0.027213919250501897, "grad_norm": 0.22989511489868164, "learning_rate": 1.999987787347802e-05, "loss": 0.5873, "step": 122 }, { "epoch": 0.027436984162391254, "grad_norm": 0.4493498206138611, "learning_rate": 1.99998659654764e-05, "loss": 0.5994, "step": 123 }, { "epoch": 0.027660049074280615, "grad_norm": 0.2416534572839737, "learning_rate": 1.99998535036193e-05, "loss": 0.5787, "step": 124 }, { "epoch": 0.027883113986169977, "grad_norm": 0.23413562774658203, "learning_rate": 1.9999840487907414e-05, "loss": 0.608, "step": 125 }, { "epoch": 0.028106178898059334, "grad_norm": 0.28015846014022827, "learning_rate": 1.9999826918341462e-05, "loss": 0.6034, "step": 126 }, { "epoch": 0.028329243809948695, "grad_norm": 0.2494441717863083, "learning_rate": 1.999981279492219e-05, "loss": 0.6256, "step": 127 }, { "epoch": 0.028552308721838057, "grad_norm": 0.3094983398914337, "learning_rate": 1.9999798117650385e-05, "loss": 0.6351, "step": 128 }, { "epoch": 0.028775373633727414, "grad_norm": 0.24032524228096008, "learning_rate": 1.9999782886526863e-05, "loss": 0.5703, "step": 129 }, { "epoch": 0.028998438545616775, "grad_norm": 0.24950377643108368, "learning_rate": 1.9999767101552458e-05, "loss": 0.5952, "step": 130 }, { "epoch": 0.029221503457506133, "grad_norm": 0.24877335131168365, "learning_rate": 1.999975076272805e-05, "loss": 0.6143, "step": 131 }, { "epoch": 0.029444568369395494, "grad_norm": 0.2579665184020996, "learning_rate": 1.999973387005455e-05, "loss": 0.6063, "step": 132 }, { "epoch": 0.029667633281284855, "grad_norm": 0.2793569564819336, "learning_rate": 1.9999716423532884e-05, "loss": 0.613, "step": 133 }, { "epoch": 0.029890698193174213, "grad_norm": 0.23061081767082214, "learning_rate": 1.999969842316402e-05, "loss": 0.5918, "step": 134 }, { "epoch": 0.030113763105063574, "grad_norm": 0.2353406399488449, "learning_rate": 1.999967986894896e-05, "loss": 0.585, "step": 135 }, { "epoch": 0.030336828016952932, "grad_norm": 0.2254764884710312, "learning_rate": 1.9999660760888722e-05, "loss": 0.611, "step": 136 }, { "epoch": 0.030559892928842293, "grad_norm": 0.24640028178691864, "learning_rate": 1.9999641098984378e-05, "loss": 0.5911, "step": 137 }, { "epoch": 0.030782957840731654, "grad_norm": 0.22932305932044983, "learning_rate": 1.9999620883237004e-05, "loss": 0.5999, "step": 138 }, { "epoch": 0.031006022752621012, "grad_norm": 0.2379560023546219, "learning_rate": 1.999960011364773e-05, "loss": 0.5987, "step": 139 }, { "epoch": 0.031229087664510373, "grad_norm": 0.22063873708248138, "learning_rate": 1.99995787902177e-05, "loss": 0.5867, "step": 140 }, { "epoch": 0.03145215257639973, "grad_norm": 0.23651117086410522, "learning_rate": 1.9999556912948096e-05, "loss": 0.5809, "step": 141 }, { "epoch": 0.031675217488289095, "grad_norm": 0.2621421813964844, "learning_rate": 1.9999534481840134e-05, "loss": 0.5935, "step": 142 }, { "epoch": 0.03189828240017845, "grad_norm": 0.24506062269210815, "learning_rate": 1.9999511496895047e-05, "loss": 0.5931, "step": 143 }, { "epoch": 0.03212134731206781, "grad_norm": 0.2224111109972, "learning_rate": 1.999948795811412e-05, "loss": 0.6156, "step": 144 }, { "epoch": 0.03234441222395717, "grad_norm": 0.23910044133663177, "learning_rate": 1.9999463865498644e-05, "loss": 0.5839, "step": 145 }, { "epoch": 0.03256747713584653, "grad_norm": 0.2555723786354065, "learning_rate": 1.9999439219049964e-05, "loss": 0.6474, "step": 146 }, { "epoch": 0.03279054204773589, "grad_norm": 0.25059348344802856, "learning_rate": 1.9999414018769442e-05, "loss": 0.5846, "step": 147 }, { "epoch": 0.03301360695962525, "grad_norm": 0.22700609266757965, "learning_rate": 1.9999388264658467e-05, "loss": 0.5867, "step": 148 }, { "epoch": 0.03323667187151461, "grad_norm": 0.25858354568481445, "learning_rate": 1.9999361956718476e-05, "loss": 0.5984, "step": 149 }, { "epoch": 0.03345973678340397, "grad_norm": 0.2813456356525421, "learning_rate": 1.9999335094950922e-05, "loss": 0.5943, "step": 150 }, { "epoch": 0.03368280169529333, "grad_norm": 0.288655549287796, "learning_rate": 1.9999307679357293e-05, "loss": 0.5975, "step": 151 }, { "epoch": 0.03390586660718269, "grad_norm": 0.23721511662006378, "learning_rate": 1.9999279709939102e-05, "loss": 0.5637, "step": 152 }, { "epoch": 0.03412893151907205, "grad_norm": 0.23953364789485931, "learning_rate": 1.999925118669791e-05, "loss": 0.5693, "step": 153 }, { "epoch": 0.03435199643096141, "grad_norm": 0.23194071650505066, "learning_rate": 1.9999222109635283e-05, "loss": 0.5446, "step": 154 }, { "epoch": 0.03457506134285077, "grad_norm": 0.2160234898328781, "learning_rate": 1.999919247875284e-05, "loss": 0.5812, "step": 155 }, { "epoch": 0.03479812625474013, "grad_norm": 0.22934173047542572, "learning_rate": 1.999916229405222e-05, "loss": 0.6149, "step": 156 }, { "epoch": 0.03502119116662949, "grad_norm": 0.24892570078372955, "learning_rate": 1.999913155553509e-05, "loss": 0.6053, "step": 157 }, { "epoch": 0.035244256078518846, "grad_norm": 0.2235739678144455, "learning_rate": 1.9999100263203165e-05, "loss": 0.5799, "step": 158 }, { "epoch": 0.03546732099040821, "grad_norm": 0.21852736175060272, "learning_rate": 1.9999068417058168e-05, "loss": 0.5793, "step": 159 }, { "epoch": 0.03569038590229757, "grad_norm": 0.269131600856781, "learning_rate": 1.9999036017101864e-05, "loss": 0.5817, "step": 160 }, { "epoch": 0.035913450814186926, "grad_norm": 0.22516575455665588, "learning_rate": 1.999900306333605e-05, "loss": 0.6, "step": 161 }, { "epoch": 0.03613651572607629, "grad_norm": 0.24313850700855255, "learning_rate": 1.999896955576255e-05, "loss": 0.5738, "step": 162 }, { "epoch": 0.03635958063796565, "grad_norm": 0.26203158497810364, "learning_rate": 1.999893549438322e-05, "loss": 0.5687, "step": 163 }, { "epoch": 0.036582645549855006, "grad_norm": 0.23275348544120789, "learning_rate": 1.9998900879199948e-05, "loss": 0.5698, "step": 164 }, { "epoch": 0.03680571046174437, "grad_norm": 0.2560306489467621, "learning_rate": 1.9998865710214646e-05, "loss": 0.5877, "step": 165 }, { "epoch": 0.03702877537363373, "grad_norm": 0.2175760716199875, "learning_rate": 1.999882998742927e-05, "loss": 0.5797, "step": 166 }, { "epoch": 0.037251840285523086, "grad_norm": 0.22381433844566345, "learning_rate": 1.999879371084579e-05, "loss": 0.5642, "step": 167 }, { "epoch": 0.03747490519741245, "grad_norm": 0.2363629788160324, "learning_rate": 1.9998756880466224e-05, "loss": 0.5651, "step": 168 }, { "epoch": 0.03769797010930181, "grad_norm": 0.22718748450279236, "learning_rate": 1.9998719496292603e-05, "loss": 0.5867, "step": 169 }, { "epoch": 0.037921035021191166, "grad_norm": 0.23127657175064087, "learning_rate": 1.9998681558327005e-05, "loss": 0.5724, "step": 170 }, { "epoch": 0.03814409993308052, "grad_norm": 0.23175480961799622, "learning_rate": 1.9998643066571527e-05, "loss": 0.5919, "step": 171 }, { "epoch": 0.03836716484496989, "grad_norm": 0.22871741652488708, "learning_rate": 1.9998604021028304e-05, "loss": 0.5653, "step": 172 }, { "epoch": 0.038590229756859246, "grad_norm": 0.21524189412593842, "learning_rate": 1.999856442169949e-05, "loss": 0.6123, "step": 173 }, { "epoch": 0.0388132946687486, "grad_norm": 0.23523908853530884, "learning_rate": 1.999852426858729e-05, "loss": 0.5931, "step": 174 }, { "epoch": 0.03903635958063797, "grad_norm": 0.23540736734867096, "learning_rate": 1.9998483561693926e-05, "loss": 0.5614, "step": 175 }, { "epoch": 0.039259424492527326, "grad_norm": 0.22590956091880798, "learning_rate": 1.999844230102164e-05, "loss": 0.5917, "step": 176 }, { "epoch": 0.03948248940441668, "grad_norm": 0.22635167837142944, "learning_rate": 1.999840048657273e-05, "loss": 0.5886, "step": 177 }, { "epoch": 0.03970555431630605, "grad_norm": 0.24712443351745605, "learning_rate": 1.9998358118349513e-05, "loss": 0.5936, "step": 178 }, { "epoch": 0.039928619228195406, "grad_norm": 0.266133576631546, "learning_rate": 1.999831519635433e-05, "loss": 0.5662, "step": 179 }, { "epoch": 0.04015168414008476, "grad_norm": 0.2172316312789917, "learning_rate": 1.9998271720589558e-05, "loss": 0.5679, "step": 180 }, { "epoch": 0.04037474905197413, "grad_norm": 0.3194250464439392, "learning_rate": 1.999822769105761e-05, "loss": 0.6105, "step": 181 }, { "epoch": 0.040597813963863486, "grad_norm": 0.2604449391365051, "learning_rate": 1.9998183107760915e-05, "loss": 0.5942, "step": 182 }, { "epoch": 0.04082087887575284, "grad_norm": 0.26620209217071533, "learning_rate": 1.9998137970701955e-05, "loss": 0.5859, "step": 183 }, { "epoch": 0.0410439437876422, "grad_norm": 0.2345753312110901, "learning_rate": 1.9998092279883215e-05, "loss": 0.5939, "step": 184 }, { "epoch": 0.041267008699531565, "grad_norm": 0.22213764488697052, "learning_rate": 1.999804603530724e-05, "loss": 0.5815, "step": 185 }, { "epoch": 0.04149007361142092, "grad_norm": 0.23276378214359283, "learning_rate": 1.9997999236976587e-05, "loss": 0.5842, "step": 186 }, { "epoch": 0.04171313852331028, "grad_norm": 0.22480995953083038, "learning_rate": 1.9997951884893843e-05, "loss": 0.5897, "step": 187 }, { "epoch": 0.041936203435199645, "grad_norm": 0.22761783003807068, "learning_rate": 1.9997903979061635e-05, "loss": 0.5873, "step": 188 }, { "epoch": 0.042159268347089, "grad_norm": 0.27698761224746704, "learning_rate": 1.9997855519482614e-05, "loss": 0.5933, "step": 189 }, { "epoch": 0.04238233325897836, "grad_norm": 0.2002975046634674, "learning_rate": 1.9997806506159466e-05, "loss": 0.5698, "step": 190 }, { "epoch": 0.042605398170867725, "grad_norm": 0.233673095703125, "learning_rate": 1.9997756939094905e-05, "loss": 0.5765, "step": 191 }, { "epoch": 0.04282846308275708, "grad_norm": 0.26376616954803467, "learning_rate": 1.999770681829168e-05, "loss": 0.5741, "step": 192 }, { "epoch": 0.04305152799464644, "grad_norm": 0.22886492311954498, "learning_rate": 1.9997656143752556e-05, "loss": 0.5709, "step": 193 }, { "epoch": 0.043274592906535805, "grad_norm": 0.3004070520401001, "learning_rate": 1.9997604915480352e-05, "loss": 0.5514, "step": 194 }, { "epoch": 0.04349765781842516, "grad_norm": 0.22209997475147247, "learning_rate": 1.99975531334779e-05, "loss": 0.6095, "step": 195 }, { "epoch": 0.04372072273031452, "grad_norm": 0.20531293749809265, "learning_rate": 1.9997500797748067e-05, "loss": 0.5618, "step": 196 }, { "epoch": 0.04394378764220388, "grad_norm": 0.20507480204105377, "learning_rate": 1.9997447908293753e-05, "loss": 0.5465, "step": 197 }, { "epoch": 0.04416685255409324, "grad_norm": 0.22681620717048645, "learning_rate": 1.999739446511789e-05, "loss": 0.5546, "step": 198 }, { "epoch": 0.0443899174659826, "grad_norm": 0.21772046387195587, "learning_rate": 1.999734046822343e-05, "loss": 0.5934, "step": 199 }, { "epoch": 0.04461298237787196, "grad_norm": 0.22321215271949768, "learning_rate": 1.9997285917613375e-05, "loss": 0.579, "step": 200 }, { "epoch": 0.04483604728976132, "grad_norm": 0.21166378259658813, "learning_rate": 1.9997230813290737e-05, "loss": 0.5978, "step": 201 }, { "epoch": 0.04505911220165068, "grad_norm": 0.23722383379936218, "learning_rate": 1.999717515525857e-05, "loss": 0.5747, "step": 202 }, { "epoch": 0.04528217711354004, "grad_norm": 0.36909613013267517, "learning_rate": 1.9997118943519962e-05, "loss": 0.5417, "step": 203 }, { "epoch": 0.0455052420254294, "grad_norm": 0.23573465645313263, "learning_rate": 1.9997062178078023e-05, "loss": 0.5971, "step": 204 }, { "epoch": 0.04572830693731876, "grad_norm": 0.25214946269989014, "learning_rate": 1.9997004858935894e-05, "loss": 0.5852, "step": 205 }, { "epoch": 0.04595137184920812, "grad_norm": 0.20637395977973938, "learning_rate": 1.9996946986096754e-05, "loss": 0.5394, "step": 206 }, { "epoch": 0.046174436761097476, "grad_norm": 0.21749068796634674, "learning_rate": 1.9996888559563804e-05, "loss": 0.5662, "step": 207 }, { "epoch": 0.04639750167298684, "grad_norm": 0.2306276559829712, "learning_rate": 1.9996829579340284e-05, "loss": 0.6038, "step": 208 }, { "epoch": 0.0466205665848762, "grad_norm": 0.22346408665180206, "learning_rate": 1.999677004542946e-05, "loss": 0.6024, "step": 209 }, { "epoch": 0.046843631496765556, "grad_norm": 0.22103498876094818, "learning_rate": 1.9996709957834627e-05, "loss": 0.5805, "step": 210 }, { "epoch": 0.04706669640865492, "grad_norm": 0.2120116651058197, "learning_rate": 1.9996649316559118e-05, "loss": 0.5712, "step": 211 }, { "epoch": 0.04728976132054428, "grad_norm": 0.24006275832653046, "learning_rate": 1.9996588121606286e-05, "loss": 0.5487, "step": 212 }, { "epoch": 0.047512826232433636, "grad_norm": 0.2095099836587906, "learning_rate": 1.9996526372979522e-05, "loss": 0.555, "step": 213 }, { "epoch": 0.047735891144323, "grad_norm": 0.23283496499061584, "learning_rate": 1.999646407068225e-05, "loss": 0.6025, "step": 214 }, { "epoch": 0.04795895605621236, "grad_norm": 0.2015821784734726, "learning_rate": 1.9996401214717912e-05, "loss": 0.5725, "step": 215 }, { "epoch": 0.048182020968101716, "grad_norm": 0.2395429164171219, "learning_rate": 1.999633780509e-05, "loss": 0.5549, "step": 216 }, { "epoch": 0.04840508587999108, "grad_norm": 0.2157035768032074, "learning_rate": 1.9996273841802017e-05, "loss": 0.5644, "step": 217 }, { "epoch": 0.04862815079188044, "grad_norm": 0.2156548947095871, "learning_rate": 1.9996209324857516e-05, "loss": 0.5901, "step": 218 }, { "epoch": 0.048851215703769796, "grad_norm": 0.2213698923587799, "learning_rate": 1.999614425426006e-05, "loss": 0.5463, "step": 219 }, { "epoch": 0.049074280615659154, "grad_norm": 0.21965420246124268, "learning_rate": 1.9996078630013253e-05, "loss": 0.5741, "step": 220 }, { "epoch": 0.04929734552754852, "grad_norm": 0.213240385055542, "learning_rate": 1.999601245212074e-05, "loss": 0.5698, "step": 221 }, { "epoch": 0.049520410439437876, "grad_norm": 0.21078208088874817, "learning_rate": 1.9995945720586177e-05, "loss": 0.5923, "step": 222 }, { "epoch": 0.04974347535132723, "grad_norm": 0.21629688143730164, "learning_rate": 1.9995878435413264e-05, "loss": 0.5438, "step": 223 }, { "epoch": 0.0499665402632166, "grad_norm": 0.22872765362262726, "learning_rate": 1.9995810596605725e-05, "loss": 0.5543, "step": 224 }, { "epoch": 0.050189605175105956, "grad_norm": 0.2012244164943695, "learning_rate": 1.999574220416732e-05, "loss": 0.557, "step": 225 }, { "epoch": 0.05041267008699531, "grad_norm": 0.21912412345409393, "learning_rate": 1.9995673258101837e-05, "loss": 0.5816, "step": 226 }, { "epoch": 0.05063573499888468, "grad_norm": 0.22358618676662445, "learning_rate": 1.999560375841309e-05, "loss": 0.6222, "step": 227 }, { "epoch": 0.050858799910774036, "grad_norm": 0.22544872760772705, "learning_rate": 1.9995533705104936e-05, "loss": 0.5481, "step": 228 }, { "epoch": 0.05108186482266339, "grad_norm": 0.20495697855949402, "learning_rate": 1.999546309818125e-05, "loss": 0.5371, "step": 229 }, { "epoch": 0.05130492973455276, "grad_norm": 0.22267979383468628, "learning_rate": 1.9995391937645944e-05, "loss": 0.5916, "step": 230 }, { "epoch": 0.051527994646442116, "grad_norm": 0.200364887714386, "learning_rate": 1.9995320223502958e-05, "loss": 0.541, "step": 231 }, { "epoch": 0.05175105955833147, "grad_norm": 0.20090313255786896, "learning_rate": 1.9995247955756267e-05, "loss": 0.565, "step": 232 }, { "epoch": 0.05197412447022083, "grad_norm": 5.10443639755249, "learning_rate": 1.9995175134409868e-05, "loss": 0.6056, "step": 233 }, { "epoch": 0.052197189382110196, "grad_norm": 0.30669254064559937, "learning_rate": 1.99951017594678e-05, "loss": 0.589, "step": 234 }, { "epoch": 0.05242025429399955, "grad_norm": 0.21791155636310577, "learning_rate": 1.9995027830934125e-05, "loss": 0.5438, "step": 235 }, { "epoch": 0.05264331920588891, "grad_norm": 0.25966677069664, "learning_rate": 1.9994953348812937e-05, "loss": 0.582, "step": 236 }, { "epoch": 0.052866384117778276, "grad_norm": 0.24935361742973328, "learning_rate": 1.9994878313108362e-05, "loss": 0.5934, "step": 237 }, { "epoch": 0.05308944902966763, "grad_norm": 0.22276830673217773, "learning_rate": 1.9994802723824557e-05, "loss": 0.5845, "step": 238 }, { "epoch": 0.05331251394155699, "grad_norm": 0.2349478304386139, "learning_rate": 1.9994726580965704e-05, "loss": 0.5716, "step": 239 }, { "epoch": 0.053535578853446356, "grad_norm": 0.22332008183002472, "learning_rate": 1.9994649884536026e-05, "loss": 0.6009, "step": 240 }, { "epoch": 0.05375864376533571, "grad_norm": 0.19660501182079315, "learning_rate": 1.9994572634539767e-05, "loss": 0.5574, "step": 241 }, { "epoch": 0.05398170867722507, "grad_norm": 0.2085440754890442, "learning_rate": 1.999449483098121e-05, "loss": 0.5676, "step": 242 }, { "epoch": 0.054204773589114436, "grad_norm": 0.21077819168567657, "learning_rate": 1.999441647386466e-05, "loss": 0.5507, "step": 243 }, { "epoch": 0.05442783850100379, "grad_norm": 0.2927658259868622, "learning_rate": 1.9994337563194457e-05, "loss": 0.5834, "step": 244 }, { "epoch": 0.05465090341289315, "grad_norm": 0.22845080494880676, "learning_rate": 1.9994258098974974e-05, "loss": 0.5524, "step": 245 }, { "epoch": 0.05487396832478251, "grad_norm": 0.24501702189445496, "learning_rate": 1.999417808121061e-05, "loss": 0.5376, "step": 246 }, { "epoch": 0.05509703323667187, "grad_norm": 0.19732032716274261, "learning_rate": 1.99940975099058e-05, "loss": 0.5697, "step": 247 }, { "epoch": 0.05532009814856123, "grad_norm": 0.21217837929725647, "learning_rate": 1.9994016385065005e-05, "loss": 0.5941, "step": 248 }, { "epoch": 0.05554316306045059, "grad_norm": 0.25333619117736816, "learning_rate": 1.999393470669272e-05, "loss": 0.5482, "step": 249 }, { "epoch": 0.05576622797233995, "grad_norm": 0.21620452404022217, "learning_rate": 1.9993852474793457e-05, "loss": 0.5712, "step": 250 }, { "epoch": 0.05598929288422931, "grad_norm": 0.20523187518119812, "learning_rate": 1.9993769689371788e-05, "loss": 0.5676, "step": 251 }, { "epoch": 0.05621235779611867, "grad_norm": 0.20364375412464142, "learning_rate": 1.999368635043229e-05, "loss": 0.5764, "step": 252 }, { "epoch": 0.05643542270800803, "grad_norm": 0.2179926186800003, "learning_rate": 1.9993602457979574e-05, "loss": 0.5583, "step": 253 }, { "epoch": 0.05665848761989739, "grad_norm": 0.23844636976718903, "learning_rate": 1.9993518012018297e-05, "loss": 0.5768, "step": 254 }, { "epoch": 0.05688155253178675, "grad_norm": 0.21450263261795044, "learning_rate": 1.9993433012553128e-05, "loss": 0.5814, "step": 255 }, { "epoch": 0.05710461744367611, "grad_norm": 0.22201555967330933, "learning_rate": 1.9993347459588777e-05, "loss": 0.5565, "step": 256 }, { "epoch": 0.05732768235556547, "grad_norm": 0.2182476669549942, "learning_rate": 1.9993261353129988e-05, "loss": 0.5818, "step": 257 }, { "epoch": 0.05755074726745483, "grad_norm": 0.1971082240343094, "learning_rate": 1.9993174693181517e-05, "loss": 0.5686, "step": 258 }, { "epoch": 0.057773812179344186, "grad_norm": 0.20535312592983246, "learning_rate": 1.999308747974818e-05, "loss": 0.5687, "step": 259 }, { "epoch": 0.05799687709123355, "grad_norm": 0.218933567404747, "learning_rate": 1.9992999712834794e-05, "loss": 0.5769, "step": 260 }, { "epoch": 0.05821994200312291, "grad_norm": 0.2132030874490738, "learning_rate": 1.9992911392446227e-05, "loss": 0.596, "step": 261 }, { "epoch": 0.058443006915012266, "grad_norm": 0.19926026463508606, "learning_rate": 1.999282251858737e-05, "loss": 0.558, "step": 262 }, { "epoch": 0.05866607182690163, "grad_norm": 0.2089562714099884, "learning_rate": 1.9992733091263144e-05, "loss": 0.582, "step": 263 }, { "epoch": 0.05888913673879099, "grad_norm": 0.20722773671150208, "learning_rate": 1.9992643110478504e-05, "loss": 0.5758, "step": 264 }, { "epoch": 0.059112201650680346, "grad_norm": 0.21631532907485962, "learning_rate": 1.999255257623843e-05, "loss": 0.5725, "step": 265 }, { "epoch": 0.05933526656256971, "grad_norm": 0.21997545659542084, "learning_rate": 1.999246148854794e-05, "loss": 0.5915, "step": 266 }, { "epoch": 0.05955833147445907, "grad_norm": 0.4292175769805908, "learning_rate": 1.9992369847412076e-05, "loss": 0.5891, "step": 267 }, { "epoch": 0.059781396386348426, "grad_norm": 0.2055254876613617, "learning_rate": 1.9992277652835918e-05, "loss": 0.5219, "step": 268 }, { "epoch": 0.06000446129823779, "grad_norm": 0.20326492190361023, "learning_rate": 1.9992184904824566e-05, "loss": 0.5462, "step": 269 }, { "epoch": 0.06022752621012715, "grad_norm": 0.19965752959251404, "learning_rate": 1.9992091603383164e-05, "loss": 0.5688, "step": 270 }, { "epoch": 0.060450591122016506, "grad_norm": 0.19592702388763428, "learning_rate": 1.9991997748516872e-05, "loss": 0.5556, "step": 271 }, { "epoch": 0.060673656033905864, "grad_norm": 0.2036285698413849, "learning_rate": 1.9991903340230898e-05, "loss": 0.5452, "step": 272 }, { "epoch": 0.06089672094579523, "grad_norm": 0.21509170532226562, "learning_rate": 1.9991808378530465e-05, "loss": 0.5682, "step": 273 }, { "epoch": 0.061119785857684586, "grad_norm": 0.2015625238418579, "learning_rate": 1.9991712863420832e-05, "loss": 0.5846, "step": 274 }, { "epoch": 0.061342850769573944, "grad_norm": 0.21707095205783844, "learning_rate": 1.9991616794907286e-05, "loss": 0.5666, "step": 275 }, { "epoch": 0.06156591568146331, "grad_norm": 0.1988460123538971, "learning_rate": 1.9991520172995158e-05, "loss": 0.5875, "step": 276 }, { "epoch": 0.061788980593352666, "grad_norm": 0.19290049374103546, "learning_rate": 1.999142299768979e-05, "loss": 0.5497, "step": 277 }, { "epoch": 0.062012045505242024, "grad_norm": 0.2069414258003235, "learning_rate": 1.9991325268996567e-05, "loss": 0.5438, "step": 278 }, { "epoch": 0.06223511041713139, "grad_norm": 0.22346992790699005, "learning_rate": 1.9991226986920906e-05, "loss": 0.5668, "step": 279 }, { "epoch": 0.062458175329020746, "grad_norm": 0.21194760501384735, "learning_rate": 1.9991128151468247e-05, "loss": 0.5957, "step": 280 }, { "epoch": 0.06268124024091011, "grad_norm": 0.209646537899971, "learning_rate": 1.9991028762644063e-05, "loss": 0.5748, "step": 281 }, { "epoch": 0.06290430515279946, "grad_norm": 0.20566052198410034, "learning_rate": 1.9990928820453858e-05, "loss": 0.5506, "step": 282 }, { "epoch": 0.06312737006468883, "grad_norm": 0.21374207735061646, "learning_rate": 1.999082832490317e-05, "loss": 0.5697, "step": 283 }, { "epoch": 0.06335043497657819, "grad_norm": 0.21012777090072632, "learning_rate": 1.999072727599757e-05, "loss": 0.5112, "step": 284 }, { "epoch": 0.06357349988846754, "grad_norm": 0.19954928755760193, "learning_rate": 1.9990625673742644e-05, "loss": 0.579, "step": 285 }, { "epoch": 0.0637965648003569, "grad_norm": 0.20394836366176605, "learning_rate": 1.9990523518144027e-05, "loss": 0.5425, "step": 286 }, { "epoch": 0.06401962971224627, "grad_norm": 0.20405294001102448, "learning_rate": 1.9990420809207375e-05, "loss": 0.5538, "step": 287 }, { "epoch": 0.06424269462413562, "grad_norm": 0.24065051972866058, "learning_rate": 1.9990317546938373e-05, "loss": 0.5786, "step": 288 }, { "epoch": 0.06446575953602499, "grad_norm": 0.20158305764198303, "learning_rate": 1.9990213731342747e-05, "loss": 0.5475, "step": 289 }, { "epoch": 0.06468882444791434, "grad_norm": 0.29271385073661804, "learning_rate": 1.9990109362426243e-05, "loss": 0.5468, "step": 290 }, { "epoch": 0.0649118893598037, "grad_norm": 0.2596181333065033, "learning_rate": 1.999000444019464e-05, "loss": 0.5632, "step": 291 }, { "epoch": 0.06513495427169307, "grad_norm": 0.20416077971458435, "learning_rate": 1.9989898964653753e-05, "loss": 0.5755, "step": 292 }, { "epoch": 0.06535801918358242, "grad_norm": 0.24204829335212708, "learning_rate": 1.998979293580942e-05, "loss": 0.5679, "step": 293 }, { "epoch": 0.06558108409547178, "grad_norm": 0.20215743780136108, "learning_rate": 1.9989686353667522e-05, "loss": 0.5596, "step": 294 }, { "epoch": 0.06580414900736115, "grad_norm": 0.24796119332313538, "learning_rate": 1.998957921823395e-05, "loss": 0.546, "step": 295 }, { "epoch": 0.0660272139192505, "grad_norm": 0.22253303229808807, "learning_rate": 1.9989471529514647e-05, "loss": 0.5606, "step": 296 }, { "epoch": 0.06625027883113986, "grad_norm": 0.19948336482048035, "learning_rate": 1.9989363287515577e-05, "loss": 0.596, "step": 297 }, { "epoch": 0.06647334374302923, "grad_norm": 0.26588407158851624, "learning_rate": 1.9989254492242727e-05, "loss": 0.5347, "step": 298 }, { "epoch": 0.06669640865491858, "grad_norm": 0.2680055797100067, "learning_rate": 1.9989145143702132e-05, "loss": 0.588, "step": 299 }, { "epoch": 0.06691947356680794, "grad_norm": 0.2872098684310913, "learning_rate": 1.9989035241899844e-05, "loss": 0.5261, "step": 300 }, { "epoch": 0.0671425384786973, "grad_norm": 0.19267728924751282, "learning_rate": 1.998892478684195e-05, "loss": 0.5807, "step": 301 }, { "epoch": 0.06736560339058666, "grad_norm": 0.20785115659236908, "learning_rate": 1.9988813778534568e-05, "loss": 0.5972, "step": 302 }, { "epoch": 0.06758866830247602, "grad_norm": 0.1926824450492859, "learning_rate": 1.998870221698385e-05, "loss": 0.5495, "step": 303 }, { "epoch": 0.06781173321436539, "grad_norm": 0.21504642069339752, "learning_rate": 1.9988590102195968e-05, "loss": 0.5593, "step": 304 }, { "epoch": 0.06803479812625474, "grad_norm": 0.22837965190410614, "learning_rate": 1.9988477434177136e-05, "loss": 0.5749, "step": 305 }, { "epoch": 0.0682578630381441, "grad_norm": 0.27670159935951233, "learning_rate": 1.9988364212933595e-05, "loss": 0.5474, "step": 306 }, { "epoch": 0.06848092795003347, "grad_norm": 0.1969708949327469, "learning_rate": 1.9988250438471612e-05, "loss": 0.573, "step": 307 }, { "epoch": 0.06870399286192282, "grad_norm": 0.19591858983039856, "learning_rate": 1.9988136110797494e-05, "loss": 0.5677, "step": 308 }, { "epoch": 0.06892705777381218, "grad_norm": 0.1888752281665802, "learning_rate": 1.998802122991757e-05, "loss": 0.5485, "step": 309 }, { "epoch": 0.06915012268570155, "grad_norm": 0.20352420210838318, "learning_rate": 1.9987905795838204e-05, "loss": 0.5635, "step": 310 }, { "epoch": 0.0693731875975909, "grad_norm": 0.19486026465892792, "learning_rate": 1.9987789808565785e-05, "loss": 0.5624, "step": 311 }, { "epoch": 0.06959625250948026, "grad_norm": 0.20236323773860931, "learning_rate": 1.9987673268106742e-05, "loss": 0.5817, "step": 312 }, { "epoch": 0.06981931742136963, "grad_norm": 0.2420659214258194, "learning_rate": 1.998755617446753e-05, "loss": 0.5586, "step": 313 }, { "epoch": 0.07004238233325898, "grad_norm": 0.21799051761627197, "learning_rate": 1.9987438527654633e-05, "loss": 0.5918, "step": 314 }, { "epoch": 0.07026544724514834, "grad_norm": 0.19442883133888245, "learning_rate": 1.9987320327674566e-05, "loss": 0.5621, "step": 315 }, { "epoch": 0.07048851215703769, "grad_norm": 0.19582122564315796, "learning_rate": 1.9987201574533876e-05, "loss": 0.5432, "step": 316 }, { "epoch": 0.07071157706892706, "grad_norm": 0.1984792798757553, "learning_rate": 1.998708226823914e-05, "loss": 0.5532, "step": 317 }, { "epoch": 0.07093464198081642, "grad_norm": 0.2259417176246643, "learning_rate": 1.9986962408796972e-05, "loss": 0.5676, "step": 318 }, { "epoch": 0.07115770689270577, "grad_norm": 0.2008512020111084, "learning_rate": 1.9986841996213998e-05, "loss": 0.5499, "step": 319 }, { "epoch": 0.07138077180459514, "grad_norm": 0.19798852503299713, "learning_rate": 1.99867210304969e-05, "loss": 0.564, "step": 320 }, { "epoch": 0.0716038367164845, "grad_norm": 0.21567919850349426, "learning_rate": 1.998659951165237e-05, "loss": 0.5701, "step": 321 }, { "epoch": 0.07182690162837385, "grad_norm": 0.21192528307437897, "learning_rate": 1.998647743968714e-05, "loss": 0.595, "step": 322 }, { "epoch": 0.07204996654026322, "grad_norm": 0.1853858083486557, "learning_rate": 1.9986354814607974e-05, "loss": 0.5546, "step": 323 }, { "epoch": 0.07227303145215258, "grad_norm": 0.19545771181583405, "learning_rate": 1.998623163642166e-05, "loss": 0.5602, "step": 324 }, { "epoch": 0.07249609636404193, "grad_norm": 0.20142099261283875, "learning_rate": 1.998610790513502e-05, "loss": 0.5589, "step": 325 }, { "epoch": 0.0727191612759313, "grad_norm": 0.1894591748714447, "learning_rate": 1.9985983620754914e-05, "loss": 0.5855, "step": 326 }, { "epoch": 0.07294222618782066, "grad_norm": 0.19726653397083282, "learning_rate": 1.998585878328822e-05, "loss": 0.559, "step": 327 }, { "epoch": 0.07316529109971001, "grad_norm": 0.19552524387836456, "learning_rate": 1.998573339274185e-05, "loss": 0.5643, "step": 328 }, { "epoch": 0.07338835601159938, "grad_norm": 0.2093891054391861, "learning_rate": 1.9985607449122754e-05, "loss": 0.5661, "step": 329 }, { "epoch": 0.07361142092348874, "grad_norm": 0.2397080659866333, "learning_rate": 1.9985480952437902e-05, "loss": 0.565, "step": 330 }, { "epoch": 0.07383448583537809, "grad_norm": 0.21570439636707306, "learning_rate": 1.998535390269431e-05, "loss": 0.5505, "step": 331 }, { "epoch": 0.07405755074726746, "grad_norm": 0.18352028727531433, "learning_rate": 1.9985226299899006e-05, "loss": 0.5501, "step": 332 }, { "epoch": 0.07428061565915682, "grad_norm": 0.20006653666496277, "learning_rate": 1.9985098144059058e-05, "loss": 0.5559, "step": 333 }, { "epoch": 0.07450368057104617, "grad_norm": 0.22780755162239075, "learning_rate": 1.998496943518157e-05, "loss": 0.5886, "step": 334 }, { "epoch": 0.07472674548293554, "grad_norm": 0.18735693395137787, "learning_rate": 1.9984840173273662e-05, "loss": 0.5714, "step": 335 }, { "epoch": 0.0749498103948249, "grad_norm": 0.1882167011499405, "learning_rate": 1.99847103583425e-05, "loss": 0.5586, "step": 336 }, { "epoch": 0.07517287530671425, "grad_norm": 0.20951317250728607, "learning_rate": 1.9984579990395274e-05, "loss": 0.5286, "step": 337 }, { "epoch": 0.07539594021860362, "grad_norm": 0.2166478931903839, "learning_rate": 1.9984449069439197e-05, "loss": 0.5661, "step": 338 }, { "epoch": 0.07561900513049297, "grad_norm": 0.191757470369339, "learning_rate": 1.998431759548153e-05, "loss": 0.5431, "step": 339 }, { "epoch": 0.07584207004238233, "grad_norm": 0.2109043151140213, "learning_rate": 1.998418556852955e-05, "loss": 0.5642, "step": 340 }, { "epoch": 0.0760651349542717, "grad_norm": 0.2524346709251404, "learning_rate": 1.9984052988590573e-05, "loss": 0.5505, "step": 341 }, { "epoch": 0.07628819986616105, "grad_norm": 0.22443033754825592, "learning_rate": 1.9983919855671937e-05, "loss": 0.5454, "step": 342 }, { "epoch": 0.07651126477805041, "grad_norm": 0.1956530660390854, "learning_rate": 1.9983786169781017e-05, "loss": 0.5546, "step": 343 }, { "epoch": 0.07673432968993978, "grad_norm": 0.20710930228233337, "learning_rate": 1.9983651930925217e-05, "loss": 0.5554, "step": 344 }, { "epoch": 0.07695739460182913, "grad_norm": 0.2150043398141861, "learning_rate": 1.9983517139111978e-05, "loss": 0.5434, "step": 345 }, { "epoch": 0.07718045951371849, "grad_norm": 0.21276871860027313, "learning_rate": 1.998338179434876e-05, "loss": 0.5623, "step": 346 }, { "epoch": 0.07740352442560786, "grad_norm": 0.20740285515785217, "learning_rate": 1.998324589664306e-05, "loss": 0.5804, "step": 347 }, { "epoch": 0.0776265893374972, "grad_norm": 0.20988604426383972, "learning_rate": 1.99831094460024e-05, "loss": 0.5742, "step": 348 }, { "epoch": 0.07784965424938657, "grad_norm": 0.19990280270576477, "learning_rate": 1.9982972442434346e-05, "loss": 0.5397, "step": 349 }, { "epoch": 0.07807271916127594, "grad_norm": 0.295187383890152, "learning_rate": 1.9982834885946482e-05, "loss": 0.5713, "step": 350 }, { "epoch": 0.07829578407316529, "grad_norm": 0.20176567137241364, "learning_rate": 1.998269677654643e-05, "loss": 0.5548, "step": 351 }, { "epoch": 0.07851884898505465, "grad_norm": 0.20890064537525177, "learning_rate": 1.9982558114241837e-05, "loss": 0.5739, "step": 352 }, { "epoch": 0.07874191389694402, "grad_norm": 0.2795037031173706, "learning_rate": 1.998241889904038e-05, "loss": 0.5343, "step": 353 }, { "epoch": 0.07896497880883337, "grad_norm": 0.22098244726657867, "learning_rate": 1.9982279130949775e-05, "loss": 0.517, "step": 354 }, { "epoch": 0.07918804372072273, "grad_norm": 0.20403707027435303, "learning_rate": 1.998213880997776e-05, "loss": 0.5599, "step": 355 }, { "epoch": 0.0794111086326121, "grad_norm": 0.2201310694217682, "learning_rate": 1.9981997936132107e-05, "loss": 0.5348, "step": 356 }, { "epoch": 0.07963417354450145, "grad_norm": 0.22818854451179504, "learning_rate": 1.998185650942062e-05, "loss": 0.551, "step": 357 }, { "epoch": 0.07985723845639081, "grad_norm": 0.23051077127456665, "learning_rate": 1.9981714529851127e-05, "loss": 0.5926, "step": 358 }, { "epoch": 0.08008030336828018, "grad_norm": 0.21485844254493713, "learning_rate": 1.99815719974315e-05, "loss": 0.5801, "step": 359 }, { "epoch": 0.08030336828016953, "grad_norm": 0.1935802698135376, "learning_rate": 1.998142891216963e-05, "loss": 0.5411, "step": 360 }, { "epoch": 0.08052643319205889, "grad_norm": 0.1971837282180786, "learning_rate": 1.998128527407344e-05, "loss": 0.5805, "step": 361 }, { "epoch": 0.08074949810394826, "grad_norm": 0.20717322826385498, "learning_rate": 1.9981141083150886e-05, "loss": 0.5493, "step": 362 }, { "epoch": 0.0809725630158376, "grad_norm": 0.2011585831642151, "learning_rate": 1.9980996339409957e-05, "loss": 0.5579, "step": 363 }, { "epoch": 0.08119562792772697, "grad_norm": 0.19211354851722717, "learning_rate": 1.9980851042858664e-05, "loss": 0.5383, "step": 364 }, { "epoch": 0.08141869283961632, "grad_norm": 0.21699954569339752, "learning_rate": 1.998070519350506e-05, "loss": 0.5524, "step": 365 }, { "epoch": 0.08164175775150569, "grad_norm": 0.20210997760295868, "learning_rate": 1.9980558791357222e-05, "loss": 0.5738, "step": 366 }, { "epoch": 0.08186482266339505, "grad_norm": 0.18855418264865875, "learning_rate": 1.9980411836423256e-05, "loss": 0.547, "step": 367 }, { "epoch": 0.0820878875752844, "grad_norm": 0.21640720963478088, "learning_rate": 1.9980264328711305e-05, "loss": 0.5964, "step": 368 }, { "epoch": 0.08231095248717377, "grad_norm": 0.2280052751302719, "learning_rate": 1.9980116268229536e-05, "loss": 0.561, "step": 369 }, { "epoch": 0.08253401739906313, "grad_norm": 0.1860683411359787, "learning_rate": 1.9979967654986155e-05, "loss": 0.545, "step": 370 }, { "epoch": 0.08275708231095248, "grad_norm": 0.19923582673072815, "learning_rate": 1.9979818488989383e-05, "loss": 0.5421, "step": 371 }, { "epoch": 0.08298014722284185, "grad_norm": 0.23478665947914124, "learning_rate": 1.997966877024749e-05, "loss": 0.5652, "step": 372 }, { "epoch": 0.08320321213473121, "grad_norm": 0.2126697599887848, "learning_rate": 1.9979518498768768e-05, "loss": 0.5599, "step": 373 }, { "epoch": 0.08342627704662056, "grad_norm": 0.20854973793029785, "learning_rate": 1.9979367674561535e-05, "loss": 0.5425, "step": 374 }, { "epoch": 0.08364934195850993, "grad_norm": 0.20865046977996826, "learning_rate": 1.9979216297634148e-05, "loss": 0.5529, "step": 375 }, { "epoch": 0.08387240687039929, "grad_norm": 0.24369299411773682, "learning_rate": 1.997906436799499e-05, "loss": 0.535, "step": 376 }, { "epoch": 0.08409547178228864, "grad_norm": 0.2064153105020523, "learning_rate": 1.9978911885652475e-05, "loss": 0.5488, "step": 377 }, { "epoch": 0.084318536694178, "grad_norm": 0.21306705474853516, "learning_rate": 1.997875885061505e-05, "loss": 0.5627, "step": 378 }, { "epoch": 0.08454160160606737, "grad_norm": 0.19575418531894684, "learning_rate": 1.9978605262891196e-05, "loss": 0.5565, "step": 379 }, { "epoch": 0.08476466651795672, "grad_norm": 0.19842910766601562, "learning_rate": 1.9978451122489412e-05, "loss": 0.5709, "step": 380 }, { "epoch": 0.08498773142984609, "grad_norm": 0.20483329892158508, "learning_rate": 1.9978296429418237e-05, "loss": 0.5773, "step": 381 }, { "epoch": 0.08521079634173545, "grad_norm": 0.22734715044498444, "learning_rate": 1.997814118368624e-05, "loss": 0.5469, "step": 382 }, { "epoch": 0.0854338612536248, "grad_norm": 0.19727206230163574, "learning_rate": 1.997798538530202e-05, "loss": 0.5541, "step": 383 }, { "epoch": 0.08565692616551417, "grad_norm": 0.19417184591293335, "learning_rate": 1.9977829034274205e-05, "loss": 0.5513, "step": 384 }, { "epoch": 0.08587999107740353, "grad_norm": 0.2037704885005951, "learning_rate": 1.9977672130611454e-05, "loss": 0.5567, "step": 385 }, { "epoch": 0.08610305598929288, "grad_norm": 0.18121762573719025, "learning_rate": 1.997751467432246e-05, "loss": 0.5357, "step": 386 }, { "epoch": 0.08632612090118225, "grad_norm": 0.27105584740638733, "learning_rate": 1.997735666541594e-05, "loss": 0.5654, "step": 387 }, { "epoch": 0.08654918581307161, "grad_norm": 0.20133094489574432, "learning_rate": 1.997719810390065e-05, "loss": 0.5349, "step": 388 }, { "epoch": 0.08677225072496096, "grad_norm": 0.18842002749443054, "learning_rate": 1.997703898978537e-05, "loss": 0.5352, "step": 389 }, { "epoch": 0.08699531563685033, "grad_norm": 0.2452889233827591, "learning_rate": 1.9976879323078913e-05, "loss": 0.5899, "step": 390 }, { "epoch": 0.08721838054873968, "grad_norm": 0.2256636619567871, "learning_rate": 1.9976719103790118e-05, "loss": 0.5115, "step": 391 }, { "epoch": 0.08744144546062904, "grad_norm": 0.1972501426935196, "learning_rate": 1.9976558331927868e-05, "loss": 0.5523, "step": 392 }, { "epoch": 0.0876645103725184, "grad_norm": 0.2026263177394867, "learning_rate": 1.9976397007501062e-05, "loss": 0.5758, "step": 393 }, { "epoch": 0.08788757528440776, "grad_norm": 0.20572836697101593, "learning_rate": 1.9976235130518632e-05, "loss": 0.5429, "step": 394 }, { "epoch": 0.08811064019629712, "grad_norm": 0.18851211667060852, "learning_rate": 1.997607270098955e-05, "loss": 0.5616, "step": 395 }, { "epoch": 0.08833370510818649, "grad_norm": 0.20447294414043427, "learning_rate": 1.9975909718922806e-05, "loss": 0.5527, "step": 396 }, { "epoch": 0.08855677002007584, "grad_norm": 0.19188085198402405, "learning_rate": 1.997574618432744e-05, "loss": 0.558, "step": 397 }, { "epoch": 0.0887798349319652, "grad_norm": 0.19854480028152466, "learning_rate": 1.997558209721249e-05, "loss": 0.5756, "step": 398 }, { "epoch": 0.08900289984385457, "grad_norm": 0.19832177460193634, "learning_rate": 1.997541745758706e-05, "loss": 0.569, "step": 399 }, { "epoch": 0.08922596475574392, "grad_norm": 0.20165055990219116, "learning_rate": 1.9975252265460265e-05, "loss": 0.5376, "step": 400 }, { "epoch": 0.08944902966763328, "grad_norm": 0.18997038900852203, "learning_rate": 1.997508652084125e-05, "loss": 0.5745, "step": 401 }, { "epoch": 0.08967209457952265, "grad_norm": 0.19646713137626648, "learning_rate": 1.9974920223739195e-05, "loss": 0.5454, "step": 402 }, { "epoch": 0.089895159491412, "grad_norm": 0.1976052224636078, "learning_rate": 1.997475337416332e-05, "loss": 0.537, "step": 403 }, { "epoch": 0.09011822440330136, "grad_norm": 0.2045581340789795, "learning_rate": 1.9974585972122857e-05, "loss": 0.5405, "step": 404 }, { "epoch": 0.09034128931519073, "grad_norm": 0.18631087243556976, "learning_rate": 1.9974418017627076e-05, "loss": 0.6034, "step": 405 }, { "epoch": 0.09056435422708008, "grad_norm": 0.2022215723991394, "learning_rate": 1.9974249510685285e-05, "loss": 0.5682, "step": 406 }, { "epoch": 0.09078741913896944, "grad_norm": 0.19569364190101624, "learning_rate": 1.9974080451306816e-05, "loss": 0.5739, "step": 407 }, { "epoch": 0.0910104840508588, "grad_norm": 0.20746709406375885, "learning_rate": 1.9973910839501035e-05, "loss": 0.55, "step": 408 }, { "epoch": 0.09123354896274816, "grad_norm": 0.21757768094539642, "learning_rate": 1.997374067527733e-05, "loss": 0.5404, "step": 409 }, { "epoch": 0.09145661387463752, "grad_norm": 0.1881277859210968, "learning_rate": 1.997356995864513e-05, "loss": 0.5435, "step": 410 }, { "epoch": 0.09167967878652689, "grad_norm": 0.1868787705898285, "learning_rate": 1.9973398689613892e-05, "loss": 0.5373, "step": 411 }, { "epoch": 0.09190274369841624, "grad_norm": 0.1938547044992447, "learning_rate": 1.9973226868193096e-05, "loss": 0.5846, "step": 412 }, { "epoch": 0.0921258086103056, "grad_norm": 0.19920659065246582, "learning_rate": 1.9973054494392265e-05, "loss": 0.5615, "step": 413 }, { "epoch": 0.09234887352219495, "grad_norm": 0.18801911175251007, "learning_rate": 1.997288156822094e-05, "loss": 0.5675, "step": 414 }, { "epoch": 0.09257193843408432, "grad_norm": 0.1990530639886856, "learning_rate": 1.9972708089688705e-05, "loss": 0.5626, "step": 415 }, { "epoch": 0.09279500334597368, "grad_norm": 0.18555153906345367, "learning_rate": 1.9972534058805163e-05, "loss": 0.557, "step": 416 }, { "epoch": 0.09301806825786303, "grad_norm": 0.20318473875522614, "learning_rate": 1.9972359475579953e-05, "loss": 0.5196, "step": 417 }, { "epoch": 0.0932411331697524, "grad_norm": 0.20757220685482025, "learning_rate": 1.997218434002275e-05, "loss": 0.5454, "step": 418 }, { "epoch": 0.09346419808164176, "grad_norm": 0.20588824152946472, "learning_rate": 1.997200865214325e-05, "loss": 0.5347, "step": 419 }, { "epoch": 0.09368726299353111, "grad_norm": 0.20199880003929138, "learning_rate": 1.9971832411951186e-05, "loss": 0.5531, "step": 420 }, { "epoch": 0.09391032790542048, "grad_norm": 0.19605985283851624, "learning_rate": 1.9971655619456313e-05, "loss": 0.5263, "step": 421 }, { "epoch": 0.09413339281730984, "grad_norm": 0.19716261327266693, "learning_rate": 1.997147827466843e-05, "loss": 0.5722, "step": 422 }, { "epoch": 0.09435645772919919, "grad_norm": 0.20400184392929077, "learning_rate": 1.997130037759736e-05, "loss": 0.5629, "step": 423 }, { "epoch": 0.09457952264108856, "grad_norm": 0.203161358833313, "learning_rate": 1.997112192825295e-05, "loss": 0.5538, "step": 424 }, { "epoch": 0.09480258755297792, "grad_norm": 0.20870056748390198, "learning_rate": 1.997094292664509e-05, "loss": 0.5548, "step": 425 }, { "epoch": 0.09502565246486727, "grad_norm": 0.1856376975774765, "learning_rate": 1.9970763372783687e-05, "loss": 0.5193, "step": 426 }, { "epoch": 0.09524871737675664, "grad_norm": 0.1954246610403061, "learning_rate": 1.997058326667869e-05, "loss": 0.5336, "step": 427 }, { "epoch": 0.095471782288646, "grad_norm": 0.1996607482433319, "learning_rate": 1.9970402608340076e-05, "loss": 0.5708, "step": 428 }, { "epoch": 0.09569484720053535, "grad_norm": 0.21878129243850708, "learning_rate": 1.9970221397777848e-05, "loss": 0.5794, "step": 429 }, { "epoch": 0.09591791211242472, "grad_norm": 0.19142527878284454, "learning_rate": 1.9970039635002044e-05, "loss": 0.5221, "step": 430 }, { "epoch": 0.09614097702431408, "grad_norm": 0.18871352076530457, "learning_rate": 1.996985732002273e-05, "loss": 0.5321, "step": 431 }, { "epoch": 0.09636404193620343, "grad_norm": 0.1942790001630783, "learning_rate": 1.996967445285001e-05, "loss": 0.5269, "step": 432 }, { "epoch": 0.0965871068480928, "grad_norm": 0.19900670647621155, "learning_rate": 1.9969491033494e-05, "loss": 0.6048, "step": 433 }, { "epoch": 0.09681017175998216, "grad_norm": 0.193121075630188, "learning_rate": 1.9969307061964873e-05, "loss": 0.5613, "step": 434 }, { "epoch": 0.09703323667187151, "grad_norm": 0.1859043687582016, "learning_rate": 1.9969122538272807e-05, "loss": 0.5657, "step": 435 }, { "epoch": 0.09725630158376088, "grad_norm": 0.18597924709320068, "learning_rate": 1.9968937462428028e-05, "loss": 0.5336, "step": 436 }, { "epoch": 0.09747936649565024, "grad_norm": 0.17796443402767181, "learning_rate": 1.9968751834440783e-05, "loss": 0.5611, "step": 437 }, { "epoch": 0.09770243140753959, "grad_norm": 0.1958586871623993, "learning_rate": 1.9968565654321356e-05, "loss": 0.5377, "step": 438 }, { "epoch": 0.09792549631942896, "grad_norm": 0.21395692229270935, "learning_rate": 1.996837892208006e-05, "loss": 0.5347, "step": 439 }, { "epoch": 0.09814856123131831, "grad_norm": 0.19348031282424927, "learning_rate": 1.9968191637727235e-05, "loss": 0.5576, "step": 440 }, { "epoch": 0.09837162614320767, "grad_norm": 0.19345401227474213, "learning_rate": 1.9968003801273253e-05, "loss": 0.5697, "step": 441 }, { "epoch": 0.09859469105509704, "grad_norm": 0.18203531205654144, "learning_rate": 1.9967815412728523e-05, "loss": 0.5416, "step": 442 }, { "epoch": 0.09881775596698639, "grad_norm": 0.20109198987483978, "learning_rate": 1.9967626472103472e-05, "loss": 0.5514, "step": 443 }, { "epoch": 0.09904082087887575, "grad_norm": 0.18818293511867523, "learning_rate": 1.996743697940857e-05, "loss": 0.5535, "step": 444 }, { "epoch": 0.09926388579076512, "grad_norm": 0.18718019127845764, "learning_rate": 1.996724693465431e-05, "loss": 0.5595, "step": 445 }, { "epoch": 0.09948695070265447, "grad_norm": 0.18856103718280792, "learning_rate": 1.9967056337851217e-05, "loss": 0.5364, "step": 446 }, { "epoch": 0.09971001561454383, "grad_norm": 0.18582871556282043, "learning_rate": 1.996686518900985e-05, "loss": 0.5632, "step": 447 }, { "epoch": 0.0999330805264332, "grad_norm": 0.19166669249534607, "learning_rate": 1.9966673488140794e-05, "loss": 0.5727, "step": 448 }, { "epoch": 0.10015614543832255, "grad_norm": 0.18631108105182648, "learning_rate": 1.9966481235254667e-05, "loss": 0.5472, "step": 449 }, { "epoch": 0.10037921035021191, "grad_norm": 0.18970704078674316, "learning_rate": 1.996628843036212e-05, "loss": 0.5355, "step": 450 }, { "epoch": 0.10060227526210128, "grad_norm": 0.18836888670921326, "learning_rate": 1.9966095073473828e-05, "loss": 0.5309, "step": 451 }, { "epoch": 0.10082534017399063, "grad_norm": 0.1924007087945938, "learning_rate": 1.99659011646005e-05, "loss": 0.5213, "step": 452 }, { "epoch": 0.10104840508587999, "grad_norm": 0.19710196554660797, "learning_rate": 1.996570670375288e-05, "loss": 0.5333, "step": 453 }, { "epoch": 0.10127146999776936, "grad_norm": 0.1823819875717163, "learning_rate": 1.9965511690941737e-05, "loss": 0.5378, "step": 454 }, { "epoch": 0.1014945349096587, "grad_norm": 0.21200565993785858, "learning_rate": 1.9965316126177867e-05, "loss": 0.5594, "step": 455 }, { "epoch": 0.10171759982154807, "grad_norm": 0.18618617951869965, "learning_rate": 1.9965120009472106e-05, "loss": 0.5317, "step": 456 }, { "epoch": 0.10194066473343744, "grad_norm": 0.19600717723369598, "learning_rate": 1.996492334083532e-05, "loss": 0.5612, "step": 457 }, { "epoch": 0.10216372964532679, "grad_norm": 0.1988876461982727, "learning_rate": 1.9964726120278394e-05, "loss": 0.5709, "step": 458 }, { "epoch": 0.10238679455721615, "grad_norm": 0.18938012421131134, "learning_rate": 1.9964528347812255e-05, "loss": 0.5413, "step": 459 }, { "epoch": 0.10260985946910552, "grad_norm": 0.18799841403961182, "learning_rate": 1.9964330023447854e-05, "loss": 0.5534, "step": 460 }, { "epoch": 0.10283292438099487, "grad_norm": 0.229729562997818, "learning_rate": 1.9964131147196185e-05, "loss": 0.5471, "step": 461 }, { "epoch": 0.10305598929288423, "grad_norm": 0.19921550154685974, "learning_rate": 1.9963931719068253e-05, "loss": 0.5419, "step": 462 }, { "epoch": 0.1032790542047736, "grad_norm": 0.19583772122859955, "learning_rate": 1.9963731739075106e-05, "loss": 0.5412, "step": 463 }, { "epoch": 0.10350211911666295, "grad_norm": 0.19680215418338776, "learning_rate": 1.996353120722782e-05, "loss": 0.5477, "step": 464 }, { "epoch": 0.10372518402855231, "grad_norm": 0.20173701643943787, "learning_rate": 1.9963330123537507e-05, "loss": 0.5523, "step": 465 }, { "epoch": 0.10394824894044166, "grad_norm": 0.23238573968410492, "learning_rate": 1.9963128488015294e-05, "loss": 0.559, "step": 466 }, { "epoch": 0.10417131385233103, "grad_norm": 0.22626003623008728, "learning_rate": 1.996292630067236e-05, "loss": 0.526, "step": 467 }, { "epoch": 0.10439437876422039, "grad_norm": 0.21178299188613892, "learning_rate": 1.9962723561519893e-05, "loss": 0.5301, "step": 468 }, { "epoch": 0.10461744367610974, "grad_norm": 0.1937963217496872, "learning_rate": 1.996252027056913e-05, "loss": 0.5449, "step": 469 }, { "epoch": 0.1048405085879991, "grad_norm": 0.18095934391021729, "learning_rate": 1.996231642783133e-05, "loss": 0.5676, "step": 470 }, { "epoch": 0.10506357349988847, "grad_norm": 0.2064565271139145, "learning_rate": 1.9962112033317776e-05, "loss": 0.5157, "step": 471 }, { "epoch": 0.10528663841177782, "grad_norm": 0.1964641511440277, "learning_rate": 1.9961907087039796e-05, "loss": 0.6127, "step": 472 }, { "epoch": 0.10550970332366719, "grad_norm": 0.17736569046974182, "learning_rate": 1.996170158900874e-05, "loss": 0.545, "step": 473 }, { "epoch": 0.10573276823555655, "grad_norm": 0.1836784929037094, "learning_rate": 1.9961495539235985e-05, "loss": 0.5525, "step": 474 }, { "epoch": 0.1059558331474459, "grad_norm": 0.2029949426651001, "learning_rate": 1.996128893773295e-05, "loss": 0.5558, "step": 475 }, { "epoch": 0.10617889805933527, "grad_norm": 0.207282155752182, "learning_rate": 1.9961081784511073e-05, "loss": 0.5789, "step": 476 }, { "epoch": 0.10640196297122463, "grad_norm": 0.20277360081672668, "learning_rate": 1.9960874079581828e-05, "loss": 0.5555, "step": 477 }, { "epoch": 0.10662502788311398, "grad_norm": 0.21180874109268188, "learning_rate": 1.996066582295672e-05, "loss": 0.574, "step": 478 }, { "epoch": 0.10684809279500335, "grad_norm": 0.22067199647426605, "learning_rate": 1.996045701464729e-05, "loss": 0.5641, "step": 479 }, { "epoch": 0.10707115770689271, "grad_norm": 0.2735600471496582, "learning_rate": 1.9960247654665088e-05, "loss": 0.5473, "step": 480 }, { "epoch": 0.10729422261878206, "grad_norm": 0.19656887650489807, "learning_rate": 1.9960037743021723e-05, "loss": 0.5614, "step": 481 }, { "epoch": 0.10751728753067143, "grad_norm": 0.19436019659042358, "learning_rate": 1.9959827279728815e-05, "loss": 0.5554, "step": 482 }, { "epoch": 0.10774035244256079, "grad_norm": 0.1874009072780609, "learning_rate": 1.9959616264798022e-05, "loss": 0.5723, "step": 483 }, { "epoch": 0.10796341735445014, "grad_norm": 0.2946108877658844, "learning_rate": 1.9959404698241037e-05, "loss": 0.5572, "step": 484 }, { "epoch": 0.1081864822663395, "grad_norm": 0.1900031566619873, "learning_rate": 1.9959192580069567e-05, "loss": 0.5629, "step": 485 }, { "epoch": 0.10840954717822887, "grad_norm": 0.2330961972475052, "learning_rate": 1.9958979910295367e-05, "loss": 0.5725, "step": 486 }, { "epoch": 0.10863261209011822, "grad_norm": 0.23831723630428314, "learning_rate": 1.9958766688930215e-05, "loss": 0.5549, "step": 487 }, { "epoch": 0.10885567700200759, "grad_norm": 0.2095993012189865, "learning_rate": 1.9958552915985923e-05, "loss": 0.541, "step": 488 }, { "epoch": 0.10907874191389694, "grad_norm": 0.18837721645832062, "learning_rate": 1.9958338591474327e-05, "loss": 0.5245, "step": 489 }, { "epoch": 0.1093018068257863, "grad_norm": 0.19427752494812012, "learning_rate": 1.99581237154073e-05, "loss": 0.5442, "step": 490 }, { "epoch": 0.10952487173767567, "grad_norm": 0.19493944942951202, "learning_rate": 1.9957908287796743e-05, "loss": 0.5559, "step": 491 }, { "epoch": 0.10974793664956502, "grad_norm": 0.18494555354118347, "learning_rate": 1.9957692308654586e-05, "loss": 0.5427, "step": 492 }, { "epoch": 0.10997100156145438, "grad_norm": 0.19978608191013336, "learning_rate": 1.9957475777992794e-05, "loss": 0.5391, "step": 493 }, { "epoch": 0.11019406647334375, "grad_norm": 0.18339702486991882, "learning_rate": 1.9957258695823358e-05, "loss": 0.5454, "step": 494 }, { "epoch": 0.1104171313852331, "grad_norm": 0.18613296747207642, "learning_rate": 1.99570410621583e-05, "loss": 0.5371, "step": 495 }, { "epoch": 0.11064019629712246, "grad_norm": 0.19776608049869537, "learning_rate": 1.9956822877009676e-05, "loss": 0.5405, "step": 496 }, { "epoch": 0.11086326120901183, "grad_norm": 0.1891903132200241, "learning_rate": 1.9956604140389574e-05, "loss": 0.5755, "step": 497 }, { "epoch": 0.11108632612090118, "grad_norm": 0.20654836297035217, "learning_rate": 1.9956384852310102e-05, "loss": 0.5471, "step": 498 }, { "epoch": 0.11130939103279054, "grad_norm": 0.18682821094989777, "learning_rate": 1.995616501278341e-05, "loss": 0.5479, "step": 499 }, { "epoch": 0.1115324559446799, "grad_norm": 0.20868845283985138, "learning_rate": 1.995594462182167e-05, "loss": 0.5405, "step": 500 }, { "epoch": 0.11175552085656926, "grad_norm": 0.20793192088603973, "learning_rate": 1.9955723679437093e-05, "loss": 0.5468, "step": 501 }, { "epoch": 0.11197858576845862, "grad_norm": 0.18734110891819, "learning_rate": 1.9955502185641915e-05, "loss": 0.5336, "step": 502 }, { "epoch": 0.11220165068034799, "grad_norm": 0.18563862144947052, "learning_rate": 1.9955280140448404e-05, "loss": 0.549, "step": 503 }, { "epoch": 0.11242471559223734, "grad_norm": 0.19853924214839935, "learning_rate": 1.9955057543868858e-05, "loss": 0.5494, "step": 504 }, { "epoch": 0.1126477805041267, "grad_norm": 0.2054789960384369, "learning_rate": 1.9954834395915604e-05, "loss": 0.6006, "step": 505 }, { "epoch": 0.11287084541601607, "grad_norm": 0.1880260407924652, "learning_rate": 1.9954610696601e-05, "loss": 0.5664, "step": 506 }, { "epoch": 0.11309391032790542, "grad_norm": 0.20359358191490173, "learning_rate": 1.9954386445937444e-05, "loss": 0.5387, "step": 507 }, { "epoch": 0.11331697523979478, "grad_norm": 0.20446328818798065, "learning_rate": 1.995416164393735e-05, "loss": 0.6078, "step": 508 }, { "epoch": 0.11354004015168415, "grad_norm": 0.20027638971805573, "learning_rate": 1.9953936290613166e-05, "loss": 0.4875, "step": 509 }, { "epoch": 0.1137631050635735, "grad_norm": 0.19979824125766754, "learning_rate": 1.9953710385977382e-05, "loss": 0.5599, "step": 510 }, { "epoch": 0.11398616997546286, "grad_norm": 0.17714561522006989, "learning_rate": 1.9953483930042503e-05, "loss": 0.536, "step": 511 }, { "epoch": 0.11420923488735223, "grad_norm": 0.21002604067325592, "learning_rate": 1.9953256922821075e-05, "loss": 0.5388, "step": 512 }, { "epoch": 0.11443229979924158, "grad_norm": 0.19017495214939117, "learning_rate": 1.995302936432567e-05, "loss": 0.5474, "step": 513 }, { "epoch": 0.11465536471113094, "grad_norm": 0.1896909773349762, "learning_rate": 1.995280125456889e-05, "loss": 0.5577, "step": 514 }, { "epoch": 0.11487842962302029, "grad_norm": 0.20575307309627533, "learning_rate": 1.9952572593563375e-05, "loss": 0.5204, "step": 515 }, { "epoch": 0.11510149453490966, "grad_norm": 0.19565477967262268, "learning_rate": 1.9952343381321785e-05, "loss": 0.5971, "step": 516 }, { "epoch": 0.11532455944679902, "grad_norm": 0.1984451860189438, "learning_rate": 1.995211361785681e-05, "loss": 0.5531, "step": 517 }, { "epoch": 0.11554762435868837, "grad_norm": 0.18850469589233398, "learning_rate": 1.9951883303181184e-05, "loss": 0.5419, "step": 518 }, { "epoch": 0.11577068927057774, "grad_norm": 0.22787964344024658, "learning_rate": 1.9951652437307664e-05, "loss": 0.5403, "step": 519 }, { "epoch": 0.1159937541824671, "grad_norm": 0.29980340600013733, "learning_rate": 1.995142102024903e-05, "loss": 0.5365, "step": 520 }, { "epoch": 0.11621681909435645, "grad_norm": 0.17774684727191925, "learning_rate": 1.995118905201811e-05, "loss": 0.545, "step": 521 }, { "epoch": 0.11643988400624582, "grad_norm": 0.20445318520069122, "learning_rate": 1.995095653262774e-05, "loss": 0.5883, "step": 522 }, { "epoch": 0.11666294891813518, "grad_norm": 0.1800977885723114, "learning_rate": 1.9950723462090803e-05, "loss": 0.5579, "step": 523 }, { "epoch": 0.11688601383002453, "grad_norm": 0.18807321786880493, "learning_rate": 1.9950489840420207e-05, "loss": 0.5295, "step": 524 }, { "epoch": 0.1171090787419139, "grad_norm": 0.17663832008838654, "learning_rate": 1.9950255667628894e-05, "loss": 0.5377, "step": 525 }, { "epoch": 0.11733214365380326, "grad_norm": 0.18556755781173706, "learning_rate": 1.9950020943729834e-05, "loss": 0.5314, "step": 526 }, { "epoch": 0.11755520856569261, "grad_norm": 0.18376345932483673, "learning_rate": 1.994978566873602e-05, "loss": 0.5383, "step": 527 }, { "epoch": 0.11777827347758198, "grad_norm": 0.18771792948246002, "learning_rate": 1.9949549842660495e-05, "loss": 0.5273, "step": 528 }, { "epoch": 0.11800133838947134, "grad_norm": 0.1931321620941162, "learning_rate": 1.9949313465516312e-05, "loss": 0.5692, "step": 529 }, { "epoch": 0.11822440330136069, "grad_norm": 0.18806159496307373, "learning_rate": 1.9949076537316566e-05, "loss": 0.5375, "step": 530 }, { "epoch": 0.11844746821325006, "grad_norm": 0.18746712803840637, "learning_rate": 1.9948839058074383e-05, "loss": 0.5418, "step": 531 }, { "epoch": 0.11867053312513942, "grad_norm": 0.18869024515151978, "learning_rate": 1.9948601027802908e-05, "loss": 0.5059, "step": 532 }, { "epoch": 0.11889359803702877, "grad_norm": 0.20482954382896423, "learning_rate": 1.994836244651533e-05, "loss": 0.5359, "step": 533 }, { "epoch": 0.11911666294891814, "grad_norm": 0.17584972083568573, "learning_rate": 1.9948123314224862e-05, "loss": 0.5221, "step": 534 }, { "epoch": 0.1193397278608075, "grad_norm": 0.1878448724746704, "learning_rate": 1.994788363094475e-05, "loss": 0.5358, "step": 535 }, { "epoch": 0.11956279277269685, "grad_norm": 0.17549312114715576, "learning_rate": 1.9947643396688266e-05, "loss": 0.5391, "step": 536 }, { "epoch": 0.11978585768458622, "grad_norm": 0.18577958643436432, "learning_rate": 1.9947402611468714e-05, "loss": 0.5606, "step": 537 }, { "epoch": 0.12000892259647558, "grad_norm": 0.17562542855739594, "learning_rate": 1.994716127529944e-05, "loss": 0.5186, "step": 538 }, { "epoch": 0.12023198750836493, "grad_norm": 0.18019866943359375, "learning_rate": 1.9946919388193803e-05, "loss": 0.524, "step": 539 }, { "epoch": 0.1204550524202543, "grad_norm": 0.18850797414779663, "learning_rate": 1.99466769501652e-05, "loss": 0.5496, "step": 540 }, { "epoch": 0.12067811733214365, "grad_norm": 0.1817145198583603, "learning_rate": 1.9946433961227062e-05, "loss": 0.5303, "step": 541 }, { "epoch": 0.12090118224403301, "grad_norm": 0.2111491858959198, "learning_rate": 1.9946190421392845e-05, "loss": 0.5526, "step": 542 }, { "epoch": 0.12112424715592238, "grad_norm": 0.294716477394104, "learning_rate": 1.9945946330676036e-05, "loss": 0.5172, "step": 543 }, { "epoch": 0.12134731206781173, "grad_norm": 0.21202930808067322, "learning_rate": 1.994570168909016e-05, "loss": 0.5295, "step": 544 }, { "epoch": 0.12157037697970109, "grad_norm": 0.18554560840129852, "learning_rate": 1.9945456496648763e-05, "loss": 0.5347, "step": 545 }, { "epoch": 0.12179344189159046, "grad_norm": 0.18445701897144318, "learning_rate": 1.9945210753365426e-05, "loss": 0.5433, "step": 546 }, { "epoch": 0.12201650680347981, "grad_norm": 0.2190207540988922, "learning_rate": 1.9944964459253757e-05, "loss": 0.5342, "step": 547 }, { "epoch": 0.12223957171536917, "grad_norm": 0.19571848213672638, "learning_rate": 1.99447176143274e-05, "loss": 0.5675, "step": 548 }, { "epoch": 0.12246263662725854, "grad_norm": 0.1853405386209488, "learning_rate": 1.994447021860003e-05, "loss": 0.5857, "step": 549 }, { "epoch": 0.12268570153914789, "grad_norm": 0.19547483325004578, "learning_rate": 1.9944222272085344e-05, "loss": 0.5208, "step": 550 }, { "epoch": 0.12290876645103725, "grad_norm": 0.20552557706832886, "learning_rate": 1.994397377479708e-05, "loss": 0.5518, "step": 551 }, { "epoch": 0.12313183136292662, "grad_norm": 0.21979011595249176, "learning_rate": 1.9943724726748996e-05, "loss": 0.5658, "step": 552 }, { "epoch": 0.12335489627481597, "grad_norm": 0.18478159606456757, "learning_rate": 1.994347512795489e-05, "loss": 0.5599, "step": 553 }, { "epoch": 0.12357796118670533, "grad_norm": 0.20098286867141724, "learning_rate": 1.9943224978428582e-05, "loss": 0.5455, "step": 554 }, { "epoch": 0.1238010260985947, "grad_norm": 0.2217499017715454, "learning_rate": 1.994297427818393e-05, "loss": 0.5512, "step": 555 }, { "epoch": 0.12402409101048405, "grad_norm": 0.17720763385295868, "learning_rate": 1.9942723027234817e-05, "loss": 0.5295, "step": 556 }, { "epoch": 0.12424715592237341, "grad_norm": 0.1762504130601883, "learning_rate": 1.9942471225595162e-05, "loss": 0.5361, "step": 557 }, { "epoch": 0.12447022083426278, "grad_norm": 0.2265862226486206, "learning_rate": 1.994221887327891e-05, "loss": 0.5669, "step": 558 }, { "epoch": 0.12469328574615213, "grad_norm": 0.19906531274318695, "learning_rate": 1.994196597030004e-05, "loss": 0.5302, "step": 559 }, { "epoch": 0.12491635065804149, "grad_norm": 0.17008185386657715, "learning_rate": 1.9941712516672553e-05, "loss": 0.5333, "step": 560 }, { "epoch": 0.12513941556993086, "grad_norm": 0.18816471099853516, "learning_rate": 1.994145851241049e-05, "loss": 0.5679, "step": 561 }, { "epoch": 0.12536248048182022, "grad_norm": 0.19027023017406464, "learning_rate": 1.9941203957527927e-05, "loss": 0.5507, "step": 562 }, { "epoch": 0.12558554539370956, "grad_norm": 0.17551641166210175, "learning_rate": 1.994094885203895e-05, "loss": 0.5311, "step": 563 }, { "epoch": 0.12580861030559892, "grad_norm": 0.1766747236251831, "learning_rate": 1.9940693195957696e-05, "loss": 0.5279, "step": 564 }, { "epoch": 0.1260316752174883, "grad_norm": 0.1826571673154831, "learning_rate": 1.9940436989298322e-05, "loss": 0.5195, "step": 565 }, { "epoch": 0.12625474012937765, "grad_norm": 0.18142053484916687, "learning_rate": 1.9940180232075025e-05, "loss": 0.5678, "step": 566 }, { "epoch": 0.12647780504126702, "grad_norm": 0.21495957672595978, "learning_rate": 1.993992292430202e-05, "loss": 0.5393, "step": 567 }, { "epoch": 0.12670086995315638, "grad_norm": 0.18905828893184662, "learning_rate": 1.9939665065993556e-05, "loss": 0.5507, "step": 568 }, { "epoch": 0.12692393486504572, "grad_norm": 0.19366420805454254, "learning_rate": 1.9939406657163916e-05, "loss": 0.5458, "step": 569 }, { "epoch": 0.12714699977693508, "grad_norm": 0.20456644892692566, "learning_rate": 1.9939147697827415e-05, "loss": 0.5532, "step": 570 }, { "epoch": 0.12737006468882445, "grad_norm": 0.20060132443904877, "learning_rate": 1.9938888187998397e-05, "loss": 0.5865, "step": 571 }, { "epoch": 0.1275931296007138, "grad_norm": 0.18896770477294922, "learning_rate": 1.9938628127691232e-05, "loss": 0.5372, "step": 572 }, { "epoch": 0.12781619451260318, "grad_norm": 0.1824081540107727, "learning_rate": 1.9938367516920323e-05, "loss": 0.5232, "step": 573 }, { "epoch": 0.12803925942449254, "grad_norm": 0.4999215602874756, "learning_rate": 1.993810635570011e-05, "loss": 0.5323, "step": 574 }, { "epoch": 0.12826232433638188, "grad_norm": 0.23097532987594604, "learning_rate": 1.993784464404505e-05, "loss": 0.544, "step": 575 }, { "epoch": 0.12848538924827124, "grad_norm": 0.1975661814212799, "learning_rate": 1.993758238196964e-05, "loss": 0.5411, "step": 576 }, { "epoch": 0.1287084541601606, "grad_norm": 0.18706879019737244, "learning_rate": 1.9937319569488414e-05, "loss": 0.5386, "step": 577 }, { "epoch": 0.12893151907204997, "grad_norm": 0.19232505559921265, "learning_rate": 1.993705620661592e-05, "loss": 0.5327, "step": 578 }, { "epoch": 0.12915458398393934, "grad_norm": 0.20714089274406433, "learning_rate": 1.9936792293366744e-05, "loss": 0.5381, "step": 579 }, { "epoch": 0.12937764889582867, "grad_norm": 0.18375153839588165, "learning_rate": 1.993652782975551e-05, "loss": 0.5664, "step": 580 }, { "epoch": 0.12960071380771804, "grad_norm": 0.19149671494960785, "learning_rate": 1.993626281579686e-05, "loss": 0.5265, "step": 581 }, { "epoch": 0.1298237787196074, "grad_norm": 0.19362886250019073, "learning_rate": 1.9935997251505473e-05, "loss": 0.517, "step": 582 }, { "epoch": 0.13004684363149677, "grad_norm": 0.19237902760505676, "learning_rate": 1.993573113689606e-05, "loss": 0.529, "step": 583 }, { "epoch": 0.13026990854338613, "grad_norm": 0.2098451405763626, "learning_rate": 1.9935464471983354e-05, "loss": 0.536, "step": 584 }, { "epoch": 0.1304929734552755, "grad_norm": 0.17832371592521667, "learning_rate": 1.993519725678213e-05, "loss": 0.528, "step": 585 }, { "epoch": 0.13071603836716483, "grad_norm": 0.19082553684711456, "learning_rate": 1.9934929491307194e-05, "loss": 0.5208, "step": 586 }, { "epoch": 0.1309391032790542, "grad_norm": 0.19741079211235046, "learning_rate": 1.9934661175573363e-05, "loss": 0.5605, "step": 587 }, { "epoch": 0.13116216819094356, "grad_norm": 0.17371487617492676, "learning_rate": 1.9934392309595504e-05, "loss": 0.5462, "step": 588 }, { "epoch": 0.13138523310283293, "grad_norm": 0.17552132904529572, "learning_rate": 1.9934122893388512e-05, "loss": 0.5343, "step": 589 }, { "epoch": 0.1316082980147223, "grad_norm": 0.19698654115200043, "learning_rate": 1.9933852926967305e-05, "loss": 0.5711, "step": 590 }, { "epoch": 0.13183136292661166, "grad_norm": 0.19045893847942352, "learning_rate": 1.993358241034684e-05, "loss": 0.5601, "step": 591 }, { "epoch": 0.132054427838501, "grad_norm": 0.17538850009441376, "learning_rate": 1.9933311343542094e-05, "loss": 0.5403, "step": 592 }, { "epoch": 0.13227749275039036, "grad_norm": 0.1708664894104004, "learning_rate": 1.9933039726568078e-05, "loss": 0.5346, "step": 593 }, { "epoch": 0.13250055766227972, "grad_norm": 0.1795656532049179, "learning_rate": 1.9932767559439844e-05, "loss": 0.5516, "step": 594 }, { "epoch": 0.1327236225741691, "grad_norm": 0.18116529285907745, "learning_rate": 1.9932494842172465e-05, "loss": 0.5311, "step": 595 }, { "epoch": 0.13294668748605845, "grad_norm": 0.20366770029067993, "learning_rate": 1.9932221574781043e-05, "loss": 0.5546, "step": 596 }, { "epoch": 0.13316975239794782, "grad_norm": 0.1805567592382431, "learning_rate": 1.9931947757280713e-05, "loss": 0.5754, "step": 597 }, { "epoch": 0.13339281730983715, "grad_norm": 0.1815255880355835, "learning_rate": 1.9931673389686642e-05, "loss": 0.5172, "step": 598 }, { "epoch": 0.13361588222172652, "grad_norm": 0.1993352770805359, "learning_rate": 1.9931398472014024e-05, "loss": 0.5483, "step": 599 }, { "epoch": 0.13383894713361588, "grad_norm": 0.1812392920255661, "learning_rate": 1.993112300427809e-05, "loss": 0.5418, "step": 600 }, { "epoch": 0.13406201204550525, "grad_norm": 0.23115472495555878, "learning_rate": 1.9930846986494098e-05, "loss": 0.5661, "step": 601 }, { "epoch": 0.1342850769573946, "grad_norm": 0.1799275279045105, "learning_rate": 1.9930570418677327e-05, "loss": 0.5536, "step": 602 }, { "epoch": 0.13450814186928395, "grad_norm": 0.19140706956386566, "learning_rate": 1.9930293300843103e-05, "loss": 0.5405, "step": 603 }, { "epoch": 0.1347312067811733, "grad_norm": 0.20953412353992462, "learning_rate": 1.993001563300677e-05, "loss": 0.5215, "step": 604 }, { "epoch": 0.13495427169306268, "grad_norm": 0.18991845846176147, "learning_rate": 1.992973741518371e-05, "loss": 0.5577, "step": 605 }, { "epoch": 0.13517733660495204, "grad_norm": 0.18286292254924774, "learning_rate": 1.9929458647389333e-05, "loss": 0.5531, "step": 606 }, { "epoch": 0.1354004015168414, "grad_norm": 0.1848326176404953, "learning_rate": 1.9929179329639075e-05, "loss": 0.5387, "step": 607 }, { "epoch": 0.13562346642873077, "grad_norm": 0.17560309171676636, "learning_rate": 1.9928899461948407e-05, "loss": 0.5444, "step": 608 }, { "epoch": 0.1358465313406201, "grad_norm": 0.18911704421043396, "learning_rate": 1.9928619044332837e-05, "loss": 0.506, "step": 609 }, { "epoch": 0.13606959625250947, "grad_norm": 0.19542188942432404, "learning_rate": 1.9928338076807888e-05, "loss": 0.5222, "step": 610 }, { "epoch": 0.13629266116439884, "grad_norm": 0.16921466588974, "learning_rate": 1.9928056559389123e-05, "loss": 0.5095, "step": 611 }, { "epoch": 0.1365157260762882, "grad_norm": 0.19137442111968994, "learning_rate": 1.9927774492092137e-05, "loss": 0.5504, "step": 612 }, { "epoch": 0.13673879098817757, "grad_norm": 0.2050313800573349, "learning_rate": 1.9927491874932553e-05, "loss": 0.5695, "step": 613 }, { "epoch": 0.13696185590006693, "grad_norm": 0.18225258588790894, "learning_rate": 1.992720870792602e-05, "loss": 0.5286, "step": 614 }, { "epoch": 0.13718492081195627, "grad_norm": 0.18813620507717133, "learning_rate": 1.9926924991088226e-05, "loss": 0.5362, "step": 615 }, { "epoch": 0.13740798572384563, "grad_norm": 0.21695192158222198, "learning_rate": 1.9926640724434882e-05, "loss": 0.5548, "step": 616 }, { "epoch": 0.137631050635735, "grad_norm": 0.1780216544866562, "learning_rate": 1.9926355907981735e-05, "loss": 0.547, "step": 617 }, { "epoch": 0.13785411554762436, "grad_norm": 0.182960644364357, "learning_rate": 1.9926070541744557e-05, "loss": 0.524, "step": 618 }, { "epoch": 0.13807718045951373, "grad_norm": 0.18580903112888336, "learning_rate": 1.9925784625739157e-05, "loss": 0.5515, "step": 619 }, { "epoch": 0.1383002453714031, "grad_norm": 0.17631803452968597, "learning_rate": 1.9925498159981368e-05, "loss": 0.5269, "step": 620 }, { "epoch": 0.13852331028329243, "grad_norm": 0.1774953305721283, "learning_rate": 1.9925211144487057e-05, "loss": 0.5279, "step": 621 }, { "epoch": 0.1387463751951818, "grad_norm": 0.1902550458908081, "learning_rate": 1.992492357927212e-05, "loss": 0.5199, "step": 622 }, { "epoch": 0.13896944010707116, "grad_norm": 0.18148526549339294, "learning_rate": 1.9924635464352486e-05, "loss": 0.5273, "step": 623 }, { "epoch": 0.13919250501896052, "grad_norm": 0.18067635595798492, "learning_rate": 1.9924346799744108e-05, "loss": 0.5431, "step": 624 }, { "epoch": 0.1394155699308499, "grad_norm": 0.1752486228942871, "learning_rate": 1.992405758546298e-05, "loss": 0.5012, "step": 625 }, { "epoch": 0.13963863484273925, "grad_norm": 0.18866072595119476, "learning_rate": 1.992376782152512e-05, "loss": 0.5408, "step": 626 }, { "epoch": 0.1398616997546286, "grad_norm": 0.21433256566524506, "learning_rate": 1.9923477507946573e-05, "loss": 0.5625, "step": 627 }, { "epoch": 0.14008476466651795, "grad_norm": 0.17377375066280365, "learning_rate": 1.9923186644743425e-05, "loss": 0.516, "step": 628 }, { "epoch": 0.14030782957840732, "grad_norm": 0.19306115806102753, "learning_rate": 1.9922895231931775e-05, "loss": 0.5397, "step": 629 }, { "epoch": 0.14053089449029668, "grad_norm": 0.18144343793392181, "learning_rate": 1.992260326952777e-05, "loss": 0.5215, "step": 630 }, { "epoch": 0.14075395940218605, "grad_norm": 0.19684460759162903, "learning_rate": 1.9922310757547584e-05, "loss": 0.5472, "step": 631 }, { "epoch": 0.14097702431407538, "grad_norm": 0.18824423849582672, "learning_rate": 1.9922017696007413e-05, "loss": 0.5595, "step": 632 }, { "epoch": 0.14120008922596475, "grad_norm": 0.18633319437503815, "learning_rate": 1.992172408492349e-05, "loss": 0.568, "step": 633 }, { "epoch": 0.1414231541378541, "grad_norm": 0.18547959625720978, "learning_rate": 1.9921429924312074e-05, "loss": 0.5421, "step": 634 }, { "epoch": 0.14164621904974348, "grad_norm": 0.16735389828681946, "learning_rate": 1.9921135214189466e-05, "loss": 0.499, "step": 635 }, { "epoch": 0.14186928396163284, "grad_norm": 0.1912708729505539, "learning_rate": 1.992083995457198e-05, "loss": 0.5655, "step": 636 }, { "epoch": 0.1420923488735222, "grad_norm": 0.19804726541042328, "learning_rate": 1.9920544145475975e-05, "loss": 0.5639, "step": 637 }, { "epoch": 0.14231541378541154, "grad_norm": 0.17582711577415466, "learning_rate": 1.992024778691783e-05, "loss": 0.5577, "step": 638 }, { "epoch": 0.1425384786973009, "grad_norm": 0.18506725132465363, "learning_rate": 1.9919950878913962e-05, "loss": 0.5287, "step": 639 }, { "epoch": 0.14276154360919027, "grad_norm": 0.1914191097021103, "learning_rate": 1.9919653421480816e-05, "loss": 0.5125, "step": 640 }, { "epoch": 0.14298460852107964, "grad_norm": 0.17707392573356628, "learning_rate": 1.9919355414634864e-05, "loss": 0.5351, "step": 641 }, { "epoch": 0.143207673432969, "grad_norm": 0.1675226092338562, "learning_rate": 1.9919056858392618e-05, "loss": 0.5408, "step": 642 }, { "epoch": 0.14343073834485837, "grad_norm": 0.18883782625198364, "learning_rate": 1.9918757752770607e-05, "loss": 0.5387, "step": 643 }, { "epoch": 0.1436538032567477, "grad_norm": 0.18036174774169922, "learning_rate": 1.99184580977854e-05, "loss": 0.5444, "step": 644 }, { "epoch": 0.14387686816863707, "grad_norm": 0.1817016750574112, "learning_rate": 1.99181578934536e-05, "loss": 0.527, "step": 645 }, { "epoch": 0.14409993308052643, "grad_norm": 0.18899092078208923, "learning_rate": 1.991785713979182e-05, "loss": 0.5245, "step": 646 }, { "epoch": 0.1443229979924158, "grad_norm": 0.18643809854984283, "learning_rate": 1.991755583681673e-05, "loss": 0.5247, "step": 647 }, { "epoch": 0.14454606290430516, "grad_norm": 0.1903519332408905, "learning_rate": 1.9917253984545014e-05, "loss": 0.5497, "step": 648 }, { "epoch": 0.14476912781619453, "grad_norm": 0.1695372760295868, "learning_rate": 1.991695158299339e-05, "loss": 0.5535, "step": 649 }, { "epoch": 0.14499219272808386, "grad_norm": 0.19261977076530457, "learning_rate": 1.9916648632178605e-05, "loss": 0.5324, "step": 650 }, { "epoch": 0.14521525763997323, "grad_norm": 0.1795564889907837, "learning_rate": 1.9916345132117442e-05, "loss": 0.5316, "step": 651 }, { "epoch": 0.1454383225518626, "grad_norm": 0.27002382278442383, "learning_rate": 1.9916041082826713e-05, "loss": 0.5521, "step": 652 }, { "epoch": 0.14566138746375196, "grad_norm": 0.17605867981910706, "learning_rate": 1.9915736484323246e-05, "loss": 0.5228, "step": 653 }, { "epoch": 0.14588445237564132, "grad_norm": 0.1808420866727829, "learning_rate": 1.9915431336623928e-05, "loss": 0.5593, "step": 654 }, { "epoch": 0.14610751728753066, "grad_norm": 0.189579576253891, "learning_rate": 1.991512563974565e-05, "loss": 0.5405, "step": 655 }, { "epoch": 0.14633058219942002, "grad_norm": 0.1883007138967514, "learning_rate": 1.9914819393705342e-05, "loss": 0.5571, "step": 656 }, { "epoch": 0.1465536471113094, "grad_norm": 0.18864606320858002, "learning_rate": 1.9914512598519972e-05, "loss": 0.519, "step": 657 }, { "epoch": 0.14677671202319875, "grad_norm": 0.17059919238090515, "learning_rate": 1.9914205254206527e-05, "loss": 0.533, "step": 658 }, { "epoch": 0.14699977693508812, "grad_norm": 0.4270278215408325, "learning_rate": 1.9913897360782036e-05, "loss": 0.5282, "step": 659 }, { "epoch": 0.14722284184697748, "grad_norm": 0.18411633372306824, "learning_rate": 1.9913588918263545e-05, "loss": 0.5128, "step": 660 }, { "epoch": 0.14744590675886682, "grad_norm": 0.2003640979528427, "learning_rate": 1.9913279926668146e-05, "loss": 0.5213, "step": 661 }, { "epoch": 0.14766897167075618, "grad_norm": 0.18532606959342957, "learning_rate": 1.9912970386012943e-05, "loss": 0.498, "step": 662 }, { "epoch": 0.14789203658264555, "grad_norm": 0.1674346923828125, "learning_rate": 1.9912660296315083e-05, "loss": 0.5383, "step": 663 }, { "epoch": 0.1481151014945349, "grad_norm": 0.19120754301548004, "learning_rate": 1.9912349657591748e-05, "loss": 0.5301, "step": 664 }, { "epoch": 0.14833816640642428, "grad_norm": 0.178116112947464, "learning_rate": 1.9912038469860135e-05, "loss": 0.5583, "step": 665 }, { "epoch": 0.14856123131831364, "grad_norm": 0.1826133131980896, "learning_rate": 1.9911726733137484e-05, "loss": 0.5188, "step": 666 }, { "epoch": 0.14878429623020298, "grad_norm": 0.18221695721149445, "learning_rate": 1.991141444744106e-05, "loss": 0.5236, "step": 667 }, { "epoch": 0.14900736114209234, "grad_norm": 0.18043336272239685, "learning_rate": 1.9911101612788157e-05, "loss": 0.5201, "step": 668 }, { "epoch": 0.1492304260539817, "grad_norm": 0.19176125526428223, "learning_rate": 1.9910788229196104e-05, "loss": 0.5501, "step": 669 }, { "epoch": 0.14945349096587107, "grad_norm": 0.18294784426689148, "learning_rate": 1.9910474296682256e-05, "loss": 0.5304, "step": 670 }, { "epoch": 0.14967655587776044, "grad_norm": 0.19021780788898468, "learning_rate": 1.9910159815264e-05, "loss": 0.5561, "step": 671 }, { "epoch": 0.1498996207896498, "grad_norm": 0.18545284867286682, "learning_rate": 1.9909844784958762e-05, "loss": 0.536, "step": 672 }, { "epoch": 0.15012268570153914, "grad_norm": 0.2078470140695572, "learning_rate": 1.990952920578398e-05, "loss": 0.513, "step": 673 }, { "epoch": 0.1503457506134285, "grad_norm": 0.20093394815921783, "learning_rate": 1.9909213077757138e-05, "loss": 0.5406, "step": 674 }, { "epoch": 0.15056881552531787, "grad_norm": 0.212370365858078, "learning_rate": 1.9908896400895745e-05, "loss": 0.5108, "step": 675 }, { "epoch": 0.15079188043720723, "grad_norm": 0.1903519630432129, "learning_rate": 1.990857917521734e-05, "loss": 0.5163, "step": 676 }, { "epoch": 0.1510149453490966, "grad_norm": 0.18388496339321136, "learning_rate": 1.9908261400739494e-05, "loss": 0.5361, "step": 677 }, { "epoch": 0.15123801026098593, "grad_norm": 0.1817573606967926, "learning_rate": 1.9907943077479802e-05, "loss": 0.5582, "step": 678 }, { "epoch": 0.1514610751728753, "grad_norm": 0.19696584343910217, "learning_rate": 1.9907624205455903e-05, "loss": 0.5326, "step": 679 }, { "epoch": 0.15168414008476466, "grad_norm": 0.17569434642791748, "learning_rate": 1.990730478468545e-05, "loss": 0.5073, "step": 680 }, { "epoch": 0.15190720499665403, "grad_norm": 0.1812848299741745, "learning_rate": 1.9906984815186142e-05, "loss": 0.539, "step": 681 }, { "epoch": 0.1521302699085434, "grad_norm": 0.1694687008857727, "learning_rate": 1.9906664296975696e-05, "loss": 0.5286, "step": 682 }, { "epoch": 0.15235333482043276, "grad_norm": 0.18631073832511902, "learning_rate": 1.990634323007187e-05, "loss": 0.5807, "step": 683 }, { "epoch": 0.1525763997323221, "grad_norm": 0.18517658114433289, "learning_rate": 1.9906021614492438e-05, "loss": 0.5393, "step": 684 }, { "epoch": 0.15279946464421146, "grad_norm": 0.19088414311408997, "learning_rate": 1.990569945025522e-05, "loss": 0.5583, "step": 685 }, { "epoch": 0.15302252955610082, "grad_norm": 0.1954737901687622, "learning_rate": 1.9905376737378056e-05, "loss": 0.5348, "step": 686 }, { "epoch": 0.1532455944679902, "grad_norm": 0.1780342310667038, "learning_rate": 1.990505347587882e-05, "loss": 0.5083, "step": 687 }, { "epoch": 0.15346865937987955, "grad_norm": 0.18818046152591705, "learning_rate": 1.9904729665775417e-05, "loss": 0.51, "step": 688 }, { "epoch": 0.15369172429176892, "grad_norm": 0.1797480434179306, "learning_rate": 1.990440530708578e-05, "loss": 0.5114, "step": 689 }, { "epoch": 0.15391478920365825, "grad_norm": 0.18315070867538452, "learning_rate": 1.9904080399827883e-05, "loss": 0.5322, "step": 690 }, { "epoch": 0.15413785411554762, "grad_norm": 0.2024601846933365, "learning_rate": 1.990375494401971e-05, "loss": 0.5198, "step": 691 }, { "epoch": 0.15436091902743698, "grad_norm": 0.2151622325181961, "learning_rate": 1.990342893967929e-05, "loss": 0.5308, "step": 692 }, { "epoch": 0.15458398393932635, "grad_norm": 0.1977054476737976, "learning_rate": 1.990310238682468e-05, "loss": 0.579, "step": 693 }, { "epoch": 0.1548070488512157, "grad_norm": 0.1899496465921402, "learning_rate": 1.990277528547397e-05, "loss": 0.5435, "step": 694 }, { "epoch": 0.15503011376310508, "grad_norm": 0.18425041437149048, "learning_rate": 1.9902447635645273e-05, "loss": 0.5582, "step": 695 }, { "epoch": 0.1552531786749944, "grad_norm": 0.1868348866701126, "learning_rate": 1.9902119437356737e-05, "loss": 0.5208, "step": 696 }, { "epoch": 0.15547624358688378, "grad_norm": 0.19318504631519318, "learning_rate": 1.990179069062654e-05, "loss": 0.581, "step": 697 }, { "epoch": 0.15569930849877314, "grad_norm": 0.17516325414180756, "learning_rate": 1.990146139547289e-05, "loss": 0.522, "step": 698 }, { "epoch": 0.1559223734106625, "grad_norm": 0.18644174933433533, "learning_rate": 1.990113155191402e-05, "loss": 0.5361, "step": 699 }, { "epoch": 0.15614543832255187, "grad_norm": 0.204257071018219, "learning_rate": 1.9900801159968207e-05, "loss": 0.5216, "step": 700 }, { "epoch": 0.15636850323444124, "grad_norm": 0.19151191413402557, "learning_rate": 1.990047021965375e-05, "loss": 0.5476, "step": 701 }, { "epoch": 0.15659156814633057, "grad_norm": 0.18692530691623688, "learning_rate": 1.9900138730988976e-05, "loss": 0.5516, "step": 702 }, { "epoch": 0.15681463305821994, "grad_norm": 0.19439953565597534, "learning_rate": 1.9899806693992242e-05, "loss": 0.5404, "step": 703 }, { "epoch": 0.1570376979701093, "grad_norm": 0.1809748411178589, "learning_rate": 1.989947410868194e-05, "loss": 0.5571, "step": 704 }, { "epoch": 0.15726076288199867, "grad_norm": 0.19498126208782196, "learning_rate": 1.9899140975076495e-05, "loss": 0.5121, "step": 705 }, { "epoch": 0.15748382779388803, "grad_norm": 0.19838570058345795, "learning_rate": 1.9898807293194352e-05, "loss": 0.5535, "step": 706 }, { "epoch": 0.15770689270577737, "grad_norm": 0.18156638741493225, "learning_rate": 1.9898473063054e-05, "loss": 0.5275, "step": 707 }, { "epoch": 0.15792995761766673, "grad_norm": 0.1888674944639206, "learning_rate": 1.989813828467394e-05, "loss": 0.5274, "step": 708 }, { "epoch": 0.1581530225295561, "grad_norm": 0.2016175538301468, "learning_rate": 1.9897802958072722e-05, "loss": 0.5057, "step": 709 }, { "epoch": 0.15837608744144546, "grad_norm": 0.21129021048545837, "learning_rate": 1.989746708326892e-05, "loss": 0.5691, "step": 710 }, { "epoch": 0.15859915235333483, "grad_norm": 0.20351482927799225, "learning_rate": 1.9897130660281127e-05, "loss": 0.5513, "step": 711 }, { "epoch": 0.1588222172652242, "grad_norm": 0.2297467589378357, "learning_rate": 1.9896793689127988e-05, "loss": 0.5618, "step": 712 }, { "epoch": 0.15904528217711353, "grad_norm": 0.222783625125885, "learning_rate": 1.989645616982816e-05, "loss": 0.5336, "step": 713 }, { "epoch": 0.1592683470890029, "grad_norm": 0.1896630972623825, "learning_rate": 1.9896118102400334e-05, "loss": 0.5562, "step": 714 }, { "epoch": 0.15949141200089226, "grad_norm": 0.188043013215065, "learning_rate": 1.989577948686324e-05, "loss": 0.497, "step": 715 }, { "epoch": 0.15971447691278162, "grad_norm": 0.21135394275188446, "learning_rate": 1.9895440323235635e-05, "loss": 0.5714, "step": 716 }, { "epoch": 0.159937541824671, "grad_norm": 0.1962571144104004, "learning_rate": 1.98951006115363e-05, "loss": 0.5686, "step": 717 }, { "epoch": 0.16016060673656035, "grad_norm": 0.196267768740654, "learning_rate": 1.9894760351784047e-05, "loss": 0.557, "step": 718 }, { "epoch": 0.1603836716484497, "grad_norm": 0.1771324872970581, "learning_rate": 1.9894419543997724e-05, "loss": 0.5492, "step": 719 }, { "epoch": 0.16060673656033905, "grad_norm": 0.17055559158325195, "learning_rate": 1.9894078188196213e-05, "loss": 0.5311, "step": 720 }, { "epoch": 0.16082980147222842, "grad_norm": 0.18929804861545563, "learning_rate": 1.9893736284398414e-05, "loss": 0.5388, "step": 721 }, { "epoch": 0.16105286638411778, "grad_norm": 0.1930275857448578, "learning_rate": 1.9893393832623266e-05, "loss": 0.554, "step": 722 }, { "epoch": 0.16127593129600715, "grad_norm": 0.1900511384010315, "learning_rate": 1.9893050832889734e-05, "loss": 0.5446, "step": 723 }, { "epoch": 0.1614989962078965, "grad_norm": 0.17943377792835236, "learning_rate": 1.9892707285216816e-05, "loss": 0.541, "step": 724 }, { "epoch": 0.16172206111978585, "grad_norm": 0.18837152421474457, "learning_rate": 1.9892363189623546e-05, "loss": 0.5424, "step": 725 }, { "epoch": 0.1619451260316752, "grad_norm": 0.1987512707710266, "learning_rate": 1.989201854612897e-05, "loss": 0.5412, "step": 726 }, { "epoch": 0.16216819094356458, "grad_norm": 0.1869790405035019, "learning_rate": 1.9891673354752192e-05, "loss": 0.5139, "step": 727 }, { "epoch": 0.16239125585545394, "grad_norm": 0.17572949826717377, "learning_rate": 1.9891327615512315e-05, "loss": 0.5137, "step": 728 }, { "epoch": 0.1626143207673433, "grad_norm": 0.17914965748786926, "learning_rate": 1.9890981328428502e-05, "loss": 0.5416, "step": 729 }, { "epoch": 0.16283738567923264, "grad_norm": 0.17955482006072998, "learning_rate": 1.989063449351992e-05, "loss": 0.5307, "step": 730 }, { "epoch": 0.163060450591122, "grad_norm": 0.17228074371814728, "learning_rate": 1.9890287110805787e-05, "loss": 0.5179, "step": 731 }, { "epoch": 0.16328351550301137, "grad_norm": 0.19471096992492676, "learning_rate": 1.9889939180305343e-05, "loss": 0.5787, "step": 732 }, { "epoch": 0.16350658041490074, "grad_norm": 0.19749003648757935, "learning_rate": 1.9889590702037857e-05, "loss": 0.5369, "step": 733 }, { "epoch": 0.1637296453267901, "grad_norm": 0.17162089049816132, "learning_rate": 1.9889241676022628e-05, "loss": 0.5426, "step": 734 }, { "epoch": 0.16395271023867947, "grad_norm": 0.18129977583885193, "learning_rate": 1.988889210227899e-05, "loss": 0.5297, "step": 735 }, { "epoch": 0.1641757751505688, "grad_norm": 0.18625618517398834, "learning_rate": 1.9888541980826307e-05, "loss": 0.5169, "step": 736 }, { "epoch": 0.16439884006245817, "grad_norm": 0.21191106736660004, "learning_rate": 1.9888191311683966e-05, "loss": 0.5322, "step": 737 }, { "epoch": 0.16462190497434753, "grad_norm": 0.4791422486305237, "learning_rate": 1.988784009487139e-05, "loss": 0.5285, "step": 738 }, { "epoch": 0.1648449698862369, "grad_norm": 0.18190808594226837, "learning_rate": 1.9887488330408033e-05, "loss": 0.5627, "step": 739 }, { "epoch": 0.16506803479812626, "grad_norm": 0.17037086188793182, "learning_rate": 1.9887136018313374e-05, "loss": 0.5329, "step": 740 }, { "epoch": 0.16529109971001563, "grad_norm": 0.18794186413288116, "learning_rate": 1.9886783158606934e-05, "loss": 0.5469, "step": 741 }, { "epoch": 0.16551416462190496, "grad_norm": 0.19635480642318726, "learning_rate": 1.9886429751308252e-05, "loss": 0.5707, "step": 742 }, { "epoch": 0.16573722953379433, "grad_norm": 0.1907324194908142, "learning_rate": 1.9886075796436902e-05, "loss": 0.5676, "step": 743 }, { "epoch": 0.1659602944456837, "grad_norm": 0.1943301409482956, "learning_rate": 1.9885721294012487e-05, "loss": 0.5653, "step": 744 }, { "epoch": 0.16618335935757306, "grad_norm": 0.19706295430660248, "learning_rate": 1.9885366244054646e-05, "loss": 0.5258, "step": 745 }, { "epoch": 0.16640642426946242, "grad_norm": 0.2110588550567627, "learning_rate": 1.9885010646583038e-05, "loss": 0.5509, "step": 746 }, { "epoch": 0.1666294891813518, "grad_norm": 0.18079644441604614, "learning_rate": 1.988465450161736e-05, "loss": 0.5299, "step": 747 }, { "epoch": 0.16685255409324112, "grad_norm": 0.2338915318250656, "learning_rate": 1.988429780917734e-05, "loss": 0.5637, "step": 748 }, { "epoch": 0.1670756190051305, "grad_norm": 0.18777087330818176, "learning_rate": 1.9883940569282737e-05, "loss": 0.5502, "step": 749 }, { "epoch": 0.16729868391701985, "grad_norm": 0.17644049227237701, "learning_rate": 1.988358278195333e-05, "loss": 0.5477, "step": 750 }, { "epoch": 0.16752174882890922, "grad_norm": 0.1912970095872879, "learning_rate": 1.9883224447208936e-05, "loss": 0.5447, "step": 751 }, { "epoch": 0.16774481374079858, "grad_norm": 0.1746956706047058, "learning_rate": 1.9882865565069408e-05, "loss": 0.5425, "step": 752 }, { "epoch": 0.16796787865268792, "grad_norm": 0.18779419362545013, "learning_rate": 1.9882506135554614e-05, "loss": 0.5329, "step": 753 }, { "epoch": 0.16819094356457728, "grad_norm": 0.21110400557518005, "learning_rate": 1.9882146158684473e-05, "loss": 0.5421, "step": 754 }, { "epoch": 0.16841400847646665, "grad_norm": 0.17395779490470886, "learning_rate": 1.9881785634478915e-05, "loss": 0.5321, "step": 755 }, { "epoch": 0.168637073388356, "grad_norm": 0.1749841570854187, "learning_rate": 1.988142456295791e-05, "loss": 0.522, "step": 756 }, { "epoch": 0.16886013830024538, "grad_norm": 0.17398926615715027, "learning_rate": 1.988106294414145e-05, "loss": 0.5563, "step": 757 }, { "epoch": 0.16908320321213474, "grad_norm": 0.1813582479953766, "learning_rate": 1.9880700778049575e-05, "loss": 0.5242, "step": 758 }, { "epoch": 0.16930626812402408, "grad_norm": 0.18003934621810913, "learning_rate": 1.9880338064702337e-05, "loss": 0.5468, "step": 759 }, { "epoch": 0.16952933303591344, "grad_norm": 0.17160138487815857, "learning_rate": 1.9879974804119827e-05, "loss": 0.5321, "step": 760 }, { "epoch": 0.1697523979478028, "grad_norm": 0.19145694375038147, "learning_rate": 1.9879610996322168e-05, "loss": 0.5365, "step": 761 }, { "epoch": 0.16997546285969217, "grad_norm": 0.18664413690567017, "learning_rate": 1.9879246641329505e-05, "loss": 0.5278, "step": 762 }, { "epoch": 0.17019852777158154, "grad_norm": 0.18776154518127441, "learning_rate": 1.987888173916202e-05, "loss": 0.5639, "step": 763 }, { "epoch": 0.1704215926834709, "grad_norm": 0.17712879180908203, "learning_rate": 1.9878516289839923e-05, "loss": 0.5119, "step": 764 }, { "epoch": 0.17064465759536024, "grad_norm": 0.1806483119726181, "learning_rate": 1.9878150293383457e-05, "loss": 0.5507, "step": 765 }, { "epoch": 0.1708677225072496, "grad_norm": 0.18238821625709534, "learning_rate": 1.9877783749812892e-05, "loss": 0.5581, "step": 766 }, { "epoch": 0.17109078741913897, "grad_norm": 0.17094913125038147, "learning_rate": 1.9877416659148525e-05, "loss": 0.5502, "step": 767 }, { "epoch": 0.17131385233102833, "grad_norm": 0.18648453056812286, "learning_rate": 1.9877049021410696e-05, "loss": 0.5568, "step": 768 }, { "epoch": 0.1715369172429177, "grad_norm": 0.20364215970039368, "learning_rate": 1.9876680836619762e-05, "loss": 0.5233, "step": 769 }, { "epoch": 0.17175998215480706, "grad_norm": 0.17506776750087738, "learning_rate": 1.9876312104796117e-05, "loss": 0.5479, "step": 770 }, { "epoch": 0.1719830470666964, "grad_norm": 0.17632229626178741, "learning_rate": 1.9875942825960183e-05, "loss": 0.5241, "step": 771 }, { "epoch": 0.17220611197858576, "grad_norm": 0.18557725846767426, "learning_rate": 1.9875573000132414e-05, "loss": 0.5794, "step": 772 }, { "epoch": 0.17242917689047513, "grad_norm": 0.1729423552751541, "learning_rate": 1.987520262733329e-05, "loss": 0.5247, "step": 773 }, { "epoch": 0.1726522418023645, "grad_norm": 0.18643899261951447, "learning_rate": 1.9874831707583328e-05, "loss": 0.5305, "step": 774 }, { "epoch": 0.17287530671425386, "grad_norm": 0.17992724478244781, "learning_rate": 1.987446024090307e-05, "loss": 0.5548, "step": 775 }, { "epoch": 0.17309837162614322, "grad_norm": 0.19330288469791412, "learning_rate": 1.9874088227313093e-05, "loss": 0.5151, "step": 776 }, { "epoch": 0.17332143653803256, "grad_norm": 0.18407322466373444, "learning_rate": 1.9873715666834e-05, "loss": 0.5262, "step": 777 }, { "epoch": 0.17354450144992192, "grad_norm": 0.17531730234622955, "learning_rate": 1.987334255948642e-05, "loss": 0.51, "step": 778 }, { "epoch": 0.1737675663618113, "grad_norm": 0.1791767179965973, "learning_rate": 1.987296890529103e-05, "loss": 0.5217, "step": 779 }, { "epoch": 0.17399063127370065, "grad_norm": 0.17693190276622772, "learning_rate": 1.9872594704268516e-05, "loss": 0.5346, "step": 780 }, { "epoch": 0.17421369618559002, "grad_norm": 0.17644071578979492, "learning_rate": 1.9872219956439607e-05, "loss": 0.5335, "step": 781 }, { "epoch": 0.17443676109747935, "grad_norm": 0.17572622001171112, "learning_rate": 1.987184466182506e-05, "loss": 0.5302, "step": 782 }, { "epoch": 0.17465982600936872, "grad_norm": 0.18111130595207214, "learning_rate": 1.987146882044565e-05, "loss": 0.5205, "step": 783 }, { "epoch": 0.17488289092125808, "grad_norm": 0.17094580829143524, "learning_rate": 1.987109243232221e-05, "loss": 0.527, "step": 784 }, { "epoch": 0.17510595583314745, "grad_norm": 0.1917412430047989, "learning_rate": 1.9870715497475583e-05, "loss": 0.5289, "step": 785 }, { "epoch": 0.1753290207450368, "grad_norm": 0.19271767139434814, "learning_rate": 1.9870338015926634e-05, "loss": 0.5123, "step": 786 }, { "epoch": 0.17555208565692618, "grad_norm": 0.19731736183166504, "learning_rate": 1.9869959987696282e-05, "loss": 0.543, "step": 787 }, { "epoch": 0.1757751505688155, "grad_norm": 0.172173410654068, "learning_rate": 1.9869581412805462e-05, "loss": 0.5211, "step": 788 }, { "epoch": 0.17599821548070488, "grad_norm": 0.1778416931629181, "learning_rate": 1.9869202291275144e-05, "loss": 0.5168, "step": 789 }, { "epoch": 0.17622128039259424, "grad_norm": 0.1906319111585617, "learning_rate": 1.986882262312632e-05, "loss": 0.5502, "step": 790 }, { "epoch": 0.1764443453044836, "grad_norm": 0.18600904941558838, "learning_rate": 1.986844240838002e-05, "loss": 0.5093, "step": 791 }, { "epoch": 0.17666741021637297, "grad_norm": 0.17453651130199432, "learning_rate": 1.986806164705731e-05, "loss": 0.5316, "step": 792 }, { "epoch": 0.17689047512826234, "grad_norm": 0.18647123873233795, "learning_rate": 1.9867680339179268e-05, "loss": 0.5293, "step": 793 }, { "epoch": 0.17711354004015167, "grad_norm": 0.18260005116462708, "learning_rate": 1.9867298484767022e-05, "loss": 0.5429, "step": 794 }, { "epoch": 0.17733660495204104, "grad_norm": 0.1722402721643448, "learning_rate": 1.9866916083841715e-05, "loss": 0.5211, "step": 795 }, { "epoch": 0.1775596698639304, "grad_norm": 0.16579583287239075, "learning_rate": 1.9866533136424537e-05, "loss": 0.5173, "step": 796 }, { "epoch": 0.17778273477581977, "grad_norm": 0.18849937617778778, "learning_rate": 1.9866149642536683e-05, "loss": 0.5482, "step": 797 }, { "epoch": 0.17800579968770913, "grad_norm": 0.1786874532699585, "learning_rate": 1.98657656021994e-05, "loss": 0.5203, "step": 798 }, { "epoch": 0.1782288645995985, "grad_norm": 0.1848941594362259, "learning_rate": 1.986538101543397e-05, "loss": 0.5248, "step": 799 }, { "epoch": 0.17845192951148783, "grad_norm": 0.1833941787481308, "learning_rate": 1.9864995882261674e-05, "loss": 0.5192, "step": 800 }, { "epoch": 0.1786749944233772, "grad_norm": 0.1813460886478424, "learning_rate": 1.9864610202703858e-05, "loss": 0.5307, "step": 801 }, { "epoch": 0.17889805933526656, "grad_norm": 0.18154440820217133, "learning_rate": 1.9864223976781876e-05, "loss": 0.5468, "step": 802 }, { "epoch": 0.17912112424715593, "grad_norm": 0.18413223326206207, "learning_rate": 1.9863837204517124e-05, "loss": 0.5376, "step": 803 }, { "epoch": 0.1793441891590453, "grad_norm": 0.17483587563037872, "learning_rate": 1.986344988593102e-05, "loss": 0.5333, "step": 804 }, { "epoch": 0.17956725407093463, "grad_norm": 0.17205438017845154, "learning_rate": 1.9863062021045017e-05, "loss": 0.4933, "step": 805 }, { "epoch": 0.179790318982824, "grad_norm": 0.17578768730163574, "learning_rate": 1.98626736098806e-05, "loss": 0.5153, "step": 806 }, { "epoch": 0.18001338389471336, "grad_norm": 0.1773952841758728, "learning_rate": 1.9862284652459275e-05, "loss": 0.5265, "step": 807 }, { "epoch": 0.18023644880660272, "grad_norm": 0.17616188526153564, "learning_rate": 1.9861895148802594e-05, "loss": 0.5438, "step": 808 }, { "epoch": 0.1804595137184921, "grad_norm": 0.1791033297777176, "learning_rate": 1.9861505098932127e-05, "loss": 0.5294, "step": 809 }, { "epoch": 0.18068257863038145, "grad_norm": 0.18261539936065674, "learning_rate": 1.986111450286947e-05, "loss": 0.548, "step": 810 }, { "epoch": 0.1809056435422708, "grad_norm": 0.1767009198665619, "learning_rate": 1.986072336063627e-05, "loss": 0.5278, "step": 811 }, { "epoch": 0.18112870845416015, "grad_norm": 0.19362005591392517, "learning_rate": 1.9860331672254182e-05, "loss": 0.5206, "step": 812 }, { "epoch": 0.18135177336604952, "grad_norm": 0.185451477766037, "learning_rate": 1.98599394377449e-05, "loss": 0.5513, "step": 813 }, { "epoch": 0.18157483827793888, "grad_norm": 0.17266923189163208, "learning_rate": 1.985954665713015e-05, "loss": 0.5648, "step": 814 }, { "epoch": 0.18179790318982825, "grad_norm": 0.170868918299675, "learning_rate": 1.9859153330431692e-05, "loss": 0.5343, "step": 815 }, { "epoch": 0.1820209681017176, "grad_norm": 0.17769230902194977, "learning_rate": 1.98587594576713e-05, "loss": 0.5539, "step": 816 }, { "epoch": 0.18224403301360695, "grad_norm": 0.16811443865299225, "learning_rate": 1.9858365038870803e-05, "loss": 0.5209, "step": 817 }, { "epoch": 0.1824670979254963, "grad_norm": 0.1751745492219925, "learning_rate": 1.985797007405203e-05, "loss": 0.5509, "step": 818 }, { "epoch": 0.18269016283738568, "grad_norm": 0.1694861203432083, "learning_rate": 1.985757456323687e-05, "loss": 0.5505, "step": 819 }, { "epoch": 0.18291322774927504, "grad_norm": 0.1734897792339325, "learning_rate": 1.985717850644722e-05, "loss": 0.5022, "step": 820 }, { "epoch": 0.1831362926611644, "grad_norm": 0.18104127049446106, "learning_rate": 1.9856781903705026e-05, "loss": 0.5434, "step": 821 }, { "epoch": 0.18335935757305377, "grad_norm": 0.17157487571239471, "learning_rate": 1.9856384755032245e-05, "loss": 0.5356, "step": 822 }, { "epoch": 0.1835824224849431, "grad_norm": 0.1781257838010788, "learning_rate": 1.985598706045088e-05, "loss": 0.522, "step": 823 }, { "epoch": 0.18380548739683247, "grad_norm": 0.16910432279109955, "learning_rate": 1.985558881998295e-05, "loss": 0.5324, "step": 824 }, { "epoch": 0.18402855230872184, "grad_norm": 0.180936798453331, "learning_rate": 1.985519003365052e-05, "loss": 0.5307, "step": 825 }, { "epoch": 0.1842516172206112, "grad_norm": 0.1872517466545105, "learning_rate": 1.9854790701475676e-05, "loss": 0.5667, "step": 826 }, { "epoch": 0.18447468213250057, "grad_norm": 0.16592343151569366, "learning_rate": 1.985439082348053e-05, "loss": 0.4938, "step": 827 }, { "epoch": 0.1846977470443899, "grad_norm": 0.18184059858322144, "learning_rate": 1.9853990399687237e-05, "loss": 0.536, "step": 828 }, { "epoch": 0.18492081195627927, "grad_norm": 0.1687706708908081, "learning_rate": 1.985358943011797e-05, "loss": 0.4902, "step": 829 }, { "epoch": 0.18514387686816863, "grad_norm": 0.18062466382980347, "learning_rate": 1.985318791479494e-05, "loss": 0.5235, "step": 830 }, { "epoch": 0.185366941780058, "grad_norm": 0.1865355372428894, "learning_rate": 1.985278585374038e-05, "loss": 0.5439, "step": 831 }, { "epoch": 0.18559000669194736, "grad_norm": 0.21087662875652313, "learning_rate": 1.985238324697657e-05, "loss": 0.5338, "step": 832 }, { "epoch": 0.18581307160383673, "grad_norm": 0.18593360483646393, "learning_rate": 1.9851980094525795e-05, "loss": 0.5139, "step": 833 }, { "epoch": 0.18603613651572606, "grad_norm": 0.17105735838413239, "learning_rate": 1.9851576396410395e-05, "loss": 0.5385, "step": 834 }, { "epoch": 0.18625920142761543, "grad_norm": 0.1764969825744629, "learning_rate": 1.9851172152652722e-05, "loss": 0.5381, "step": 835 }, { "epoch": 0.1864822663395048, "grad_norm": 0.211869478225708, "learning_rate": 1.985076736327517e-05, "loss": 0.512, "step": 836 }, { "epoch": 0.18670533125139416, "grad_norm": 0.17822885513305664, "learning_rate": 1.9850362028300162e-05, "loss": 0.5394, "step": 837 }, { "epoch": 0.18692839616328352, "grad_norm": 0.17853499948978424, "learning_rate": 1.9849956147750137e-05, "loss": 0.5273, "step": 838 }, { "epoch": 0.1871514610751729, "grad_norm": 0.1717289835214615, "learning_rate": 1.9849549721647586e-05, "loss": 0.5126, "step": 839 }, { "epoch": 0.18737452598706222, "grad_norm": 0.167042076587677, "learning_rate": 1.9849142750015014e-05, "loss": 0.5154, "step": 840 }, { "epoch": 0.1875975908989516, "grad_norm": 0.1893247812986374, "learning_rate": 1.9848735232874966e-05, "loss": 0.5205, "step": 841 }, { "epoch": 0.18782065581084095, "grad_norm": 0.18898561596870422, "learning_rate": 1.984832717025001e-05, "loss": 0.5513, "step": 842 }, { "epoch": 0.18804372072273032, "grad_norm": 0.18172813951969147, "learning_rate": 1.984791856216274e-05, "loss": 0.5056, "step": 843 }, { "epoch": 0.18826678563461968, "grad_norm": 0.1696682870388031, "learning_rate": 1.98475094086358e-05, "loss": 0.5313, "step": 844 }, { "epoch": 0.18848985054650905, "grad_norm": 0.1731012910604477, "learning_rate": 1.9847099709691843e-05, "loss": 0.5108, "step": 845 }, { "epoch": 0.18871291545839838, "grad_norm": 0.18047916889190674, "learning_rate": 1.9846689465353563e-05, "loss": 0.5108, "step": 846 }, { "epoch": 0.18893598037028775, "grad_norm": 0.17469045519828796, "learning_rate": 1.9846278675643684e-05, "loss": 0.5273, "step": 847 }, { "epoch": 0.1891590452821771, "grad_norm": 0.18713539838790894, "learning_rate": 1.9845867340584957e-05, "loss": 0.5587, "step": 848 }, { "epoch": 0.18938211019406648, "grad_norm": 0.2040695995092392, "learning_rate": 1.984545546020016e-05, "loss": 0.5651, "step": 849 }, { "epoch": 0.18960517510595584, "grad_norm": 0.24084708094596863, "learning_rate": 1.984504303451211e-05, "loss": 0.5186, "step": 850 }, { "epoch": 0.1898282400178452, "grad_norm": 0.18628886342048645, "learning_rate": 1.9844630063543655e-05, "loss": 0.5091, "step": 851 }, { "epoch": 0.19005130492973454, "grad_norm": 0.18985594809055328, "learning_rate": 1.9844216547317656e-05, "loss": 0.5457, "step": 852 }, { "epoch": 0.1902743698416239, "grad_norm": 0.1807604879140854, "learning_rate": 1.9843802485857028e-05, "loss": 0.5137, "step": 853 }, { "epoch": 0.19049743475351327, "grad_norm": 0.1864684522151947, "learning_rate": 1.984338787918469e-05, "loss": 0.5232, "step": 854 }, { "epoch": 0.19072049966540264, "grad_norm": 0.17398470640182495, "learning_rate": 1.984297272732362e-05, "loss": 0.5207, "step": 855 }, { "epoch": 0.190943564577292, "grad_norm": 0.21071763336658478, "learning_rate": 1.9842557030296804e-05, "loss": 0.5032, "step": 856 }, { "epoch": 0.19116662948918134, "grad_norm": 0.17372627556324005, "learning_rate": 1.9842140788127264e-05, "loss": 0.4992, "step": 857 }, { "epoch": 0.1913896944010707, "grad_norm": 0.18683859705924988, "learning_rate": 1.9841724000838064e-05, "loss": 0.5342, "step": 858 }, { "epoch": 0.19161275931296007, "grad_norm": 0.18670214712619781, "learning_rate": 1.9841306668452275e-05, "loss": 0.521, "step": 859 }, { "epoch": 0.19183582422484943, "grad_norm": 0.18776099383831024, "learning_rate": 1.9840888790993023e-05, "loss": 0.5164, "step": 860 }, { "epoch": 0.1920588891367388, "grad_norm": 0.20680485665798187, "learning_rate": 1.9840470368483448e-05, "loss": 0.5433, "step": 861 }, { "epoch": 0.19228195404862816, "grad_norm": 0.18562763929367065, "learning_rate": 1.9840051400946724e-05, "loss": 0.5225, "step": 862 }, { "epoch": 0.1925050189605175, "grad_norm": 0.1958203911781311, "learning_rate": 1.9839631888406055e-05, "loss": 0.5333, "step": 863 }, { "epoch": 0.19272808387240686, "grad_norm": 0.1797972470521927, "learning_rate": 1.9839211830884682e-05, "loss": 0.5259, "step": 864 }, { "epoch": 0.19295114878429623, "grad_norm": 0.18006913363933563, "learning_rate": 1.9838791228405866e-05, "loss": 0.5355, "step": 865 }, { "epoch": 0.1931742136961856, "grad_norm": 0.19286584854125977, "learning_rate": 1.9838370080992902e-05, "loss": 0.5548, "step": 866 }, { "epoch": 0.19339727860807496, "grad_norm": 0.17611972987651825, "learning_rate": 1.9837948388669118e-05, "loss": 0.4975, "step": 867 }, { "epoch": 0.19362034351996432, "grad_norm": 0.17204061150550842, "learning_rate": 1.983752615145787e-05, "loss": 0.5353, "step": 868 }, { "epoch": 0.19384340843185366, "grad_norm": 0.17773252725601196, "learning_rate": 1.9837103369382542e-05, "loss": 0.5621, "step": 869 }, { "epoch": 0.19406647334374302, "grad_norm": 0.17241743206977844, "learning_rate": 1.983668004246655e-05, "loss": 0.5514, "step": 870 }, { "epoch": 0.1942895382556324, "grad_norm": 0.1765352487564087, "learning_rate": 1.9836256170733343e-05, "loss": 0.5262, "step": 871 }, { "epoch": 0.19451260316752175, "grad_norm": 0.18487049639225006, "learning_rate": 1.98358317542064e-05, "loss": 0.4941, "step": 872 }, { "epoch": 0.19473566807941112, "grad_norm": 0.17895649373531342, "learning_rate": 1.983540679290922e-05, "loss": 0.5501, "step": 873 }, { "epoch": 0.19495873299130048, "grad_norm": 0.1671830266714096, "learning_rate": 1.9834981286865343e-05, "loss": 0.5207, "step": 874 }, { "epoch": 0.19518179790318982, "grad_norm": 0.18288585543632507, "learning_rate": 1.9834555236098344e-05, "loss": 0.5405, "step": 875 }, { "epoch": 0.19540486281507918, "grad_norm": 0.2788625657558441, "learning_rate": 1.983412864063181e-05, "loss": 0.5029, "step": 876 }, { "epoch": 0.19562792772696855, "grad_norm": 0.17516101896762848, "learning_rate": 1.983370150048938e-05, "loss": 0.5181, "step": 877 }, { "epoch": 0.1958509926388579, "grad_norm": 0.18724878132343292, "learning_rate": 1.9833273815694695e-05, "loss": 0.5399, "step": 878 }, { "epoch": 0.19607405755074728, "grad_norm": 0.1845855563879013, "learning_rate": 1.9832845586271456e-05, "loss": 0.5493, "step": 879 }, { "epoch": 0.19629712246263661, "grad_norm": 0.1939295083284378, "learning_rate": 1.9832416812243377e-05, "loss": 0.5453, "step": 880 }, { "epoch": 0.19652018737452598, "grad_norm": 0.17018291354179382, "learning_rate": 1.9831987493634207e-05, "loss": 0.5096, "step": 881 }, { "epoch": 0.19674325228641534, "grad_norm": 0.18090112507343292, "learning_rate": 1.9831557630467725e-05, "loss": 0.5519, "step": 882 }, { "epoch": 0.1969663171983047, "grad_norm": 0.19264590740203857, "learning_rate": 1.983112722276774e-05, "loss": 0.5469, "step": 883 }, { "epoch": 0.19718938211019407, "grad_norm": 0.185777947306633, "learning_rate": 1.9830696270558084e-05, "loss": 0.5484, "step": 884 }, { "epoch": 0.19741244702208344, "grad_norm": 0.15953023731708527, "learning_rate": 1.9830264773862633e-05, "loss": 0.5139, "step": 885 }, { "epoch": 0.19763551193397277, "grad_norm": 0.17809630930423737, "learning_rate": 1.9829832732705284e-05, "loss": 0.5178, "step": 886 }, { "epoch": 0.19785857684586214, "grad_norm": 0.20061112940311432, "learning_rate": 1.982940014710997e-05, "loss": 0.5393, "step": 887 }, { "epoch": 0.1980816417577515, "grad_norm": 0.1748671978712082, "learning_rate": 1.9828967017100642e-05, "loss": 0.5332, "step": 888 }, { "epoch": 0.19830470666964087, "grad_norm": 0.1713806539773941, "learning_rate": 1.9828533342701296e-05, "loss": 0.5165, "step": 889 }, { "epoch": 0.19852777158153023, "grad_norm": 0.16470858454704285, "learning_rate": 1.9828099123935948e-05, "loss": 0.5133, "step": 890 }, { "epoch": 0.1987508364934196, "grad_norm": 0.17374449968338013, "learning_rate": 1.9827664360828647e-05, "loss": 0.5475, "step": 891 }, { "epoch": 0.19897390140530893, "grad_norm": 0.1877843141555786, "learning_rate": 1.982722905340348e-05, "loss": 0.5253, "step": 892 }, { "epoch": 0.1991969663171983, "grad_norm": 0.17328353226184845, "learning_rate": 1.982679320168455e-05, "loss": 0.5219, "step": 893 }, { "epoch": 0.19942003122908766, "grad_norm": 0.17716079950332642, "learning_rate": 1.9826356805696e-05, "loss": 0.535, "step": 894 }, { "epoch": 0.19964309614097703, "grad_norm": 0.17511911690235138, "learning_rate": 1.9825919865462004e-05, "loss": 0.548, "step": 895 }, { "epoch": 0.1998661610528664, "grad_norm": 0.1711902767419815, "learning_rate": 1.9825482381006752e-05, "loss": 0.5402, "step": 896 }, { "epoch": 0.20008922596475576, "grad_norm": 0.17732638120651245, "learning_rate": 1.9825044352354482e-05, "loss": 0.5672, "step": 897 }, { "epoch": 0.2003122908766451, "grad_norm": 0.17266635596752167, "learning_rate": 1.9824605779529456e-05, "loss": 0.5312, "step": 898 }, { "epoch": 0.20053535578853446, "grad_norm": 0.16797249019145966, "learning_rate": 1.982416666255596e-05, "loss": 0.5294, "step": 899 }, { "epoch": 0.20075842070042382, "grad_norm": 0.17062917351722717, "learning_rate": 1.9823727001458318e-05, "loss": 0.51, "step": 900 }, { "epoch": 0.2009814856123132, "grad_norm": 0.17725898325443268, "learning_rate": 1.9823286796260887e-05, "loss": 0.5284, "step": 901 }, { "epoch": 0.20120455052420255, "grad_norm": 0.17662313580513, "learning_rate": 1.9822846046988037e-05, "loss": 0.515, "step": 902 }, { "epoch": 0.2014276154360919, "grad_norm": 0.17648808658123016, "learning_rate": 1.9822404753664183e-05, "loss": 0.5437, "step": 903 }, { "epoch": 0.20165068034798125, "grad_norm": 0.17179900407791138, "learning_rate": 1.982196291631377e-05, "loss": 0.5332, "step": 904 }, { "epoch": 0.20187374525987062, "grad_norm": 0.19034990668296814, "learning_rate": 1.982152053496127e-05, "loss": 0.5279, "step": 905 }, { "epoch": 0.20209681017175998, "grad_norm": 0.17066267132759094, "learning_rate": 1.9821077609631184e-05, "loss": 0.5473, "step": 906 }, { "epoch": 0.20231987508364935, "grad_norm": 0.1806066334247589, "learning_rate": 1.982063414034804e-05, "loss": 0.5495, "step": 907 }, { "epoch": 0.2025429399955387, "grad_norm": 0.17627598345279694, "learning_rate": 1.9820190127136403e-05, "loss": 0.5469, "step": 908 }, { "epoch": 0.20276600490742805, "grad_norm": 0.18282422423362732, "learning_rate": 1.9819745570020867e-05, "loss": 0.5228, "step": 909 }, { "epoch": 0.2029890698193174, "grad_norm": 0.17409084737300873, "learning_rate": 1.981930046902605e-05, "loss": 0.5183, "step": 910 }, { "epoch": 0.20321213473120678, "grad_norm": 0.1846904307603836, "learning_rate": 1.9818854824176612e-05, "loss": 0.5198, "step": 911 }, { "epoch": 0.20343519964309614, "grad_norm": 0.18149109184741974, "learning_rate": 1.9818408635497224e-05, "loss": 0.5078, "step": 912 }, { "epoch": 0.2036582645549855, "grad_norm": 0.18114425241947174, "learning_rate": 1.981796190301261e-05, "loss": 0.531, "step": 913 }, { "epoch": 0.20388132946687487, "grad_norm": 0.1718929558992386, "learning_rate": 1.981751462674751e-05, "loss": 0.4939, "step": 914 }, { "epoch": 0.2041043943787642, "grad_norm": 0.1930830329656601, "learning_rate": 1.9817066806726695e-05, "loss": 0.5055, "step": 915 }, { "epoch": 0.20432745929065357, "grad_norm": 0.17051468789577484, "learning_rate": 1.9816618442974964e-05, "loss": 0.5058, "step": 916 }, { "epoch": 0.20455052420254294, "grad_norm": 0.1774812787771225, "learning_rate": 1.9816169535517157e-05, "loss": 0.5341, "step": 917 }, { "epoch": 0.2047735891144323, "grad_norm": 0.17879648506641388, "learning_rate": 1.9815720084378134e-05, "loss": 0.534, "step": 918 }, { "epoch": 0.20499665402632167, "grad_norm": 0.1766035556793213, "learning_rate": 1.9815270089582795e-05, "loss": 0.5407, "step": 919 }, { "epoch": 0.20521971893821103, "grad_norm": 0.17892096936702728, "learning_rate": 1.981481955115605e-05, "loss": 0.4676, "step": 920 }, { "epoch": 0.20544278385010037, "grad_norm": 0.17289894819259644, "learning_rate": 1.9814368469122866e-05, "loss": 0.5416, "step": 921 }, { "epoch": 0.20566584876198973, "grad_norm": 0.1802949607372284, "learning_rate": 1.981391684350822e-05, "loss": 0.5425, "step": 922 }, { "epoch": 0.2058889136738791, "grad_norm": 0.17635071277618408, "learning_rate": 1.9813464674337126e-05, "loss": 0.5294, "step": 923 }, { "epoch": 0.20611197858576846, "grad_norm": 0.1723651885986328, "learning_rate": 1.981301196163463e-05, "loss": 0.5045, "step": 924 }, { "epoch": 0.20633504349765783, "grad_norm": 0.18130990862846375, "learning_rate": 1.9812558705425805e-05, "loss": 0.5264, "step": 925 }, { "epoch": 0.2065581084095472, "grad_norm": 0.1879456490278244, "learning_rate": 1.9812104905735756e-05, "loss": 0.5215, "step": 926 }, { "epoch": 0.20678117332143653, "grad_norm": 0.1742536425590515, "learning_rate": 1.9811650562589616e-05, "loss": 0.5093, "step": 927 }, { "epoch": 0.2070042382333259, "grad_norm": 0.18259446322917938, "learning_rate": 1.981119567601255e-05, "loss": 0.5528, "step": 928 }, { "epoch": 0.20722730314521526, "grad_norm": 0.19506299495697021, "learning_rate": 1.9810740246029755e-05, "loss": 0.5338, "step": 929 }, { "epoch": 0.20745036805710462, "grad_norm": 0.166092187166214, "learning_rate": 1.981028427266645e-05, "loss": 0.5308, "step": 930 }, { "epoch": 0.207673432968994, "grad_norm": 0.18397627770900726, "learning_rate": 1.980982775594789e-05, "loss": 0.5593, "step": 931 }, { "epoch": 0.20789649788088332, "grad_norm": 0.18404512107372284, "learning_rate": 1.980937069589937e-05, "loss": 0.5004, "step": 932 }, { "epoch": 0.2081195627927727, "grad_norm": 0.1727455109357834, "learning_rate": 1.9808913092546195e-05, "loss": 0.5245, "step": 933 }, { "epoch": 0.20834262770466205, "grad_norm": 0.18410643935203552, "learning_rate": 1.980845494591371e-05, "loss": 0.532, "step": 934 }, { "epoch": 0.20856569261655142, "grad_norm": 0.19003835320472717, "learning_rate": 1.9807996256027296e-05, "loss": 0.5129, "step": 935 }, { "epoch": 0.20878875752844078, "grad_norm": 0.23560728132724762, "learning_rate": 1.980753702291235e-05, "loss": 0.5484, "step": 936 }, { "epoch": 0.20901182244033015, "grad_norm": 0.17955288290977478, "learning_rate": 1.9807077246594316e-05, "loss": 0.5198, "step": 937 }, { "epoch": 0.20923488735221948, "grad_norm": 0.19580209255218506, "learning_rate": 1.9806616927098653e-05, "loss": 0.5023, "step": 938 }, { "epoch": 0.20945795226410885, "grad_norm": 0.17852741479873657, "learning_rate": 1.9806156064450855e-05, "loss": 0.5101, "step": 939 }, { "epoch": 0.2096810171759982, "grad_norm": 0.17126716673374176, "learning_rate": 1.9805694658676458e-05, "loss": 0.5415, "step": 940 }, { "epoch": 0.20990408208788758, "grad_norm": 0.16866669058799744, "learning_rate": 1.9805232709801008e-05, "loss": 0.5066, "step": 941 }, { "epoch": 0.21012714699977694, "grad_norm": 0.17179332673549652, "learning_rate": 1.9804770217850093e-05, "loss": 0.5059, "step": 942 }, { "epoch": 0.2103502119116663, "grad_norm": 0.1836715042591095, "learning_rate": 1.9804307182849326e-05, "loss": 0.5291, "step": 943 }, { "epoch": 0.21057327682355564, "grad_norm": 0.2429589480161667, "learning_rate": 1.980384360482436e-05, "loss": 0.5282, "step": 944 }, { "epoch": 0.210796341735445, "grad_norm": 0.17777171730995178, "learning_rate": 1.9803379483800866e-05, "loss": 0.5338, "step": 945 }, { "epoch": 0.21101940664733437, "grad_norm": 0.20419363677501678, "learning_rate": 1.9802914819804546e-05, "loss": 0.5345, "step": 946 }, { "epoch": 0.21124247155922374, "grad_norm": 0.23102112114429474, "learning_rate": 1.9802449612861144e-05, "loss": 0.5272, "step": 947 }, { "epoch": 0.2114655364711131, "grad_norm": 0.17385543882846832, "learning_rate": 1.9801983862996423e-05, "loss": 0.5206, "step": 948 }, { "epoch": 0.21168860138300247, "grad_norm": 0.20227134227752686, "learning_rate": 1.980151757023618e-05, "loss": 0.5362, "step": 949 }, { "epoch": 0.2119116662948918, "grad_norm": 0.3727608025074005, "learning_rate": 1.9801050734606236e-05, "loss": 0.511, "step": 950 }, { "epoch": 0.21213473120678117, "grad_norm": 0.17483538389205933, "learning_rate": 1.9800583356132453e-05, "loss": 0.5251, "step": 951 }, { "epoch": 0.21235779611867053, "grad_norm": 0.17400579154491425, "learning_rate": 1.9800115434840716e-05, "loss": 0.541, "step": 952 }, { "epoch": 0.2125808610305599, "grad_norm": 0.16707202792167664, "learning_rate": 1.979964697075694e-05, "loss": 0.5391, "step": 953 }, { "epoch": 0.21280392594244926, "grad_norm": 0.16636815667152405, "learning_rate": 1.9799177963907074e-05, "loss": 0.5434, "step": 954 }, { "epoch": 0.2130269908543386, "grad_norm": 0.17174044251441956, "learning_rate": 1.9798708414317095e-05, "loss": 0.5389, "step": 955 }, { "epoch": 0.21325005576622796, "grad_norm": 0.16985206305980682, "learning_rate": 1.9798238322013002e-05, "loss": 0.5327, "step": 956 }, { "epoch": 0.21347312067811733, "grad_norm": 0.17486560344696045, "learning_rate": 1.9797767687020843e-05, "loss": 0.5428, "step": 957 }, { "epoch": 0.2136961855900067, "grad_norm": 0.17041227221488953, "learning_rate": 1.9797296509366678e-05, "loss": 0.4995, "step": 958 }, { "epoch": 0.21391925050189606, "grad_norm": 0.1798069328069687, "learning_rate": 1.97968247890766e-05, "loss": 0.5364, "step": 959 }, { "epoch": 0.21414231541378542, "grad_norm": 0.17648300528526306, "learning_rate": 1.9796352526176746e-05, "loss": 0.5317, "step": 960 }, { "epoch": 0.21436538032567476, "grad_norm": 0.16647587716579437, "learning_rate": 1.9795879720693264e-05, "loss": 0.4989, "step": 961 }, { "epoch": 0.21458844523756412, "grad_norm": 0.17619404196739197, "learning_rate": 1.9795406372652345e-05, "loss": 0.5123, "step": 962 }, { "epoch": 0.2148115101494535, "grad_norm": 0.1789608895778656, "learning_rate": 1.979493248208021e-05, "loss": 0.5421, "step": 963 }, { "epoch": 0.21503457506134285, "grad_norm": 0.16893987357616425, "learning_rate": 1.97944580490031e-05, "loss": 0.5226, "step": 964 }, { "epoch": 0.21525763997323222, "grad_norm": 0.1653861552476883, "learning_rate": 1.9793983073447288e-05, "loss": 0.5221, "step": 965 }, { "epoch": 0.21548070488512158, "grad_norm": 0.17093954980373383, "learning_rate": 1.9793507555439092e-05, "loss": 0.535, "step": 966 }, { "epoch": 0.21570376979701092, "grad_norm": 0.18254542350769043, "learning_rate": 1.9793031495004845e-05, "loss": 0.5585, "step": 967 }, { "epoch": 0.21592683470890028, "grad_norm": 0.19674773514270782, "learning_rate": 1.9792554892170908e-05, "loss": 0.5159, "step": 968 }, { "epoch": 0.21614989962078965, "grad_norm": 0.17958855628967285, "learning_rate": 1.9792077746963686e-05, "loss": 0.5185, "step": 969 }, { "epoch": 0.216372964532679, "grad_norm": 0.1741204559803009, "learning_rate": 1.9791600059409606e-05, "loss": 0.5325, "step": 970 }, { "epoch": 0.21659602944456838, "grad_norm": 0.1755409985780716, "learning_rate": 1.9791121829535122e-05, "loss": 0.5005, "step": 971 }, { "epoch": 0.21681909435645774, "grad_norm": 0.17128707468509674, "learning_rate": 1.979064305736672e-05, "loss": 0.5345, "step": 972 }, { "epoch": 0.21704215926834708, "grad_norm": 0.17565475404262543, "learning_rate": 1.9790163742930922e-05, "loss": 0.4964, "step": 973 }, { "epoch": 0.21726522418023644, "grad_norm": 0.1766713559627533, "learning_rate": 1.978968388625427e-05, "loss": 0.5151, "step": 974 }, { "epoch": 0.2174882890921258, "grad_norm": 0.1699419617652893, "learning_rate": 1.9789203487363352e-05, "loss": 0.5365, "step": 975 }, { "epoch": 0.21771135400401517, "grad_norm": 0.20117329061031342, "learning_rate": 1.978872254628476e-05, "loss": 0.4942, "step": 976 }, { "epoch": 0.21793441891590454, "grad_norm": 0.17031805217266083, "learning_rate": 1.9788241063045147e-05, "loss": 0.5262, "step": 977 }, { "epoch": 0.21815748382779387, "grad_norm": 0.17150211334228516, "learning_rate": 1.9787759037671172e-05, "loss": 0.5169, "step": 978 }, { "epoch": 0.21838054873968324, "grad_norm": 0.17386960983276367, "learning_rate": 1.978727647018953e-05, "loss": 0.5464, "step": 979 }, { "epoch": 0.2186036136515726, "grad_norm": 0.17337769269943237, "learning_rate": 1.9786793360626956e-05, "loss": 0.5217, "step": 980 }, { "epoch": 0.21882667856346197, "grad_norm": 0.17415766417980194, "learning_rate": 1.9786309709010204e-05, "loss": 0.5222, "step": 981 }, { "epoch": 0.21904974347535133, "grad_norm": 0.17464500665664673, "learning_rate": 1.978582551536606e-05, "loss": 0.504, "step": 982 }, { "epoch": 0.2192728083872407, "grad_norm": 0.1703118085861206, "learning_rate": 1.9785340779721348e-05, "loss": 0.5419, "step": 983 }, { "epoch": 0.21949587329913003, "grad_norm": 0.1777229905128479, "learning_rate": 1.9784855502102908e-05, "loss": 0.5396, "step": 984 }, { "epoch": 0.2197189382110194, "grad_norm": 0.1821225881576538, "learning_rate": 1.978436968253762e-05, "loss": 0.5214, "step": 985 }, { "epoch": 0.21994200312290876, "grad_norm": 0.17865097522735596, "learning_rate": 1.9783883321052394e-05, "loss": 0.5354, "step": 986 }, { "epoch": 0.22016506803479813, "grad_norm": 0.17980031669139862, "learning_rate": 1.978339641767417e-05, "loss": 0.5461, "step": 987 }, { "epoch": 0.2203881329466875, "grad_norm": 0.18906496465206146, "learning_rate": 1.9782908972429906e-05, "loss": 0.5466, "step": 988 }, { "epoch": 0.22061119785857686, "grad_norm": 0.20680966973304749, "learning_rate": 1.978242098534661e-05, "loss": 0.4944, "step": 989 }, { "epoch": 0.2208342627704662, "grad_norm": 0.18064817786216736, "learning_rate": 1.978193245645131e-05, "loss": 0.5418, "step": 990 }, { "epoch": 0.22105732768235556, "grad_norm": 0.18148301541805267, "learning_rate": 1.978144338577105e-05, "loss": 0.5348, "step": 991 }, { "epoch": 0.22128039259424492, "grad_norm": 0.18182723224163055, "learning_rate": 1.9780953773332933e-05, "loss": 0.5159, "step": 992 }, { "epoch": 0.2215034575061343, "grad_norm": 0.17886720597743988, "learning_rate": 1.9780463619164073e-05, "loss": 0.5211, "step": 993 }, { "epoch": 0.22172652241802365, "grad_norm": 0.1731138974428177, "learning_rate": 1.9779972923291615e-05, "loss": 0.5386, "step": 994 }, { "epoch": 0.22194958732991302, "grad_norm": 0.17276941239833832, "learning_rate": 1.977948168574274e-05, "loss": 0.553, "step": 995 }, { "epoch": 0.22217265224180235, "grad_norm": 0.1833122968673706, "learning_rate": 1.977898990654465e-05, "loss": 0.5082, "step": 996 }, { "epoch": 0.22239571715369172, "grad_norm": 0.17474150657653809, "learning_rate": 1.9778497585724586e-05, "loss": 0.5167, "step": 997 }, { "epoch": 0.22261878206558108, "grad_norm": 0.18032675981521606, "learning_rate": 1.977800472330982e-05, "loss": 0.5351, "step": 998 }, { "epoch": 0.22284184697747045, "grad_norm": 0.19136174023151398, "learning_rate": 1.9777511319327645e-05, "loss": 0.5387, "step": 999 }, { "epoch": 0.2230649118893598, "grad_norm": 0.17949000000953674, "learning_rate": 1.977701737380539e-05, "loss": 0.5132, "step": 1000 }, { "epoch": 0.22328797680124918, "grad_norm": 0.16305360198020935, "learning_rate": 1.9776522886770413e-05, "loss": 0.4798, "step": 1001 }, { "epoch": 0.22351104171313851, "grad_norm": 0.18127582967281342, "learning_rate": 1.9776027858250102e-05, "loss": 0.5314, "step": 1002 }, { "epoch": 0.22373410662502788, "grad_norm": 0.17198516428470612, "learning_rate": 1.9775532288271876e-05, "loss": 0.5356, "step": 1003 }, { "epoch": 0.22395717153691724, "grad_norm": 0.17598840594291687, "learning_rate": 1.9775036176863178e-05, "loss": 0.5243, "step": 1004 }, { "epoch": 0.2241802364488066, "grad_norm": 0.17470763623714447, "learning_rate": 1.977453952405149e-05, "loss": 0.5138, "step": 1005 }, { "epoch": 0.22440330136069597, "grad_norm": 0.17710572481155396, "learning_rate": 1.977404232986432e-05, "loss": 0.5103, "step": 1006 }, { "epoch": 0.2246263662725853, "grad_norm": 0.16758602857589722, "learning_rate": 1.9773544594329202e-05, "loss": 0.5685, "step": 1007 }, { "epoch": 0.22484943118447467, "grad_norm": 0.17261147499084473, "learning_rate": 1.977304631747371e-05, "loss": 0.5149, "step": 1008 }, { "epoch": 0.22507249609636404, "grad_norm": 0.1722407191991806, "learning_rate": 1.9772547499325437e-05, "loss": 0.5277, "step": 1009 }, { "epoch": 0.2252955610082534, "grad_norm": 0.17808939516544342, "learning_rate": 1.9772048139912012e-05, "loss": 0.5392, "step": 1010 }, { "epoch": 0.22551862592014277, "grad_norm": 0.17333589494228363, "learning_rate": 1.9771548239261088e-05, "loss": 0.5347, "step": 1011 }, { "epoch": 0.22574169083203213, "grad_norm": 0.1775379180908203, "learning_rate": 1.9771047797400363e-05, "loss": 0.5243, "step": 1012 }, { "epoch": 0.22596475574392147, "grad_norm": 0.19107598066329956, "learning_rate": 1.9770546814357546e-05, "loss": 0.5464, "step": 1013 }, { "epoch": 0.22618782065581083, "grad_norm": 0.1707477569580078, "learning_rate": 1.9770045290160388e-05, "loss": 0.5337, "step": 1014 }, { "epoch": 0.2264108855677002, "grad_norm": 0.19580954313278198, "learning_rate": 1.9769543224836668e-05, "loss": 0.5138, "step": 1015 }, { "epoch": 0.22663395047958956, "grad_norm": 0.17244374752044678, "learning_rate": 1.9769040618414187e-05, "loss": 0.5232, "step": 1016 }, { "epoch": 0.22685701539147893, "grad_norm": 0.16292127966880798, "learning_rate": 1.9768537470920788e-05, "loss": 0.4989, "step": 1017 }, { "epoch": 0.2270800803033683, "grad_norm": 0.18515698611736298, "learning_rate": 1.9768033782384338e-05, "loss": 0.5316, "step": 1018 }, { "epoch": 0.22730314521525763, "grad_norm": 0.16262215375900269, "learning_rate": 1.9767529552832732e-05, "loss": 0.4855, "step": 1019 }, { "epoch": 0.227526210127147, "grad_norm": 0.17309491336345673, "learning_rate": 1.9767024782293902e-05, "loss": 0.5041, "step": 1020 }, { "epoch": 0.22774927503903636, "grad_norm": 0.1719563603401184, "learning_rate": 1.9766519470795803e-05, "loss": 0.5412, "step": 1021 }, { "epoch": 0.22797233995092572, "grad_norm": 0.2674558460712433, "learning_rate": 1.9766013618366417e-05, "loss": 0.5274, "step": 1022 }, { "epoch": 0.2281954048628151, "grad_norm": 0.16945339739322662, "learning_rate": 1.9765507225033772e-05, "loss": 0.5034, "step": 1023 }, { "epoch": 0.22841846977470445, "grad_norm": 0.19735541939735413, "learning_rate": 1.9765000290825908e-05, "loss": 0.5059, "step": 1024 }, { "epoch": 0.2286415346865938, "grad_norm": 0.17197129130363464, "learning_rate": 1.97644928157709e-05, "loss": 0.5133, "step": 1025 }, { "epoch": 0.22886459959848315, "grad_norm": 0.18425902724266052, "learning_rate": 1.976398479989686e-05, "loss": 0.5062, "step": 1026 }, { "epoch": 0.22908766451037252, "grad_norm": 0.4941766858100891, "learning_rate": 1.9763476243231924e-05, "loss": 0.535, "step": 1027 }, { "epoch": 0.22931072942226188, "grad_norm": 0.16707998514175415, "learning_rate": 1.976296714580426e-05, "loss": 0.5177, "step": 1028 }, { "epoch": 0.22953379433415125, "grad_norm": 0.173675537109375, "learning_rate": 1.9762457507642066e-05, "loss": 0.5234, "step": 1029 }, { "epoch": 0.22975685924604058, "grad_norm": 0.18508677184581757, "learning_rate": 1.9761947328773565e-05, "loss": 0.5073, "step": 1030 }, { "epoch": 0.22997992415792995, "grad_norm": 0.2566058039665222, "learning_rate": 1.9761436609227016e-05, "loss": 0.5176, "step": 1031 }, { "epoch": 0.2302029890698193, "grad_norm": 0.1702575534582138, "learning_rate": 1.9760925349030704e-05, "loss": 0.5106, "step": 1032 }, { "epoch": 0.23042605398170868, "grad_norm": 0.18858478963375092, "learning_rate": 1.976041354821295e-05, "loss": 0.5052, "step": 1033 }, { "epoch": 0.23064911889359804, "grad_norm": 0.191171795129776, "learning_rate": 1.9759901206802098e-05, "loss": 0.5643, "step": 1034 }, { "epoch": 0.2308721838054874, "grad_norm": 0.17536719143390656, "learning_rate": 1.9759388324826523e-05, "loss": 0.5344, "step": 1035 }, { "epoch": 0.23109524871737674, "grad_norm": 0.21609671413898468, "learning_rate": 1.9758874902314634e-05, "loss": 0.5197, "step": 1036 }, { "epoch": 0.2313183136292661, "grad_norm": 0.16850803792476654, "learning_rate": 1.9758360939294867e-05, "loss": 0.4967, "step": 1037 }, { "epoch": 0.23154137854115547, "grad_norm": 0.19448642432689667, "learning_rate": 1.9757846435795688e-05, "loss": 0.5171, "step": 1038 }, { "epoch": 0.23176444345304484, "grad_norm": 0.19304804503917694, "learning_rate": 1.9757331391845596e-05, "loss": 0.5491, "step": 1039 }, { "epoch": 0.2319875083649342, "grad_norm": 0.17683899402618408, "learning_rate": 1.975681580747312e-05, "loss": 0.535, "step": 1040 }, { "epoch": 0.23221057327682357, "grad_norm": 0.17681746184825897, "learning_rate": 1.9756299682706804e-05, "loss": 0.5053, "step": 1041 }, { "epoch": 0.2324336381887129, "grad_norm": 0.17382597923278809, "learning_rate": 1.9755783017575244e-05, "loss": 0.5063, "step": 1042 }, { "epoch": 0.23265670310060227, "grad_norm": 0.17841431498527527, "learning_rate": 1.9755265812107053e-05, "loss": 0.4845, "step": 1043 }, { "epoch": 0.23287976801249163, "grad_norm": 0.18297810852527618, "learning_rate": 1.9754748066330883e-05, "loss": 0.5418, "step": 1044 }, { "epoch": 0.233102832924381, "grad_norm": 0.19104409217834473, "learning_rate": 1.97542297802754e-05, "loss": 0.5417, "step": 1045 }, { "epoch": 0.23332589783627036, "grad_norm": 0.1772020310163498, "learning_rate": 1.975371095396932e-05, "loss": 0.5296, "step": 1046 }, { "epoch": 0.23354896274815973, "grad_norm": 0.18861500918865204, "learning_rate": 1.9753191587441372e-05, "loss": 0.5667, "step": 1047 }, { "epoch": 0.23377202766004906, "grad_norm": 0.17586645483970642, "learning_rate": 1.9752671680720324e-05, "loss": 0.5001, "step": 1048 }, { "epoch": 0.23399509257193843, "grad_norm": 0.7259854674339294, "learning_rate": 1.975215123383497e-05, "loss": 0.5456, "step": 1049 }, { "epoch": 0.2342181574838278, "grad_norm": 0.18736739456653595, "learning_rate": 1.9751630246814136e-05, "loss": 0.5231, "step": 1050 }, { "epoch": 0.23444122239571716, "grad_norm": 0.2538264989852905, "learning_rate": 1.9751108719686683e-05, "loss": 0.5387, "step": 1051 }, { "epoch": 0.23466428730760652, "grad_norm": 0.17714229226112366, "learning_rate": 1.9750586652481492e-05, "loss": 0.5076, "step": 1052 }, { "epoch": 0.23488735221949586, "grad_norm": 0.3667065501213074, "learning_rate": 1.9750064045227474e-05, "loss": 0.5432, "step": 1053 }, { "epoch": 0.23511041713138522, "grad_norm": 0.18610940873622894, "learning_rate": 1.9749540897953584e-05, "loss": 0.5307, "step": 1054 }, { "epoch": 0.2353334820432746, "grad_norm": 0.18711121380329132, "learning_rate": 1.974901721068879e-05, "loss": 0.5256, "step": 1055 }, { "epoch": 0.23555654695516395, "grad_norm": 0.17803776264190674, "learning_rate": 1.97484929834621e-05, "loss": 0.5076, "step": 1056 }, { "epoch": 0.23577961186705332, "grad_norm": 0.19948653876781464, "learning_rate": 1.9747968216302545e-05, "loss": 0.5185, "step": 1057 }, { "epoch": 0.23600267677894268, "grad_norm": 0.3181793987751007, "learning_rate": 1.9747442909239198e-05, "loss": 0.4874, "step": 1058 }, { "epoch": 0.23622574169083202, "grad_norm": 0.1865629106760025, "learning_rate": 1.9746917062301146e-05, "loss": 0.52, "step": 1059 }, { "epoch": 0.23644880660272138, "grad_norm": 0.23274224996566772, "learning_rate": 1.9746390675517514e-05, "loss": 0.496, "step": 1060 }, { "epoch": 0.23667187151461075, "grad_norm": 0.2044648379087448, "learning_rate": 1.974586374891746e-05, "loss": 0.5173, "step": 1061 }, { "epoch": 0.2368949364265001, "grad_norm": 0.18317201733589172, "learning_rate": 1.974533628253017e-05, "loss": 0.5282, "step": 1062 }, { "epoch": 0.23711800133838948, "grad_norm": 0.18429698050022125, "learning_rate": 1.9744808276384858e-05, "loss": 0.5395, "step": 1063 }, { "epoch": 0.23734106625027884, "grad_norm": 0.20926502346992493, "learning_rate": 1.9744279730510764e-05, "loss": 0.5111, "step": 1064 }, { "epoch": 0.23756413116216818, "grad_norm": 0.17823931574821472, "learning_rate": 1.974375064493716e-05, "loss": 0.5258, "step": 1065 }, { "epoch": 0.23778719607405754, "grad_norm": 0.19109368324279785, "learning_rate": 1.9743221019693362e-05, "loss": 0.5512, "step": 1066 }, { "epoch": 0.2380102609859469, "grad_norm": 0.21680758893489838, "learning_rate": 1.9742690854808692e-05, "loss": 0.4951, "step": 1067 }, { "epoch": 0.23823332589783627, "grad_norm": 0.1768484115600586, "learning_rate": 1.974216015031252e-05, "loss": 0.534, "step": 1068 }, { "epoch": 0.23845639080972564, "grad_norm": 0.18384598195552826, "learning_rate": 1.974162890623424e-05, "loss": 0.502, "step": 1069 }, { "epoch": 0.238679455721615, "grad_norm": 0.17585726082324982, "learning_rate": 1.974109712260327e-05, "loss": 0.5177, "step": 1070 }, { "epoch": 0.23890252063350434, "grad_norm": 0.17469032108783722, "learning_rate": 1.9740564799449073e-05, "loss": 0.553, "step": 1071 }, { "epoch": 0.2391255855453937, "grad_norm": 0.21859519183635712, "learning_rate": 1.9740031936801122e-05, "loss": 0.5204, "step": 1072 }, { "epoch": 0.23934865045728307, "grad_norm": 0.36654403805732727, "learning_rate": 1.9739498534688936e-05, "loss": 0.5375, "step": 1073 }, { "epoch": 0.23957171536917243, "grad_norm": 0.17866800725460052, "learning_rate": 1.973896459314206e-05, "loss": 0.5477, "step": 1074 }, { "epoch": 0.2397947802810618, "grad_norm": 0.1822461485862732, "learning_rate": 1.973843011219006e-05, "loss": 0.5197, "step": 1075 }, { "epoch": 0.24001784519295116, "grad_norm": 0.17180079221725464, "learning_rate": 1.9737895091862545e-05, "loss": 0.5269, "step": 1076 }, { "epoch": 0.2402409101048405, "grad_norm": 0.17935633659362793, "learning_rate": 1.9737359532189147e-05, "loss": 0.5279, "step": 1077 }, { "epoch": 0.24046397501672986, "grad_norm": 0.22954648733139038, "learning_rate": 1.9736823433199524e-05, "loss": 0.5199, "step": 1078 }, { "epoch": 0.24068703992861923, "grad_norm": 0.17607101798057556, "learning_rate": 1.973628679492338e-05, "loss": 0.5121, "step": 1079 }, { "epoch": 0.2409101048405086, "grad_norm": 0.17458245158195496, "learning_rate": 1.9735749617390422e-05, "loss": 0.5363, "step": 1080 }, { "epoch": 0.24113316975239796, "grad_norm": 0.17682591080665588, "learning_rate": 1.9735211900630414e-05, "loss": 0.5254, "step": 1081 }, { "epoch": 0.2413562346642873, "grad_norm": 0.17350362241268158, "learning_rate": 1.9734673644673133e-05, "loss": 0.5381, "step": 1082 }, { "epoch": 0.24157929957617666, "grad_norm": 0.17231930792331696, "learning_rate": 1.973413484954839e-05, "loss": 0.5118, "step": 1083 }, { "epoch": 0.24180236448806602, "grad_norm": 0.21497975289821625, "learning_rate": 1.9733595515286032e-05, "loss": 0.5353, "step": 1084 }, { "epoch": 0.2420254293999554, "grad_norm": 0.1669948548078537, "learning_rate": 1.9733055641915926e-05, "loss": 0.5216, "step": 1085 }, { "epoch": 0.24224849431184475, "grad_norm": 0.18229244649410248, "learning_rate": 1.9732515229467973e-05, "loss": 0.5379, "step": 1086 }, { "epoch": 0.24247155922373412, "grad_norm": 0.17432808876037598, "learning_rate": 1.973197427797211e-05, "loss": 0.5461, "step": 1087 }, { "epoch": 0.24269462413562345, "grad_norm": 0.18571536242961884, "learning_rate": 1.9731432787458294e-05, "loss": 0.5469, "step": 1088 }, { "epoch": 0.24291768904751282, "grad_norm": 0.18052545189857483, "learning_rate": 1.9730890757956517e-05, "loss": 0.5416, "step": 1089 }, { "epoch": 0.24314075395940218, "grad_norm": 0.23118536174297333, "learning_rate": 1.97303481894968e-05, "loss": 0.5063, "step": 1090 }, { "epoch": 0.24336381887129155, "grad_norm": 0.16191843152046204, "learning_rate": 1.9729805082109194e-05, "loss": 0.5178, "step": 1091 }, { "epoch": 0.2435868837831809, "grad_norm": 0.17042016983032227, "learning_rate": 1.9729261435823782e-05, "loss": 0.5024, "step": 1092 }, { "epoch": 0.24380994869507028, "grad_norm": 0.21003969013690948, "learning_rate": 1.972871725067067e-05, "loss": 0.5106, "step": 1093 }, { "epoch": 0.24403301360695961, "grad_norm": 0.22672854363918304, "learning_rate": 1.972817252668e-05, "loss": 0.5309, "step": 1094 }, { "epoch": 0.24425607851884898, "grad_norm": 0.16994351148605347, "learning_rate": 1.9727627263881942e-05, "loss": 0.5023, "step": 1095 }, { "epoch": 0.24447914343073834, "grad_norm": 0.17019368708133698, "learning_rate": 1.9727081462306697e-05, "loss": 0.5316, "step": 1096 }, { "epoch": 0.2447022083426277, "grad_norm": 0.17145198583602905, "learning_rate": 1.97265351219845e-05, "loss": 0.5148, "step": 1097 }, { "epoch": 0.24492527325451707, "grad_norm": 0.20565351843833923, "learning_rate": 1.9725988242945598e-05, "loss": 0.5445, "step": 1098 }, { "epoch": 0.24514833816640644, "grad_norm": 0.1741577833890915, "learning_rate": 1.9725440825220296e-05, "loss": 0.4958, "step": 1099 }, { "epoch": 0.24537140307829577, "grad_norm": 0.18705230951309204, "learning_rate": 1.9724892868838902e-05, "loss": 0.5105, "step": 1100 }, { "epoch": 0.24559446799018514, "grad_norm": 0.5166819095611572, "learning_rate": 1.9724344373831768e-05, "loss": 0.5414, "step": 1101 }, { "epoch": 0.2458175329020745, "grad_norm": 0.17351533472537994, "learning_rate": 1.9723795340229274e-05, "loss": 0.5024, "step": 1102 }, { "epoch": 0.24604059781396387, "grad_norm": 0.1658518761396408, "learning_rate": 1.972324576806183e-05, "loss": 0.4977, "step": 1103 }, { "epoch": 0.24626366272585323, "grad_norm": 0.16890370845794678, "learning_rate": 1.972269565735987e-05, "loss": 0.5359, "step": 1104 }, { "epoch": 0.24648672763774257, "grad_norm": 0.18016333878040314, "learning_rate": 1.9722145008153873e-05, "loss": 0.5394, "step": 1105 }, { "epoch": 0.24670979254963193, "grad_norm": 0.16652169823646545, "learning_rate": 1.9721593820474326e-05, "loss": 0.5089, "step": 1106 }, { "epoch": 0.2469328574615213, "grad_norm": 0.1999325454235077, "learning_rate": 1.9721042094351764e-05, "loss": 0.5541, "step": 1107 }, { "epoch": 0.24715592237341066, "grad_norm": 0.16372385621070862, "learning_rate": 1.972048982981674e-05, "loss": 0.4942, "step": 1108 }, { "epoch": 0.24737898728530003, "grad_norm": 0.20245228707790375, "learning_rate": 1.971993702689985e-05, "loss": 0.5034, "step": 1109 }, { "epoch": 0.2476020521971894, "grad_norm": 0.17487917840480804, "learning_rate": 1.97193836856317e-05, "loss": 0.5231, "step": 1110 }, { "epoch": 0.24782511710907873, "grad_norm": 0.1696334332227707, "learning_rate": 1.971882980604295e-05, "loss": 0.5141, "step": 1111 }, { "epoch": 0.2480481820209681, "grad_norm": 0.17021594941616058, "learning_rate": 1.971827538816427e-05, "loss": 0.5081, "step": 1112 }, { "epoch": 0.24827124693285746, "grad_norm": 0.18088696897029877, "learning_rate": 1.9717720432026367e-05, "loss": 0.5743, "step": 1113 }, { "epoch": 0.24849431184474682, "grad_norm": 0.17647258937358856, "learning_rate": 1.9717164937659984e-05, "loss": 0.5289, "step": 1114 }, { "epoch": 0.2487173767566362, "grad_norm": 0.17265263199806213, "learning_rate": 1.971660890509588e-05, "loss": 0.5296, "step": 1115 }, { "epoch": 0.24894044166852555, "grad_norm": 0.16816379129886627, "learning_rate": 1.971605233436485e-05, "loss": 0.5263, "step": 1116 }, { "epoch": 0.2491635065804149, "grad_norm": 0.17548470199108124, "learning_rate": 1.9715495225497736e-05, "loss": 0.5315, "step": 1117 }, { "epoch": 0.24938657149230425, "grad_norm": 0.1775428056716919, "learning_rate": 1.9714937578525374e-05, "loss": 0.5227, "step": 1118 }, { "epoch": 0.24960963640419362, "grad_norm": 0.17629674077033997, "learning_rate": 1.971437939347866e-05, "loss": 0.5475, "step": 1119 }, { "epoch": 0.24983270131608298, "grad_norm": 0.20265498757362366, "learning_rate": 1.9713820670388518e-05, "loss": 0.5415, "step": 1120 }, { "epoch": 0.2500557662279723, "grad_norm": 0.17958010733127594, "learning_rate": 1.9713261409285876e-05, "loss": 0.5491, "step": 1121 }, { "epoch": 0.2502788311398617, "grad_norm": 0.1667289137840271, "learning_rate": 1.9712701610201723e-05, "loss": 0.5319, "step": 1122 }, { "epoch": 0.25050189605175105, "grad_norm": 0.17537854611873627, "learning_rate": 1.9712141273167058e-05, "loss": 0.5033, "step": 1123 }, { "epoch": 0.25072496096364044, "grad_norm": 0.1665065735578537, "learning_rate": 1.9711580398212918e-05, "loss": 0.5043, "step": 1124 }, { "epoch": 0.2509480258755298, "grad_norm": 0.17893299460411072, "learning_rate": 1.9711018985370366e-05, "loss": 0.5424, "step": 1125 }, { "epoch": 0.2511710907874191, "grad_norm": 0.17293697595596313, "learning_rate": 1.97104570346705e-05, "loss": 0.546, "step": 1126 }, { "epoch": 0.2513941556993085, "grad_norm": 0.16615547239780426, "learning_rate": 1.970989454614444e-05, "loss": 0.5163, "step": 1127 }, { "epoch": 0.25161722061119784, "grad_norm": 0.16944670677185059, "learning_rate": 1.9709331519823343e-05, "loss": 0.5471, "step": 1128 }, { "epoch": 0.25184028552308724, "grad_norm": 0.179295152425766, "learning_rate": 1.9708767955738394e-05, "loss": 0.5381, "step": 1129 }, { "epoch": 0.2520633504349766, "grad_norm": 0.1665600687265396, "learning_rate": 1.9708203853920803e-05, "loss": 0.4863, "step": 1130 }, { "epoch": 0.2522864153468659, "grad_norm": 0.1918114423751831, "learning_rate": 1.970763921440182e-05, "loss": 0.4875, "step": 1131 }, { "epoch": 0.2525094802587553, "grad_norm": 0.16761860251426697, "learning_rate": 1.9707074037212707e-05, "loss": 0.5314, "step": 1132 }, { "epoch": 0.25273254517064464, "grad_norm": 0.19442524015903473, "learning_rate": 1.970650832238478e-05, "loss": 0.5186, "step": 1133 }, { "epoch": 0.25295561008253403, "grad_norm": 0.17174071073532104, "learning_rate": 1.9705942069949362e-05, "loss": 0.5327, "step": 1134 }, { "epoch": 0.25317867499442337, "grad_norm": 0.18218085169792175, "learning_rate": 1.970537527993782e-05, "loss": 0.5665, "step": 1135 }, { "epoch": 0.25340173990631276, "grad_norm": 0.17595386505126953, "learning_rate": 1.9704807952381542e-05, "loss": 0.5581, "step": 1136 }, { "epoch": 0.2536248048182021, "grad_norm": 0.20507752895355225, "learning_rate": 1.9704240087311963e-05, "loss": 0.541, "step": 1137 }, { "epoch": 0.25384786973009144, "grad_norm": 0.1604541391134262, "learning_rate": 1.970367168476052e-05, "loss": 0.5223, "step": 1138 }, { "epoch": 0.25407093464198083, "grad_norm": 0.16794823110103607, "learning_rate": 1.9703102744758703e-05, "loss": 0.5444, "step": 1139 }, { "epoch": 0.25429399955387016, "grad_norm": 0.16887861490249634, "learning_rate": 1.9702533267338015e-05, "loss": 0.5237, "step": 1140 }, { "epoch": 0.25451706446575956, "grad_norm": 0.16295503079891205, "learning_rate": 1.970196325253001e-05, "loss": 0.5293, "step": 1141 }, { "epoch": 0.2547401293776489, "grad_norm": 0.16665813326835632, "learning_rate": 1.9701392700366247e-05, "loss": 0.5104, "step": 1142 }, { "epoch": 0.25496319428953823, "grad_norm": 0.16648006439208984, "learning_rate": 1.970082161087834e-05, "loss": 0.5001, "step": 1143 }, { "epoch": 0.2551862592014276, "grad_norm": 0.16400828957557678, "learning_rate": 1.9700249984097907e-05, "loss": 0.5106, "step": 1144 }, { "epoch": 0.25540932411331696, "grad_norm": 0.16578614711761475, "learning_rate": 1.969967782005661e-05, "loss": 0.5134, "step": 1145 }, { "epoch": 0.25563238902520635, "grad_norm": 0.16847828030586243, "learning_rate": 1.9699105118786145e-05, "loss": 0.4994, "step": 1146 }, { "epoch": 0.2558554539370957, "grad_norm": 0.18443261086940765, "learning_rate": 1.9698531880318228e-05, "loss": 0.5136, "step": 1147 }, { "epoch": 0.2560785188489851, "grad_norm": 0.17291708290576935, "learning_rate": 1.969795810468461e-05, "loss": 0.5243, "step": 1148 }, { "epoch": 0.2563015837608744, "grad_norm": 0.18102054297924042, "learning_rate": 1.9697383791917068e-05, "loss": 0.4966, "step": 1149 }, { "epoch": 0.25652464867276376, "grad_norm": 0.17706115543842316, "learning_rate": 1.9696808942047414e-05, "loss": 0.5332, "step": 1150 }, { "epoch": 0.25674771358465315, "grad_norm": 0.1936006247997284, "learning_rate": 1.9696233555107484e-05, "loss": 0.5325, "step": 1151 }, { "epoch": 0.2569707784965425, "grad_norm": 0.19508236646652222, "learning_rate": 1.969565763112915e-05, "loss": 0.509, "step": 1152 }, { "epoch": 0.2571938434084319, "grad_norm": 0.16369056701660156, "learning_rate": 1.9695081170144306e-05, "loss": 0.5188, "step": 1153 }, { "epoch": 0.2574169083203212, "grad_norm": 0.16955684125423431, "learning_rate": 1.9694504172184885e-05, "loss": 0.5316, "step": 1154 }, { "epoch": 0.25763997323221055, "grad_norm": 0.16252164542675018, "learning_rate": 1.969392663728284e-05, "loss": 0.5291, "step": 1155 }, { "epoch": 0.25786303814409994, "grad_norm": 0.17399340867996216, "learning_rate": 1.969334856547016e-05, "loss": 0.5153, "step": 1156 }, { "epoch": 0.2580861030559893, "grad_norm": 0.17638690769672394, "learning_rate": 1.9692769956778867e-05, "loss": 0.5128, "step": 1157 }, { "epoch": 0.2583091679678787, "grad_norm": 0.1711435467004776, "learning_rate": 1.9692190811241e-05, "loss": 0.5205, "step": 1158 }, { "epoch": 0.258532232879768, "grad_norm": 0.18419131636619568, "learning_rate": 1.9691611128888643e-05, "loss": 0.5312, "step": 1159 }, { "epoch": 0.25875529779165735, "grad_norm": 0.1925041675567627, "learning_rate": 1.9691030909753894e-05, "loss": 0.5479, "step": 1160 }, { "epoch": 0.25897836270354674, "grad_norm": 0.1986900418996811, "learning_rate": 1.9690450153868895e-05, "loss": 0.5095, "step": 1161 }, { "epoch": 0.2592014276154361, "grad_norm": 0.17978021502494812, "learning_rate": 1.9689868861265816e-05, "loss": 0.5015, "step": 1162 }, { "epoch": 0.25942449252732547, "grad_norm": 0.16919931769371033, "learning_rate": 1.9689287031976845e-05, "loss": 0.5227, "step": 1163 }, { "epoch": 0.2596475574392148, "grad_norm": 0.17485311627388, "learning_rate": 1.9688704666034208e-05, "loss": 0.5386, "step": 1164 }, { "epoch": 0.2598706223511042, "grad_norm": 0.17683085799217224, "learning_rate": 1.9688121763470165e-05, "loss": 0.5201, "step": 1165 }, { "epoch": 0.26009368726299353, "grad_norm": 0.18902894854545593, "learning_rate": 1.9687538324316997e-05, "loss": 0.51, "step": 1166 }, { "epoch": 0.26031675217488287, "grad_norm": 0.17308862507343292, "learning_rate": 1.968695434860702e-05, "loss": 0.5112, "step": 1167 }, { "epoch": 0.26053981708677226, "grad_norm": 0.17456986010074615, "learning_rate": 1.9686369836372577e-05, "loss": 0.5182, "step": 1168 }, { "epoch": 0.2607628819986616, "grad_norm": 0.17276804149150848, "learning_rate": 1.9685784787646044e-05, "loss": 0.5389, "step": 1169 }, { "epoch": 0.260985946910551, "grad_norm": 0.1710672229528427, "learning_rate": 1.9685199202459824e-05, "loss": 0.4948, "step": 1170 }, { "epoch": 0.26120901182244033, "grad_norm": 0.18158847093582153, "learning_rate": 1.9684613080846347e-05, "loss": 0.5332, "step": 1171 }, { "epoch": 0.26143207673432967, "grad_norm": 0.17782646417617798, "learning_rate": 1.968402642283808e-05, "loss": 0.5519, "step": 1172 }, { "epoch": 0.26165514164621906, "grad_norm": 0.1716913878917694, "learning_rate": 1.9683439228467515e-05, "loss": 0.5074, "step": 1173 }, { "epoch": 0.2618782065581084, "grad_norm": 0.1818932741880417, "learning_rate": 1.9682851497767175e-05, "loss": 0.5401, "step": 1174 }, { "epoch": 0.2621012714699978, "grad_norm": 0.1722068190574646, "learning_rate": 1.9682263230769612e-05, "loss": 0.52, "step": 1175 }, { "epoch": 0.2623243363818871, "grad_norm": 0.17454420030117035, "learning_rate": 1.9681674427507405e-05, "loss": 0.5063, "step": 1176 }, { "epoch": 0.2625474012937765, "grad_norm": 0.1726730465888977, "learning_rate": 1.9681085088013174e-05, "loss": 0.5343, "step": 1177 }, { "epoch": 0.26277046620566585, "grad_norm": 0.1802458018064499, "learning_rate": 1.9680495212319547e-05, "loss": 0.5218, "step": 1178 }, { "epoch": 0.2629935311175552, "grad_norm": 0.1708204448223114, "learning_rate": 1.9679904800459205e-05, "loss": 0.4943, "step": 1179 }, { "epoch": 0.2632165960294446, "grad_norm": 0.1644791066646576, "learning_rate": 1.9679313852464846e-05, "loss": 0.534, "step": 1180 }, { "epoch": 0.2634396609413339, "grad_norm": 0.16891272366046906, "learning_rate": 1.9678722368369203e-05, "loss": 0.519, "step": 1181 }, { "epoch": 0.2636627258532233, "grad_norm": 0.1720377802848816, "learning_rate": 1.9678130348205032e-05, "loss": 0.5362, "step": 1182 }, { "epoch": 0.26388579076511265, "grad_norm": 0.17827892303466797, "learning_rate": 1.9677537792005124e-05, "loss": 0.5387, "step": 1183 }, { "epoch": 0.264108855677002, "grad_norm": 0.17692813277244568, "learning_rate": 1.96769446998023e-05, "loss": 0.5051, "step": 1184 }, { "epoch": 0.2643319205888914, "grad_norm": 0.17577511072158813, "learning_rate": 1.9676351071629405e-05, "loss": 0.5162, "step": 1185 }, { "epoch": 0.2645549855007807, "grad_norm": 0.1805581897497177, "learning_rate": 1.9675756907519325e-05, "loss": 0.5528, "step": 1186 }, { "epoch": 0.2647780504126701, "grad_norm": 0.17259716987609863, "learning_rate": 1.967516220750496e-05, "loss": 0.5381, "step": 1187 }, { "epoch": 0.26500111532455944, "grad_norm": 0.16992929577827454, "learning_rate": 1.9674566971619256e-05, "loss": 0.51, "step": 1188 }, { "epoch": 0.2652241802364488, "grad_norm": 0.16945737600326538, "learning_rate": 1.9673971199895177e-05, "loss": 0.4987, "step": 1189 }, { "epoch": 0.2654472451483382, "grad_norm": 0.16880926489830017, "learning_rate": 1.967337489236572e-05, "loss": 0.5188, "step": 1190 }, { "epoch": 0.2656703100602275, "grad_norm": 0.17628756165504456, "learning_rate": 1.9672778049063915e-05, "loss": 0.5346, "step": 1191 }, { "epoch": 0.2658933749721169, "grad_norm": 0.17406663298606873, "learning_rate": 1.967218067002282e-05, "loss": 0.5493, "step": 1192 }, { "epoch": 0.26611643988400624, "grad_norm": 0.17142999172210693, "learning_rate": 1.9671582755275515e-05, "loss": 0.5256, "step": 1193 }, { "epoch": 0.26633950479589563, "grad_norm": 0.17752200365066528, "learning_rate": 1.9670984304855125e-05, "loss": 0.5237, "step": 1194 }, { "epoch": 0.26656256970778497, "grad_norm": 0.17670097947120667, "learning_rate": 1.9670385318794785e-05, "loss": 0.5226, "step": 1195 }, { "epoch": 0.2667856346196743, "grad_norm": 0.1656324565410614, "learning_rate": 1.966978579712768e-05, "loss": 0.494, "step": 1196 }, { "epoch": 0.2670086995315637, "grad_norm": 0.17369796335697174, "learning_rate": 1.966918573988701e-05, "loss": 0.506, "step": 1197 }, { "epoch": 0.26723176444345303, "grad_norm": 0.17040349543094635, "learning_rate": 1.9668585147106017e-05, "loss": 0.5112, "step": 1198 }, { "epoch": 0.2674548293553424, "grad_norm": 0.1750401258468628, "learning_rate": 1.9667984018817957e-05, "loss": 0.5465, "step": 1199 }, { "epoch": 0.26767789426723176, "grad_norm": 0.16083772480487823, "learning_rate": 1.9667382355056128e-05, "loss": 0.5105, "step": 1200 }, { "epoch": 0.2679009591791211, "grad_norm": 0.16108831763267517, "learning_rate": 1.9666780155853854e-05, "loss": 0.512, "step": 1201 }, { "epoch": 0.2681240240910105, "grad_norm": 0.17286653816699982, "learning_rate": 1.966617742124449e-05, "loss": 0.5686, "step": 1202 }, { "epoch": 0.26834708900289983, "grad_norm": 0.17905832827091217, "learning_rate": 1.9665574151261418e-05, "loss": 0.5423, "step": 1203 }, { "epoch": 0.2685701539147892, "grad_norm": 0.16681291162967682, "learning_rate": 1.966497034593805e-05, "loss": 0.5041, "step": 1204 }, { "epoch": 0.26879321882667856, "grad_norm": 0.17184089124202728, "learning_rate": 1.9664366005307828e-05, "loss": 0.5501, "step": 1205 }, { "epoch": 0.2690162837385679, "grad_norm": 0.19392690062522888, "learning_rate": 1.9663761129404228e-05, "loss": 0.5398, "step": 1206 }, { "epoch": 0.2692393486504573, "grad_norm": 0.16529332101345062, "learning_rate": 1.9663155718260746e-05, "loss": 0.5086, "step": 1207 }, { "epoch": 0.2694624135623466, "grad_norm": 0.1649094521999359, "learning_rate": 1.966254977191092e-05, "loss": 0.515, "step": 1208 }, { "epoch": 0.269685478474236, "grad_norm": 0.1709623634815216, "learning_rate": 1.9661943290388302e-05, "loss": 0.5315, "step": 1209 }, { "epoch": 0.26990854338612535, "grad_norm": 0.164725661277771, "learning_rate": 1.9661336273726496e-05, "loss": 0.4817, "step": 1210 }, { "epoch": 0.27013160829801475, "grad_norm": 0.1715349704027176, "learning_rate": 1.966072872195911e-05, "loss": 0.4706, "step": 1211 }, { "epoch": 0.2703546732099041, "grad_norm": 0.1747862696647644, "learning_rate": 1.9660120635119798e-05, "loss": 0.5055, "step": 1212 }, { "epoch": 0.2705777381217934, "grad_norm": 0.17992718517780304, "learning_rate": 1.9659512013242245e-05, "loss": 0.4946, "step": 1213 }, { "epoch": 0.2708008030336828, "grad_norm": 0.1895056813955307, "learning_rate": 1.9658902856360153e-05, "loss": 0.5258, "step": 1214 }, { "epoch": 0.27102386794557215, "grad_norm": 0.19161482155323029, "learning_rate": 1.9658293164507265e-05, "loss": 0.5206, "step": 1215 }, { "epoch": 0.27124693285746154, "grad_norm": 0.1860935389995575, "learning_rate": 1.965768293771735e-05, "loss": 0.5484, "step": 1216 }, { "epoch": 0.2714699977693509, "grad_norm": 0.17763131856918335, "learning_rate": 1.9657072176024202e-05, "loss": 0.5029, "step": 1217 }, { "epoch": 0.2716930626812402, "grad_norm": 0.18652020394802094, "learning_rate": 1.9656460879461652e-05, "loss": 0.5161, "step": 1218 }, { "epoch": 0.2719161275931296, "grad_norm": 0.2644646465778351, "learning_rate": 1.965584904806356e-05, "loss": 0.5371, "step": 1219 }, { "epoch": 0.27213919250501895, "grad_norm": 0.1769624799489975, "learning_rate": 1.9655236681863806e-05, "loss": 0.4892, "step": 1220 }, { "epoch": 0.27236225741690834, "grad_norm": 0.1740611046552658, "learning_rate": 1.9654623780896313e-05, "loss": 0.5162, "step": 1221 }, { "epoch": 0.2725853223287977, "grad_norm": 0.16314013302326202, "learning_rate": 1.9654010345195026e-05, "loss": 0.5304, "step": 1222 }, { "epoch": 0.27280838724068707, "grad_norm": 0.18545354902744293, "learning_rate": 1.9653396374793915e-05, "loss": 0.5181, "step": 1223 }, { "epoch": 0.2730314521525764, "grad_norm": 0.17202220857143402, "learning_rate": 1.9652781869726993e-05, "loss": 0.5486, "step": 1224 }, { "epoch": 0.27325451706446574, "grad_norm": 0.18883365392684937, "learning_rate": 1.9652166830028295e-05, "loss": 0.5565, "step": 1225 }, { "epoch": 0.27347758197635513, "grad_norm": 0.17693057656288147, "learning_rate": 1.9651551255731884e-05, "loss": 0.5275, "step": 1226 }, { "epoch": 0.27370064688824447, "grad_norm": 0.17243415117263794, "learning_rate": 1.9650935146871848e-05, "loss": 0.5037, "step": 1227 }, { "epoch": 0.27392371180013386, "grad_norm": 0.18462933599948883, "learning_rate": 1.9650318503482323e-05, "loss": 0.5573, "step": 1228 }, { "epoch": 0.2741467767120232, "grad_norm": 0.3310684263706207, "learning_rate": 1.964970132559745e-05, "loss": 0.5186, "step": 1229 }, { "epoch": 0.27436984162391254, "grad_norm": 0.16663436591625214, "learning_rate": 1.964908361325142e-05, "loss": 0.5067, "step": 1230 }, { "epoch": 0.27459290653580193, "grad_norm": 0.16989050805568695, "learning_rate": 1.964846536647845e-05, "loss": 0.5084, "step": 1231 }, { "epoch": 0.27481597144769127, "grad_norm": 0.16925394535064697, "learning_rate": 1.9647846585312775e-05, "loss": 0.5348, "step": 1232 }, { "epoch": 0.27503903635958066, "grad_norm": 0.176070898771286, "learning_rate": 1.9647227269788665e-05, "loss": 0.5249, "step": 1233 }, { "epoch": 0.27526210127147, "grad_norm": 0.16656488180160522, "learning_rate": 1.9646607419940428e-05, "loss": 0.5408, "step": 1234 }, { "epoch": 0.27548516618335933, "grad_norm": 0.16989517211914062, "learning_rate": 1.964598703580239e-05, "loss": 0.5033, "step": 1235 }, { "epoch": 0.2757082310952487, "grad_norm": 0.18006569147109985, "learning_rate": 1.9645366117408918e-05, "loss": 0.5354, "step": 1236 }, { "epoch": 0.27593129600713806, "grad_norm": 0.17241814732551575, "learning_rate": 1.9644744664794394e-05, "loss": 0.508, "step": 1237 }, { "epoch": 0.27615436091902745, "grad_norm": 0.20181423425674438, "learning_rate": 1.9644122677993246e-05, "loss": 0.4536, "step": 1238 }, { "epoch": 0.2763774258309168, "grad_norm": 0.17486101388931274, "learning_rate": 1.964350015703992e-05, "loss": 0.5597, "step": 1239 }, { "epoch": 0.2766004907428062, "grad_norm": 0.1774255484342575, "learning_rate": 1.9642877101968894e-05, "loss": 0.5511, "step": 1240 }, { "epoch": 0.2768235556546955, "grad_norm": 0.3290056586265564, "learning_rate": 1.964225351281468e-05, "loss": 0.5278, "step": 1241 }, { "epoch": 0.27704662056658486, "grad_norm": 0.20791974663734436, "learning_rate": 1.9641629389611813e-05, "loss": 0.5261, "step": 1242 }, { "epoch": 0.27726968547847425, "grad_norm": 0.17645849287509918, "learning_rate": 1.9641004732394862e-05, "loss": 0.526, "step": 1243 }, { "epoch": 0.2774927503903636, "grad_norm": 0.16956675052642822, "learning_rate": 1.9640379541198425e-05, "loss": 0.5489, "step": 1244 }, { "epoch": 0.277715815302253, "grad_norm": 0.18473365902900696, "learning_rate": 1.9639753816057128e-05, "loss": 0.5422, "step": 1245 }, { "epoch": 0.2779388802141423, "grad_norm": 0.262015700340271, "learning_rate": 1.9639127557005627e-05, "loss": 0.5031, "step": 1246 }, { "epoch": 0.27816194512603165, "grad_norm": 0.19820185005664825, "learning_rate": 1.963850076407861e-05, "loss": 0.5132, "step": 1247 }, { "epoch": 0.27838501003792104, "grad_norm": 0.1682923436164856, "learning_rate": 1.9637873437310795e-05, "loss": 0.5214, "step": 1248 }, { "epoch": 0.2786080749498104, "grad_norm": 0.17070676386356354, "learning_rate": 1.9637245576736923e-05, "loss": 0.5368, "step": 1249 }, { "epoch": 0.2788311398616998, "grad_norm": 0.17165376245975494, "learning_rate": 1.9636617182391768e-05, "loss": 0.5282, "step": 1250 }, { "epoch": 0.2790542047735891, "grad_norm": 0.17272816598415375, "learning_rate": 1.963598825431014e-05, "loss": 0.5657, "step": 1251 }, { "epoch": 0.2792772696854785, "grad_norm": 0.1659235805273056, "learning_rate": 1.9635358792526865e-05, "loss": 0.5181, "step": 1252 }, { "epoch": 0.27950033459736784, "grad_norm": 0.1700238287448883, "learning_rate": 1.9634728797076818e-05, "loss": 0.5194, "step": 1253 }, { "epoch": 0.2797233995092572, "grad_norm": 0.1710444688796997, "learning_rate": 1.9634098267994882e-05, "loss": 0.5405, "step": 1254 }, { "epoch": 0.27994646442114657, "grad_norm": 0.1748325079679489, "learning_rate": 1.9633467205315983e-05, "loss": 0.5295, "step": 1255 }, { "epoch": 0.2801695293330359, "grad_norm": 0.1600925624370575, "learning_rate": 1.9632835609075072e-05, "loss": 0.5448, "step": 1256 }, { "epoch": 0.2803925942449253, "grad_norm": 0.1937401294708252, "learning_rate": 1.9632203479307132e-05, "loss": 0.5145, "step": 1257 }, { "epoch": 0.28061565915681463, "grad_norm": 0.1888452023267746, "learning_rate": 1.9631570816047176e-05, "loss": 0.5376, "step": 1258 }, { "epoch": 0.28083872406870397, "grad_norm": 0.16786231100559235, "learning_rate": 1.963093761933024e-05, "loss": 0.5336, "step": 1259 }, { "epoch": 0.28106178898059336, "grad_norm": 0.16962790489196777, "learning_rate": 1.9630303889191406e-05, "loss": 0.5161, "step": 1260 }, { "epoch": 0.2812848538924827, "grad_norm": 0.17393000423908234, "learning_rate": 1.9629669625665757e-05, "loss": 0.5098, "step": 1261 }, { "epoch": 0.2815079188043721, "grad_norm": 0.18124370276927948, "learning_rate": 1.9629034828788435e-05, "loss": 0.5155, "step": 1262 }, { "epoch": 0.28173098371626143, "grad_norm": 0.16687680780887604, "learning_rate": 1.962839949859459e-05, "loss": 0.5189, "step": 1263 }, { "epoch": 0.28195404862815077, "grad_norm": 0.1622602790594101, "learning_rate": 1.9627763635119423e-05, "loss": 0.4974, "step": 1264 }, { "epoch": 0.28217711354004016, "grad_norm": 0.16674260795116425, "learning_rate": 1.9627127238398142e-05, "loss": 0.4923, "step": 1265 }, { "epoch": 0.2824001784519295, "grad_norm": 0.1815263032913208, "learning_rate": 1.9626490308465996e-05, "loss": 0.5048, "step": 1266 }, { "epoch": 0.2826232433638189, "grad_norm": 0.17753565311431885, "learning_rate": 1.9625852845358265e-05, "loss": 0.5326, "step": 1267 }, { "epoch": 0.2828463082757082, "grad_norm": 0.1820516288280487, "learning_rate": 1.9625214849110253e-05, "loss": 0.5289, "step": 1268 }, { "epoch": 0.2830693731875976, "grad_norm": 0.16515739262104034, "learning_rate": 1.9624576319757302e-05, "loss": 0.5159, "step": 1269 }, { "epoch": 0.28329243809948695, "grad_norm": 0.17564083635807037, "learning_rate": 1.9623937257334767e-05, "loss": 0.5052, "step": 1270 }, { "epoch": 0.2835155030113763, "grad_norm": 0.18046805262565613, "learning_rate": 1.9623297661878054e-05, "loss": 0.5349, "step": 1271 }, { "epoch": 0.2837385679232657, "grad_norm": 0.16942784190177917, "learning_rate": 1.9622657533422583e-05, "loss": 0.4924, "step": 1272 }, { "epoch": 0.283961632835155, "grad_norm": 0.17093469202518463, "learning_rate": 1.9622016872003807e-05, "loss": 0.5261, "step": 1273 }, { "epoch": 0.2841846977470444, "grad_norm": 0.17606058716773987, "learning_rate": 1.9621375677657217e-05, "loss": 0.51, "step": 1274 }, { "epoch": 0.28440776265893375, "grad_norm": 0.16897493600845337, "learning_rate": 1.9620733950418316e-05, "loss": 0.5058, "step": 1275 }, { "epoch": 0.2846308275708231, "grad_norm": 0.18536695837974548, "learning_rate": 1.9620091690322654e-05, "loss": 0.512, "step": 1276 }, { "epoch": 0.2848538924827125, "grad_norm": 0.17734606564044952, "learning_rate": 1.96194488974058e-05, "loss": 0.5311, "step": 1277 }, { "epoch": 0.2850769573946018, "grad_norm": 0.18116462230682373, "learning_rate": 1.9618805571703356e-05, "loss": 0.5153, "step": 1278 }, { "epoch": 0.2853000223064912, "grad_norm": 0.17743848264217377, "learning_rate": 1.961816171325096e-05, "loss": 0.5238, "step": 1279 }, { "epoch": 0.28552308721838054, "grad_norm": 0.16778843104839325, "learning_rate": 1.961751732208426e-05, "loss": 0.505, "step": 1280 }, { "epoch": 0.2857461521302699, "grad_norm": 0.17177841067314148, "learning_rate": 1.961687239823896e-05, "loss": 0.5075, "step": 1281 }, { "epoch": 0.2859692170421593, "grad_norm": 0.18460653722286224, "learning_rate": 1.9616226941750775e-05, "loss": 0.5109, "step": 1282 }, { "epoch": 0.2861922819540486, "grad_norm": 0.16801568865776062, "learning_rate": 1.961558095265545e-05, "loss": 0.552, "step": 1283 }, { "epoch": 0.286415346865938, "grad_norm": 0.19550864398479462, "learning_rate": 1.961493443098877e-05, "loss": 0.5138, "step": 1284 }, { "epoch": 0.28663841177782734, "grad_norm": 0.17915892601013184, "learning_rate": 1.9614287376786537e-05, "loss": 0.5267, "step": 1285 }, { "epoch": 0.28686147668971673, "grad_norm": 0.17141470313072205, "learning_rate": 1.9613639790084596e-05, "loss": 0.5398, "step": 1286 }, { "epoch": 0.28708454160160607, "grad_norm": 0.16843633353710175, "learning_rate": 1.9612991670918808e-05, "loss": 0.5224, "step": 1287 }, { "epoch": 0.2873076065134954, "grad_norm": 0.17116384208202362, "learning_rate": 1.9612343019325077e-05, "loss": 0.506, "step": 1288 }, { "epoch": 0.2875306714253848, "grad_norm": 0.19337520003318787, "learning_rate": 1.9611693835339323e-05, "loss": 0.5417, "step": 1289 }, { "epoch": 0.28775373633727414, "grad_norm": 0.1831638365983963, "learning_rate": 1.9611044118997507e-05, "loss": 0.5487, "step": 1290 }, { "epoch": 0.2879768012491635, "grad_norm": 0.1741098165512085, "learning_rate": 1.961039387033561e-05, "loss": 0.5098, "step": 1291 }, { "epoch": 0.28819986616105286, "grad_norm": 0.16796158254146576, "learning_rate": 1.960974308938965e-05, "loss": 0.5105, "step": 1292 }, { "epoch": 0.2884229310729422, "grad_norm": 0.16848038136959076, "learning_rate": 1.9609091776195667e-05, "loss": 0.5106, "step": 1293 }, { "epoch": 0.2886459959848316, "grad_norm": 0.1563376486301422, "learning_rate": 1.960843993078974e-05, "loss": 0.4927, "step": 1294 }, { "epoch": 0.28886906089672093, "grad_norm": 0.16741539537906647, "learning_rate": 1.9607787553207972e-05, "loss": 0.5241, "step": 1295 }, { "epoch": 0.2890921258086103, "grad_norm": 0.16913025081157684, "learning_rate": 1.9607134643486492e-05, "loss": 0.5183, "step": 1296 }, { "epoch": 0.28931519072049966, "grad_norm": 0.17226742208003998, "learning_rate": 1.9606481201661466e-05, "loss": 0.5105, "step": 1297 }, { "epoch": 0.28953825563238905, "grad_norm": 0.17816194891929626, "learning_rate": 1.960582722776908e-05, "loss": 0.5352, "step": 1298 }, { "epoch": 0.2897613205442784, "grad_norm": 0.16423411667346954, "learning_rate": 1.9605172721845564e-05, "loss": 0.5303, "step": 1299 }, { "epoch": 0.2899843854561677, "grad_norm": 0.22330845892429352, "learning_rate": 1.9604517683927156e-05, "loss": 0.5115, "step": 1300 }, { "epoch": 0.2902074503680571, "grad_norm": 0.16385138034820557, "learning_rate": 1.960386211405015e-05, "loss": 0.5375, "step": 1301 }, { "epoch": 0.29043051527994646, "grad_norm": 0.17269232869148254, "learning_rate": 1.960320601225085e-05, "loss": 0.5291, "step": 1302 }, { "epoch": 0.29065358019183585, "grad_norm": 0.1625886857509613, "learning_rate": 1.9602549378565592e-05, "loss": 0.4982, "step": 1303 }, { "epoch": 0.2908766451037252, "grad_norm": 0.17353162169456482, "learning_rate": 1.9601892213030746e-05, "loss": 0.5109, "step": 1304 }, { "epoch": 0.2910997100156145, "grad_norm": 0.16863249242305756, "learning_rate": 1.9601234515682712e-05, "loss": 0.5409, "step": 1305 }, { "epoch": 0.2913227749275039, "grad_norm": 0.17428503930568695, "learning_rate": 1.960057628655792e-05, "loss": 0.5155, "step": 1306 }, { "epoch": 0.29154583983939325, "grad_norm": 0.1825830489397049, "learning_rate": 1.9599917525692816e-05, "loss": 0.5355, "step": 1307 }, { "epoch": 0.29176890475128264, "grad_norm": 0.8213825821876526, "learning_rate": 1.95992582331239e-05, "loss": 0.5366, "step": 1308 }, { "epoch": 0.291991969663172, "grad_norm": 0.16370530426502228, "learning_rate": 1.959859840888768e-05, "loss": 0.5071, "step": 1309 }, { "epoch": 0.2922150345750613, "grad_norm": 0.17610913515090942, "learning_rate": 1.95979380530207e-05, "loss": 0.5346, "step": 1310 }, { "epoch": 0.2924380994869507, "grad_norm": 0.16303309798240662, "learning_rate": 1.959727716555954e-05, "loss": 0.535, "step": 1311 }, { "epoch": 0.29266116439884005, "grad_norm": 0.17250092327594757, "learning_rate": 1.9596615746540798e-05, "loss": 0.5322, "step": 1312 }, { "epoch": 0.29288422931072944, "grad_norm": 0.18915972113609314, "learning_rate": 1.959595379600111e-05, "loss": 0.5293, "step": 1313 }, { "epoch": 0.2931072942226188, "grad_norm": 0.17842963337898254, "learning_rate": 1.9595291313977144e-05, "loss": 0.5197, "step": 1314 }, { "epoch": 0.29333035913450817, "grad_norm": 0.1693105846643448, "learning_rate": 1.959462830050559e-05, "loss": 0.5044, "step": 1315 }, { "epoch": 0.2935534240463975, "grad_norm": 0.23653189837932587, "learning_rate": 1.959396475562316e-05, "loss": 0.5286, "step": 1316 }, { "epoch": 0.29377648895828684, "grad_norm": 0.1704518347978592, "learning_rate": 1.9593300679366622e-05, "loss": 0.5288, "step": 1317 }, { "epoch": 0.29399955387017623, "grad_norm": 0.169080451130867, "learning_rate": 1.9592636071772745e-05, "loss": 0.5336, "step": 1318 }, { "epoch": 0.29422261878206557, "grad_norm": 0.2256423681974411, "learning_rate": 1.959197093287834e-05, "loss": 0.5253, "step": 1319 }, { "epoch": 0.29444568369395496, "grad_norm": 0.1699950098991394, "learning_rate": 1.9591305262720252e-05, "loss": 0.5308, "step": 1320 }, { "epoch": 0.2946687486058443, "grad_norm": 0.17314709722995758, "learning_rate": 1.9590639061335345e-05, "loss": 0.5477, "step": 1321 }, { "epoch": 0.29489181351773364, "grad_norm": 0.16141551733016968, "learning_rate": 1.958997232876052e-05, "loss": 0.5207, "step": 1322 }, { "epoch": 0.29511487842962303, "grad_norm": 0.15994872152805328, "learning_rate": 1.9589305065032705e-05, "loss": 0.5142, "step": 1323 }, { "epoch": 0.29533794334151237, "grad_norm": 0.16060283780097961, "learning_rate": 1.9588637270188852e-05, "loss": 0.4967, "step": 1324 }, { "epoch": 0.29556100825340176, "grad_norm": 0.17949143052101135, "learning_rate": 1.9587968944265955e-05, "loss": 0.5484, "step": 1325 }, { "epoch": 0.2957840731652911, "grad_norm": 0.17476752400398254, "learning_rate": 1.958730008730103e-05, "loss": 0.5466, "step": 1326 }, { "epoch": 0.2960071380771805, "grad_norm": 0.205107182264328, "learning_rate": 1.9586630699331115e-05, "loss": 0.5147, "step": 1327 }, { "epoch": 0.2962302029890698, "grad_norm": 0.17714425921440125, "learning_rate": 1.9585960780393293e-05, "loss": 0.5142, "step": 1328 }, { "epoch": 0.29645326790095916, "grad_norm": 0.17877231538295746, "learning_rate": 1.9585290330524663e-05, "loss": 0.5073, "step": 1329 }, { "epoch": 0.29667633281284855, "grad_norm": 0.16752929985523224, "learning_rate": 1.958461934976236e-05, "loss": 0.5381, "step": 1330 }, { "epoch": 0.2968993977247379, "grad_norm": 0.17550905048847198, "learning_rate": 1.9583947838143553e-05, "loss": 0.5185, "step": 1331 }, { "epoch": 0.2971224626366273, "grad_norm": 0.17885231971740723, "learning_rate": 1.958327579570542e-05, "loss": 0.5279, "step": 1332 }, { "epoch": 0.2973455275485166, "grad_norm": 0.16726696491241455, "learning_rate": 1.95826032224852e-05, "loss": 0.5253, "step": 1333 }, { "epoch": 0.29756859246040596, "grad_norm": 0.16935260593891144, "learning_rate": 1.9581930118520135e-05, "loss": 0.5311, "step": 1334 }, { "epoch": 0.29779165737229535, "grad_norm": 0.22187648713588715, "learning_rate": 1.9581256483847505e-05, "loss": 0.5124, "step": 1335 }, { "epoch": 0.2980147222841847, "grad_norm": 0.17254653573036194, "learning_rate": 1.9580582318504623e-05, "loss": 0.5113, "step": 1336 }, { "epoch": 0.2982377871960741, "grad_norm": 0.1686115860939026, "learning_rate": 1.9579907622528827e-05, "loss": 0.516, "step": 1337 }, { "epoch": 0.2984608521079634, "grad_norm": 0.17233097553253174, "learning_rate": 1.9579232395957492e-05, "loss": 0.5135, "step": 1338 }, { "epoch": 0.29868391701985275, "grad_norm": 0.1767144799232483, "learning_rate": 1.957855663882801e-05, "loss": 0.5113, "step": 1339 }, { "epoch": 0.29890698193174214, "grad_norm": 0.16990788280963898, "learning_rate": 1.9577880351177803e-05, "loss": 0.5213, "step": 1340 }, { "epoch": 0.2991300468436315, "grad_norm": 0.16583338379859924, "learning_rate": 1.957720353304434e-05, "loss": 0.5059, "step": 1341 }, { "epoch": 0.2993531117555209, "grad_norm": 0.17070691287517548, "learning_rate": 1.95765261844651e-05, "loss": 0.5154, "step": 1342 }, { "epoch": 0.2995761766674102, "grad_norm": 0.18274636566638947, "learning_rate": 1.9575848305477606e-05, "loss": 0.5367, "step": 1343 }, { "epoch": 0.2997992415792996, "grad_norm": 0.16276118159294128, "learning_rate": 1.957516989611939e-05, "loss": 0.5156, "step": 1344 }, { "epoch": 0.30002230649118894, "grad_norm": 0.16200782358646393, "learning_rate": 1.9574490956428045e-05, "loss": 0.5201, "step": 1345 }, { "epoch": 0.3002453714030783, "grad_norm": 0.18693317472934723, "learning_rate": 1.9573811486441158e-05, "loss": 0.5181, "step": 1346 }, { "epoch": 0.30046843631496767, "grad_norm": 0.17908775806427002, "learning_rate": 1.9573131486196372e-05, "loss": 0.5012, "step": 1347 }, { "epoch": 0.300691501226857, "grad_norm": 0.16340862214565277, "learning_rate": 1.9572450955731346e-05, "loss": 0.4951, "step": 1348 }, { "epoch": 0.3009145661387464, "grad_norm": 0.16997461020946503, "learning_rate": 1.957176989508377e-05, "loss": 0.5248, "step": 1349 }, { "epoch": 0.30113763105063573, "grad_norm": 0.1896587759256363, "learning_rate": 1.9571088304291376e-05, "loss": 0.5129, "step": 1350 }, { "epoch": 0.30136069596252507, "grad_norm": 0.1624515950679779, "learning_rate": 1.95704061833919e-05, "loss": 0.4872, "step": 1351 }, { "epoch": 0.30158376087441446, "grad_norm": 0.16828244924545288, "learning_rate": 1.956972353242313e-05, "loss": 0.5291, "step": 1352 }, { "epoch": 0.3018068257863038, "grad_norm": 0.17992524802684784, "learning_rate": 1.9569040351422882e-05, "loss": 0.5396, "step": 1353 }, { "epoch": 0.3020298906981932, "grad_norm": 0.17174522578716278, "learning_rate": 1.956835664042898e-05, "loss": 0.523, "step": 1354 }, { "epoch": 0.30225295561008253, "grad_norm": 0.18021540343761444, "learning_rate": 1.9567672399479304e-05, "loss": 0.5091, "step": 1355 }, { "epoch": 0.30247602052197187, "grad_norm": 0.1889013797044754, "learning_rate": 1.9566987628611748e-05, "loss": 0.521, "step": 1356 }, { "epoch": 0.30269908543386126, "grad_norm": 0.1710296869277954, "learning_rate": 1.9566302327864233e-05, "loss": 0.51, "step": 1357 }, { "epoch": 0.3029221503457506, "grad_norm": 0.16925543546676636, "learning_rate": 1.9565616497274725e-05, "loss": 0.5194, "step": 1358 }, { "epoch": 0.30314521525764, "grad_norm": 0.16558070480823517, "learning_rate": 1.95649301368812e-05, "loss": 0.5284, "step": 1359 }, { "epoch": 0.3033682801695293, "grad_norm": 0.1646072119474411, "learning_rate": 1.9564243246721686e-05, "loss": 0.53, "step": 1360 }, { "epoch": 0.3035913450814187, "grad_norm": 0.17670351266860962, "learning_rate": 1.9563555826834214e-05, "loss": 0.527, "step": 1361 }, { "epoch": 0.30381440999330805, "grad_norm": 0.17414048314094543, "learning_rate": 1.9562867877256867e-05, "loss": 0.5318, "step": 1362 }, { "epoch": 0.3040374749051974, "grad_norm": 0.18004649877548218, "learning_rate": 1.956217939802774e-05, "loss": 0.5149, "step": 1363 }, { "epoch": 0.3042605398170868, "grad_norm": 0.16560295224189758, "learning_rate": 1.9561490389184973e-05, "loss": 0.5093, "step": 1364 }, { "epoch": 0.3044836047289761, "grad_norm": 0.1967393010854721, "learning_rate": 1.956080085076672e-05, "loss": 0.499, "step": 1365 }, { "epoch": 0.3047066696408655, "grad_norm": 0.17638468742370605, "learning_rate": 1.956011078281118e-05, "loss": 0.5287, "step": 1366 }, { "epoch": 0.30492973455275485, "grad_norm": 0.18389096856117249, "learning_rate": 1.955942018535657e-05, "loss": 0.5263, "step": 1367 }, { "epoch": 0.3051527994646442, "grad_norm": 0.16795802116394043, "learning_rate": 1.9558729058441135e-05, "loss": 0.4909, "step": 1368 }, { "epoch": 0.3053758643765336, "grad_norm": 0.1886921525001526, "learning_rate": 1.955803740210316e-05, "loss": 0.4929, "step": 1369 }, { "epoch": 0.3055989292884229, "grad_norm": 0.15743811428546906, "learning_rate": 1.9557345216380953e-05, "loss": 0.5252, "step": 1370 }, { "epoch": 0.3058219942003123, "grad_norm": 0.18187056481838226, "learning_rate": 1.955665250131285e-05, "loss": 0.5307, "step": 1371 }, { "epoch": 0.30604505911220165, "grad_norm": 0.17258834838867188, "learning_rate": 1.9555959256937214e-05, "loss": 0.5007, "step": 1372 }, { "epoch": 0.30626812402409104, "grad_norm": 0.16746965050697327, "learning_rate": 1.9555265483292446e-05, "loss": 0.5174, "step": 1373 }, { "epoch": 0.3064911889359804, "grad_norm": 0.191069096326828, "learning_rate": 1.955457118041697e-05, "loss": 0.5462, "step": 1374 }, { "epoch": 0.3067142538478697, "grad_norm": 0.17369060218334198, "learning_rate": 1.9553876348349242e-05, "loss": 0.5175, "step": 1375 }, { "epoch": 0.3069373187597591, "grad_norm": 0.16620078682899475, "learning_rate": 1.9553180987127748e-05, "loss": 0.5298, "step": 1376 }, { "epoch": 0.30716038367164844, "grad_norm": 0.22137194871902466, "learning_rate": 1.9552485096790996e-05, "loss": 0.5135, "step": 1377 }, { "epoch": 0.30738344858353783, "grad_norm": 0.20380550622940063, "learning_rate": 1.9551788677377535e-05, "loss": 0.5304, "step": 1378 }, { "epoch": 0.30760651349542717, "grad_norm": 0.16844195127487183, "learning_rate": 1.955109172892593e-05, "loss": 0.55, "step": 1379 }, { "epoch": 0.3078295784073165, "grad_norm": 0.1768990010023117, "learning_rate": 1.955039425147479e-05, "loss": 0.5254, "step": 1380 }, { "epoch": 0.3080526433192059, "grad_norm": 0.16573172807693481, "learning_rate": 1.954969624506274e-05, "loss": 0.5195, "step": 1381 }, { "epoch": 0.30827570823109524, "grad_norm": 0.16543026268482208, "learning_rate": 1.9548997709728443e-05, "loss": 0.5243, "step": 1382 }, { "epoch": 0.30849877314298463, "grad_norm": 0.17528237402439117, "learning_rate": 1.9548298645510587e-05, "loss": 0.5133, "step": 1383 }, { "epoch": 0.30872183805487396, "grad_norm": 0.16765229403972626, "learning_rate": 1.954759905244789e-05, "loss": 0.5185, "step": 1384 }, { "epoch": 0.3089449029667633, "grad_norm": 0.17179544270038605, "learning_rate": 1.9546898930579102e-05, "loss": 0.5156, "step": 1385 }, { "epoch": 0.3091679678786527, "grad_norm": 0.17194928228855133, "learning_rate": 1.9546198279942997e-05, "loss": 0.5134, "step": 1386 }, { "epoch": 0.30939103279054203, "grad_norm": 0.16621114313602448, "learning_rate": 1.9545497100578382e-05, "loss": 0.5101, "step": 1387 }, { "epoch": 0.3096140977024314, "grad_norm": 0.16408509016036987, "learning_rate": 1.9544795392524096e-05, "loss": 0.515, "step": 1388 }, { "epoch": 0.30983716261432076, "grad_norm": 0.16583041846752167, "learning_rate": 1.9544093155819004e-05, "loss": 0.4974, "step": 1389 }, { "epoch": 0.31006022752621015, "grad_norm": 0.16977885365486145, "learning_rate": 1.9543390390502e-05, "loss": 0.5229, "step": 1390 }, { "epoch": 0.3102832924380995, "grad_norm": 0.1626800149679184, "learning_rate": 1.9542687096611998e-05, "loss": 0.5166, "step": 1391 }, { "epoch": 0.3105063573499888, "grad_norm": 0.17305392026901245, "learning_rate": 1.9541983274187964e-05, "loss": 0.516, "step": 1392 }, { "epoch": 0.3107294222618782, "grad_norm": 0.18632498383522034, "learning_rate": 1.9541278923268872e-05, "loss": 0.4985, "step": 1393 }, { "epoch": 0.31095248717376756, "grad_norm": 0.18472731113433838, "learning_rate": 1.9540574043893738e-05, "loss": 0.5304, "step": 1394 }, { "epoch": 0.31117555208565695, "grad_norm": 0.1734933704137802, "learning_rate": 1.9539868636101602e-05, "loss": 0.5523, "step": 1395 }, { "epoch": 0.3113986169975463, "grad_norm": 0.17965291440486908, "learning_rate": 1.9539162699931534e-05, "loss": 0.532, "step": 1396 }, { "epoch": 0.3116216819094356, "grad_norm": 0.1630621999502182, "learning_rate": 1.9538456235422625e-05, "loss": 0.5351, "step": 1397 }, { "epoch": 0.311844746821325, "grad_norm": 0.17468804121017456, "learning_rate": 1.9537749242614016e-05, "loss": 0.5144, "step": 1398 }, { "epoch": 0.31206781173321435, "grad_norm": 0.17356379330158234, "learning_rate": 1.9537041721544862e-05, "loss": 0.5124, "step": 1399 }, { "epoch": 0.31229087664510374, "grad_norm": 0.1623128354549408, "learning_rate": 1.953633367225434e-05, "loss": 0.5256, "step": 1400 }, { "epoch": 0.3125139415569931, "grad_norm": 0.17314434051513672, "learning_rate": 1.9535625094781677e-05, "loss": 0.5231, "step": 1401 }, { "epoch": 0.3127370064688825, "grad_norm": 0.17369012534618378, "learning_rate": 1.9534915989166115e-05, "loss": 0.5266, "step": 1402 }, { "epoch": 0.3129600713807718, "grad_norm": 0.16464915871620178, "learning_rate": 1.9534206355446927e-05, "loss": 0.4962, "step": 1403 }, { "epoch": 0.31318313629266115, "grad_norm": 0.16123609244823456, "learning_rate": 1.953349619366342e-05, "loss": 0.4932, "step": 1404 }, { "epoch": 0.31340620120455054, "grad_norm": 0.16396626830101013, "learning_rate": 1.9532785503854926e-05, "loss": 0.5103, "step": 1405 }, { "epoch": 0.3136292661164399, "grad_norm": 0.1750822216272354, "learning_rate": 1.9532074286060805e-05, "loss": 0.5108, "step": 1406 }, { "epoch": 0.31385233102832927, "grad_norm": 0.1627611666917801, "learning_rate": 1.953136254032045e-05, "loss": 0.4933, "step": 1407 }, { "epoch": 0.3140753959402186, "grad_norm": 0.17482301592826843, "learning_rate": 1.9530650266673286e-05, "loss": 0.5179, "step": 1408 }, { "epoch": 0.31429846085210794, "grad_norm": 0.17025405168533325, "learning_rate": 1.952993746515876e-05, "loss": 0.4968, "step": 1409 }, { "epoch": 0.31452152576399733, "grad_norm": 0.17483878135681152, "learning_rate": 1.9529224135816348e-05, "loss": 0.5124, "step": 1410 }, { "epoch": 0.31474459067588667, "grad_norm": 0.1743018627166748, "learning_rate": 1.9528510278685568e-05, "loss": 0.5207, "step": 1411 }, { "epoch": 0.31496765558777606, "grad_norm": 0.16575628519058228, "learning_rate": 1.9527795893805947e-05, "loss": 0.4846, "step": 1412 }, { "epoch": 0.3151907204996654, "grad_norm": 0.16664400696754456, "learning_rate": 1.952708098121706e-05, "loss": 0.5097, "step": 1413 }, { "epoch": 0.31541378541155474, "grad_norm": 0.17265918850898743, "learning_rate": 1.9526365540958497e-05, "loss": 0.5002, "step": 1414 }, { "epoch": 0.31563685032344413, "grad_norm": 0.16979828476905823, "learning_rate": 1.952564957306989e-05, "loss": 0.5172, "step": 1415 }, { "epoch": 0.31585991523533347, "grad_norm": 0.17120909690856934, "learning_rate": 1.952493307759089e-05, "loss": 0.5204, "step": 1416 }, { "epoch": 0.31608298014722286, "grad_norm": 0.15589144825935364, "learning_rate": 1.9524216054561186e-05, "loss": 0.5001, "step": 1417 }, { "epoch": 0.3163060450591122, "grad_norm": 0.1638038605451584, "learning_rate": 1.9523498504020486e-05, "loss": 0.495, "step": 1418 }, { "epoch": 0.3165291099710016, "grad_norm": 0.2510809600353241, "learning_rate": 1.952278042600853e-05, "loss": 0.5177, "step": 1419 }, { "epoch": 0.3167521748828909, "grad_norm": 0.1806640475988388, "learning_rate": 1.9522061820565093e-05, "loss": 0.5187, "step": 1420 }, { "epoch": 0.31697523979478026, "grad_norm": 0.16829638183116913, "learning_rate": 1.9521342687729977e-05, "loss": 0.5275, "step": 1421 }, { "epoch": 0.31719830470666965, "grad_norm": 0.18520011007785797, "learning_rate": 1.9520623027543015e-05, "loss": 0.5654, "step": 1422 }, { "epoch": 0.317421369618559, "grad_norm": 0.17533186078071594, "learning_rate": 1.951990284004406e-05, "loss": 0.5102, "step": 1423 }, { "epoch": 0.3176444345304484, "grad_norm": 0.17360709607601166, "learning_rate": 1.9519182125273e-05, "loss": 0.5196, "step": 1424 }, { "epoch": 0.3178674994423377, "grad_norm": 0.17495203018188477, "learning_rate": 1.951846088326976e-05, "loss": 0.5416, "step": 1425 }, { "epoch": 0.31809056435422706, "grad_norm": 0.17361585795879364, "learning_rate": 1.9517739114074282e-05, "loss": 0.53, "step": 1426 }, { "epoch": 0.31831362926611645, "grad_norm": 0.17399337887763977, "learning_rate": 1.9517016817726542e-05, "loss": 0.5207, "step": 1427 }, { "epoch": 0.3185366941780058, "grad_norm": 0.16763678193092346, "learning_rate": 1.9516293994266548e-05, "loss": 0.5059, "step": 1428 }, { "epoch": 0.3187597590898952, "grad_norm": 0.16563831269741058, "learning_rate": 1.951557064373433e-05, "loss": 0.5079, "step": 1429 }, { "epoch": 0.3189828240017845, "grad_norm": 0.1833125650882721, "learning_rate": 1.951484676616996e-05, "loss": 0.5181, "step": 1430 }, { "epoch": 0.31920588891367385, "grad_norm": 0.15749512612819672, "learning_rate": 1.951412236161352e-05, "loss": 0.4967, "step": 1431 }, { "epoch": 0.31942895382556324, "grad_norm": 0.17580753564834595, "learning_rate": 1.9513397430105137e-05, "loss": 0.5238, "step": 1432 }, { "epoch": 0.3196520187374526, "grad_norm": 0.16896769404411316, "learning_rate": 1.9512671971684963e-05, "loss": 0.5218, "step": 1433 }, { "epoch": 0.319875083649342, "grad_norm": 0.1761835813522339, "learning_rate": 1.951194598639318e-05, "loss": 0.5183, "step": 1434 }, { "epoch": 0.3200981485612313, "grad_norm": 0.1777181178331375, "learning_rate": 1.9511219474269992e-05, "loss": 0.5283, "step": 1435 }, { "epoch": 0.3203212134731207, "grad_norm": 0.17135201394557953, "learning_rate": 1.9510492435355647e-05, "loss": 0.5183, "step": 1436 }, { "epoch": 0.32054427838501004, "grad_norm": 0.18420204520225525, "learning_rate": 1.9509764869690407e-05, "loss": 0.5239, "step": 1437 }, { "epoch": 0.3207673432968994, "grad_norm": 0.17093034088611603, "learning_rate": 1.9509036777314568e-05, "loss": 0.508, "step": 1438 }, { "epoch": 0.32099040820878877, "grad_norm": 0.18275004625320435, "learning_rate": 1.9508308158268458e-05, "loss": 0.5257, "step": 1439 }, { "epoch": 0.3212134731206781, "grad_norm": 0.19184084236621857, "learning_rate": 1.950757901259243e-05, "loss": 0.5138, "step": 1440 }, { "epoch": 0.3214365380325675, "grad_norm": 0.19820280373096466, "learning_rate": 1.9506849340326876e-05, "loss": 0.5406, "step": 1441 }, { "epoch": 0.32165960294445683, "grad_norm": 0.1850280463695526, "learning_rate": 1.9506119141512204e-05, "loss": 0.5009, "step": 1442 }, { "epoch": 0.32188266785634617, "grad_norm": 0.17693570256233215, "learning_rate": 1.9505388416188854e-05, "loss": 0.5502, "step": 1443 }, { "epoch": 0.32210573276823556, "grad_norm": 0.16360151767730713, "learning_rate": 1.9504657164397307e-05, "loss": 0.5215, "step": 1444 }, { "epoch": 0.3223287976801249, "grad_norm": 0.1807732880115509, "learning_rate": 1.950392538617806e-05, "loss": 0.5024, "step": 1445 }, { "epoch": 0.3225518625920143, "grad_norm": 0.17271289229393005, "learning_rate": 1.950319308157164e-05, "loss": 0.4997, "step": 1446 }, { "epoch": 0.32277492750390363, "grad_norm": 0.16827355325222015, "learning_rate": 1.950246025061861e-05, "loss": 0.5178, "step": 1447 }, { "epoch": 0.322997992415793, "grad_norm": 0.17736375331878662, "learning_rate": 1.950172689335956e-05, "loss": 0.494, "step": 1448 }, { "epoch": 0.32322105732768236, "grad_norm": 0.16899362206459045, "learning_rate": 1.9500993009835106e-05, "loss": 0.4701, "step": 1449 }, { "epoch": 0.3234441222395717, "grad_norm": 0.1732366383075714, "learning_rate": 1.9500258600085894e-05, "loss": 0.4774, "step": 1450 }, { "epoch": 0.3236671871514611, "grad_norm": 0.16916102170944214, "learning_rate": 1.9499523664152603e-05, "loss": 0.5222, "step": 1451 }, { "epoch": 0.3238902520633504, "grad_norm": 0.18570592999458313, "learning_rate": 1.9498788202075936e-05, "loss": 0.5611, "step": 1452 }, { "epoch": 0.3241133169752398, "grad_norm": 0.1690537929534912, "learning_rate": 1.9498052213896627e-05, "loss": 0.5055, "step": 1453 }, { "epoch": 0.32433638188712915, "grad_norm": 0.17589417099952698, "learning_rate": 1.9497315699655447e-05, "loss": 0.539, "step": 1454 }, { "epoch": 0.3245594467990185, "grad_norm": 0.1848466843366623, "learning_rate": 1.949657865939318e-05, "loss": 0.5337, "step": 1455 }, { "epoch": 0.3247825117109079, "grad_norm": 0.17046886682510376, "learning_rate": 1.949584109315065e-05, "loss": 0.507, "step": 1456 }, { "epoch": 0.3250055766227972, "grad_norm": 0.17708532512187958, "learning_rate": 1.9495103000968708e-05, "loss": 0.5377, "step": 1457 }, { "epoch": 0.3252286415346866, "grad_norm": 0.18529073894023895, "learning_rate": 1.9494364382888236e-05, "loss": 0.5136, "step": 1458 }, { "epoch": 0.32545170644657595, "grad_norm": 0.16627921164035797, "learning_rate": 1.9493625238950143e-05, "loss": 0.5149, "step": 1459 }, { "epoch": 0.3256747713584653, "grad_norm": 0.1823386400938034, "learning_rate": 1.949288556919537e-05, "loss": 0.5422, "step": 1460 }, { "epoch": 0.3258978362703547, "grad_norm": 0.17173448204994202, "learning_rate": 1.949214537366488e-05, "loss": 0.5315, "step": 1461 }, { "epoch": 0.326120901182244, "grad_norm": 0.20931746065616608, "learning_rate": 1.949140465239967e-05, "loss": 0.525, "step": 1462 }, { "epoch": 0.3263439660941334, "grad_norm": 0.3969743549823761, "learning_rate": 1.9490663405440765e-05, "loss": 0.5193, "step": 1463 }, { "epoch": 0.32656703100602275, "grad_norm": 0.17721976339817047, "learning_rate": 1.9489921632829227e-05, "loss": 0.5115, "step": 1464 }, { "epoch": 0.32679009591791214, "grad_norm": 0.16409623622894287, "learning_rate": 1.948917933460613e-05, "loss": 0.5123, "step": 1465 }, { "epoch": 0.3270131608298015, "grad_norm": 0.16853661835193634, "learning_rate": 1.9488436510812594e-05, "loss": 0.5269, "step": 1466 }, { "epoch": 0.3272362257416908, "grad_norm": 0.1875690519809723, "learning_rate": 1.948769316148976e-05, "loss": 0.4727, "step": 1467 }, { "epoch": 0.3274592906535802, "grad_norm": 0.1777825504541397, "learning_rate": 1.9486949286678798e-05, "loss": 0.5095, "step": 1468 }, { "epoch": 0.32768235556546954, "grad_norm": 0.17113567888736725, "learning_rate": 1.948620488642091e-05, "loss": 0.5244, "step": 1469 }, { "epoch": 0.32790542047735893, "grad_norm": 0.16337715089321136, "learning_rate": 1.9485459960757325e-05, "loss": 0.5194, "step": 1470 }, { "epoch": 0.32812848538924827, "grad_norm": 0.1793193519115448, "learning_rate": 1.9484714509729305e-05, "loss": 0.527, "step": 1471 }, { "epoch": 0.3283515503011376, "grad_norm": 0.16597363352775574, "learning_rate": 1.948396853337813e-05, "loss": 0.493, "step": 1472 }, { "epoch": 0.328574615213027, "grad_norm": 0.16836529970169067, "learning_rate": 1.9483222031745118e-05, "loss": 0.4932, "step": 1473 }, { "epoch": 0.32879768012491634, "grad_norm": 0.17120935022830963, "learning_rate": 1.9482475004871622e-05, "loss": 0.5156, "step": 1474 }, { "epoch": 0.32902074503680573, "grad_norm": 0.1692550629377365, "learning_rate": 1.9481727452799013e-05, "loss": 0.5089, "step": 1475 }, { "epoch": 0.32924380994869507, "grad_norm": 0.16897635161876678, "learning_rate": 1.9480979375568694e-05, "loss": 0.5042, "step": 1476 }, { "epoch": 0.32946687486058446, "grad_norm": 0.16120664775371552, "learning_rate": 1.9480230773222102e-05, "loss": 0.5105, "step": 1477 }, { "epoch": 0.3296899397724738, "grad_norm": 0.17156098783016205, "learning_rate": 1.9479481645800694e-05, "loss": 0.5217, "step": 1478 }, { "epoch": 0.32991300468436313, "grad_norm": 0.17012807726860046, "learning_rate": 1.9478731993345965e-05, "loss": 0.4947, "step": 1479 }, { "epoch": 0.3301360695962525, "grad_norm": 0.16123747825622559, "learning_rate": 1.9477981815899435e-05, "loss": 0.5181, "step": 1480 }, { "epoch": 0.33035913450814186, "grad_norm": 0.16029803454875946, "learning_rate": 1.947723111350265e-05, "loss": 0.5089, "step": 1481 }, { "epoch": 0.33058219942003125, "grad_norm": 0.17824719846248627, "learning_rate": 1.9476479886197198e-05, "loss": 0.5245, "step": 1482 }, { "epoch": 0.3308052643319206, "grad_norm": 0.1661938726902008, "learning_rate": 1.9475728134024675e-05, "loss": 0.4971, "step": 1483 }, { "epoch": 0.3310283292438099, "grad_norm": 0.16781049966812134, "learning_rate": 1.9474975857026727e-05, "loss": 0.5187, "step": 1484 }, { "epoch": 0.3312513941556993, "grad_norm": 0.20398850739002228, "learning_rate": 1.9474223055245014e-05, "loss": 0.5087, "step": 1485 }, { "epoch": 0.33147445906758866, "grad_norm": 0.19590437412261963, "learning_rate": 1.9473469728721233e-05, "loss": 0.5193, "step": 1486 }, { "epoch": 0.33169752397947805, "grad_norm": 0.16896989941596985, "learning_rate": 1.947271587749711e-05, "loss": 0.5272, "step": 1487 }, { "epoch": 0.3319205888913674, "grad_norm": 0.16621822118759155, "learning_rate": 1.9471961501614395e-05, "loss": 0.5356, "step": 1488 }, { "epoch": 0.3321436538032567, "grad_norm": 0.16570885479450226, "learning_rate": 1.947120660111487e-05, "loss": 0.5105, "step": 1489 }, { "epoch": 0.3323667187151461, "grad_norm": 0.1804710477590561, "learning_rate": 1.9470451176040343e-05, "loss": 0.5037, "step": 1490 }, { "epoch": 0.33258978362703545, "grad_norm": 0.16383785009384155, "learning_rate": 1.9469695226432667e-05, "loss": 0.4991, "step": 1491 }, { "epoch": 0.33281284853892484, "grad_norm": 0.16517230868339539, "learning_rate": 1.9468938752333698e-05, "loss": 0.4858, "step": 1492 }, { "epoch": 0.3330359134508142, "grad_norm": 0.1879924088716507, "learning_rate": 1.946818175378534e-05, "loss": 0.5003, "step": 1493 }, { "epoch": 0.3332589783627036, "grad_norm": 0.16963818669319153, "learning_rate": 1.9467424230829514e-05, "loss": 0.5081, "step": 1494 }, { "epoch": 0.3334820432745929, "grad_norm": 0.1686849743127823, "learning_rate": 1.946666618350819e-05, "loss": 0.5384, "step": 1495 }, { "epoch": 0.33370510818648225, "grad_norm": 0.18511471152305603, "learning_rate": 1.946590761186334e-05, "loss": 0.5387, "step": 1496 }, { "epoch": 0.33392817309837164, "grad_norm": 0.15684032440185547, "learning_rate": 1.9465148515936986e-05, "loss": 0.5051, "step": 1497 }, { "epoch": 0.334151238010261, "grad_norm": 0.18369467556476593, "learning_rate": 1.9464388895771165e-05, "loss": 0.4973, "step": 1498 }, { "epoch": 0.33437430292215037, "grad_norm": 0.17524507641792297, "learning_rate": 1.9463628751407957e-05, "loss": 0.5204, "step": 1499 }, { "epoch": 0.3345973678340397, "grad_norm": 0.15974605083465576, "learning_rate": 1.946286808288946e-05, "loss": 0.5128, "step": 1500 }, { "epoch": 0.33482043274592904, "grad_norm": 0.1720893234014511, "learning_rate": 1.9462106890257805e-05, "loss": 0.531, "step": 1501 }, { "epoch": 0.33504349765781843, "grad_norm": 0.1723415106534958, "learning_rate": 1.946134517355515e-05, "loss": 0.5344, "step": 1502 }, { "epoch": 0.33526656256970777, "grad_norm": 0.1691267043352127, "learning_rate": 1.9460582932823685e-05, "loss": 0.4693, "step": 1503 }, { "epoch": 0.33548962748159716, "grad_norm": 0.18036657571792603, "learning_rate": 1.945982016810563e-05, "loss": 0.5175, "step": 1504 }, { "epoch": 0.3357126923934865, "grad_norm": 0.16378861665725708, "learning_rate": 1.9459056879443227e-05, "loss": 0.4967, "step": 1505 }, { "epoch": 0.33593575730537584, "grad_norm": 0.16954581439495087, "learning_rate": 1.9458293066878754e-05, "loss": 0.5024, "step": 1506 }, { "epoch": 0.33615882221726523, "grad_norm": 0.16542184352874756, "learning_rate": 1.9457528730454516e-05, "loss": 0.525, "step": 1507 }, { "epoch": 0.33638188712915457, "grad_norm": 0.16997484862804413, "learning_rate": 1.9456763870212853e-05, "loss": 0.5177, "step": 1508 }, { "epoch": 0.33660495204104396, "grad_norm": 0.1820513904094696, "learning_rate": 1.945599848619611e-05, "loss": 0.5173, "step": 1509 }, { "epoch": 0.3368280169529333, "grad_norm": 0.1801905781030655, "learning_rate": 1.94552325784467e-05, "loss": 0.4951, "step": 1510 }, { "epoch": 0.3370510818648227, "grad_norm": 0.16981808841228485, "learning_rate": 1.9454466147007032e-05, "loss": 0.5359, "step": 1511 }, { "epoch": 0.337274146776712, "grad_norm": 0.16826239228248596, "learning_rate": 1.9453699191919557e-05, "loss": 0.4852, "step": 1512 }, { "epoch": 0.33749721168860136, "grad_norm": 0.16905133426189423, "learning_rate": 1.9452931713226752e-05, "loss": 0.502, "step": 1513 }, { "epoch": 0.33772027660049075, "grad_norm": 0.3108493685722351, "learning_rate": 1.945216371097113e-05, "loss": 0.5025, "step": 1514 }, { "epoch": 0.3379433415123801, "grad_norm": 0.16754211485385895, "learning_rate": 1.9451395185195224e-05, "loss": 0.4913, "step": 1515 }, { "epoch": 0.3381664064242695, "grad_norm": 0.16909672319889069, "learning_rate": 1.9450626135941603e-05, "loss": 0.5489, "step": 1516 }, { "epoch": 0.3383894713361588, "grad_norm": 0.17022110521793365, "learning_rate": 1.944985656325286e-05, "loss": 0.5266, "step": 1517 }, { "epoch": 0.33861253624804816, "grad_norm": 0.16641078889369965, "learning_rate": 1.9449086467171615e-05, "loss": 0.4913, "step": 1518 }, { "epoch": 0.33883560115993755, "grad_norm": 0.16541585326194763, "learning_rate": 1.9448315847740527e-05, "loss": 0.5116, "step": 1519 }, { "epoch": 0.3390586660718269, "grad_norm": 0.175320103764534, "learning_rate": 1.9447544705002273e-05, "loss": 0.506, "step": 1520 }, { "epoch": 0.3392817309837163, "grad_norm": 0.18363285064697266, "learning_rate": 1.9446773038999566e-05, "loss": 0.5119, "step": 1521 }, { "epoch": 0.3395047958956056, "grad_norm": 0.17000706493854523, "learning_rate": 1.944600084977515e-05, "loss": 0.5286, "step": 1522 }, { "epoch": 0.339727860807495, "grad_norm": 0.20967601239681244, "learning_rate": 1.9445228137371784e-05, "loss": 0.5366, "step": 1523 }, { "epoch": 0.33995092571938434, "grad_norm": 0.1751982420682907, "learning_rate": 1.9444454901832273e-05, "loss": 0.5298, "step": 1524 }, { "epoch": 0.3401739906312737, "grad_norm": 0.16722087562084198, "learning_rate": 1.944368114319944e-05, "loss": 0.5248, "step": 1525 }, { "epoch": 0.3403970555431631, "grad_norm": 0.16639317572116852, "learning_rate": 1.9442906861516143e-05, "loss": 0.4947, "step": 1526 }, { "epoch": 0.3406201204550524, "grad_norm": 0.18173882365226746, "learning_rate": 1.9442132056825268e-05, "loss": 0.5198, "step": 1527 }, { "epoch": 0.3408431853669418, "grad_norm": 0.16811014711856842, "learning_rate": 1.9441356729169725e-05, "loss": 0.5193, "step": 1528 }, { "epoch": 0.34106625027883114, "grad_norm": 0.18385176360607147, "learning_rate": 1.944058087859246e-05, "loss": 0.5419, "step": 1529 }, { "epoch": 0.3412893151907205, "grad_norm": 0.18335555493831635, "learning_rate": 1.9439804505136437e-05, "loss": 0.5442, "step": 1530 }, { "epoch": 0.34151238010260987, "grad_norm": 0.16866059601306915, "learning_rate": 1.9439027608844665e-05, "loss": 0.4893, "step": 1531 }, { "epoch": 0.3417354450144992, "grad_norm": 0.16770686209201813, "learning_rate": 1.9438250189760168e-05, "loss": 0.5276, "step": 1532 }, { "epoch": 0.3419585099263886, "grad_norm": 0.17907945811748505, "learning_rate": 1.943747224792601e-05, "loss": 0.5221, "step": 1533 }, { "epoch": 0.34218157483827794, "grad_norm": 0.16425661742687225, "learning_rate": 1.9436693783385273e-05, "loss": 0.527, "step": 1534 }, { "epoch": 0.3424046397501673, "grad_norm": 0.20484143495559692, "learning_rate": 1.9435914796181077e-05, "loss": 0.5294, "step": 1535 }, { "epoch": 0.34262770466205666, "grad_norm": 0.171040877699852, "learning_rate": 1.9435135286356563e-05, "loss": 0.5317, "step": 1536 }, { "epoch": 0.342850769573946, "grad_norm": 0.1566307246685028, "learning_rate": 1.943435525395491e-05, "loss": 0.4908, "step": 1537 }, { "epoch": 0.3430738344858354, "grad_norm": 0.15999655425548553, "learning_rate": 1.9433574699019315e-05, "loss": 0.5211, "step": 1538 }, { "epoch": 0.34329689939772473, "grad_norm": 0.16879580914974213, "learning_rate": 1.9432793621593013e-05, "loss": 0.5473, "step": 1539 }, { "epoch": 0.3435199643096141, "grad_norm": 0.16014404594898224, "learning_rate": 1.943201202171927e-05, "loss": 0.5369, "step": 1540 }, { "epoch": 0.34374302922150346, "grad_norm": 0.1585661619901657, "learning_rate": 1.943122989944137e-05, "loss": 0.5203, "step": 1541 }, { "epoch": 0.3439660941333928, "grad_norm": 0.16618762910366058, "learning_rate": 1.943044725480263e-05, "loss": 0.5163, "step": 1542 }, { "epoch": 0.3441891590452822, "grad_norm": 0.1790398806333542, "learning_rate": 1.9429664087846407e-05, "loss": 0.492, "step": 1543 }, { "epoch": 0.3444122239571715, "grad_norm": 0.17689375579357147, "learning_rate": 1.9428880398616065e-05, "loss": 0.5009, "step": 1544 }, { "epoch": 0.3446352888690609, "grad_norm": 0.16719898581504822, "learning_rate": 1.942809618715502e-05, "loss": 0.5064, "step": 1545 }, { "epoch": 0.34485835378095026, "grad_norm": 0.1700020283460617, "learning_rate": 1.9427311453506705e-05, "loss": 0.5229, "step": 1546 }, { "epoch": 0.3450814186928396, "grad_norm": 0.17998361587524414, "learning_rate": 1.9426526197714582e-05, "loss": 0.5188, "step": 1547 }, { "epoch": 0.345304483604729, "grad_norm": 0.17831408977508545, "learning_rate": 1.9425740419822138e-05, "loss": 0.4967, "step": 1548 }, { "epoch": 0.3455275485166183, "grad_norm": 0.16397793591022491, "learning_rate": 1.9424954119872904e-05, "loss": 0.5319, "step": 1549 }, { "epoch": 0.3457506134285077, "grad_norm": 0.17712676525115967, "learning_rate": 1.9424167297910425e-05, "loss": 0.5104, "step": 1550 }, { "epoch": 0.34597367834039705, "grad_norm": 0.19361375272274017, "learning_rate": 1.9423379953978277e-05, "loss": 0.5113, "step": 1551 }, { "epoch": 0.34619674325228644, "grad_norm": 0.1744556725025177, "learning_rate": 1.9422592088120074e-05, "loss": 0.5331, "step": 1552 }, { "epoch": 0.3464198081641758, "grad_norm": 0.1747075766324997, "learning_rate": 1.9421803700379454e-05, "loss": 0.5257, "step": 1553 }, { "epoch": 0.3466428730760651, "grad_norm": 0.17446283996105194, "learning_rate": 1.9421014790800074e-05, "loss": 0.5133, "step": 1554 }, { "epoch": 0.3468659379879545, "grad_norm": 0.1626337766647339, "learning_rate": 1.9420225359425637e-05, "loss": 0.4812, "step": 1555 }, { "epoch": 0.34708900289984385, "grad_norm": 0.16959097981452942, "learning_rate": 1.9419435406299863e-05, "loss": 0.4954, "step": 1556 }, { "epoch": 0.34731206781173324, "grad_norm": 0.17018267512321472, "learning_rate": 1.9418644931466507e-05, "loss": 0.4785, "step": 1557 }, { "epoch": 0.3475351327236226, "grad_norm": 0.18296095728874207, "learning_rate": 1.9417853934969347e-05, "loss": 0.5244, "step": 1558 }, { "epoch": 0.3477581976355119, "grad_norm": 0.1640445590019226, "learning_rate": 1.9417062416852198e-05, "loss": 0.5273, "step": 1559 }, { "epoch": 0.3479812625474013, "grad_norm": 0.1624521166086197, "learning_rate": 1.9416270377158896e-05, "loss": 0.4939, "step": 1560 }, { "epoch": 0.34820432745929064, "grad_norm": 0.16685132682323456, "learning_rate": 1.941547781593331e-05, "loss": 0.5118, "step": 1561 }, { "epoch": 0.34842739237118003, "grad_norm": 0.1717056930065155, "learning_rate": 1.9414684733219334e-05, "loss": 0.5156, "step": 1562 }, { "epoch": 0.34865045728306937, "grad_norm": 0.18991652131080627, "learning_rate": 1.94138911290609e-05, "loss": 0.5346, "step": 1563 }, { "epoch": 0.3488735221949587, "grad_norm": 0.16688503324985504, "learning_rate": 1.941309700350196e-05, "loss": 0.5333, "step": 1564 }, { "epoch": 0.3490965871068481, "grad_norm": 0.17251865565776825, "learning_rate": 1.9412302356586494e-05, "loss": 0.5113, "step": 1565 }, { "epoch": 0.34931965201873744, "grad_norm": 0.15765444934368134, "learning_rate": 1.941150718835852e-05, "loss": 0.5305, "step": 1566 }, { "epoch": 0.34954271693062683, "grad_norm": 0.18435880541801453, "learning_rate": 1.9410711498862077e-05, "loss": 0.5087, "step": 1567 }, { "epoch": 0.34976578184251617, "grad_norm": 0.16191725432872772, "learning_rate": 1.9409915288141235e-05, "loss": 0.5001, "step": 1568 }, { "epoch": 0.34998884675440556, "grad_norm": 0.19742515683174133, "learning_rate": 1.9409118556240095e-05, "loss": 0.5232, "step": 1569 }, { "epoch": 0.3502119116662949, "grad_norm": 0.16742932796478271, "learning_rate": 1.940832130320278e-05, "loss": 0.5227, "step": 1570 }, { "epoch": 0.35043497657818423, "grad_norm": 0.17970915138721466, "learning_rate": 1.9407523529073455e-05, "loss": 0.5228, "step": 1571 }, { "epoch": 0.3506580414900736, "grad_norm": 0.16648298501968384, "learning_rate": 1.9406725233896297e-05, "loss": 0.4987, "step": 1572 }, { "epoch": 0.35088110640196296, "grad_norm": 0.1718263030052185, "learning_rate": 1.940592641771553e-05, "loss": 0.5256, "step": 1573 }, { "epoch": 0.35110417131385235, "grad_norm": 0.17099805176258087, "learning_rate": 1.9405127080575387e-05, "loss": 0.5371, "step": 1574 }, { "epoch": 0.3513272362257417, "grad_norm": 0.16639132797718048, "learning_rate": 1.9404327222520147e-05, "loss": 0.5055, "step": 1575 }, { "epoch": 0.351550301137631, "grad_norm": 0.15915724635124207, "learning_rate": 1.9403526843594115e-05, "loss": 0.5123, "step": 1576 }, { "epoch": 0.3517733660495204, "grad_norm": 0.190689817070961, "learning_rate": 1.9402725943841608e-05, "loss": 0.5149, "step": 1577 }, { "epoch": 0.35199643096140976, "grad_norm": 0.1728559285402298, "learning_rate": 1.9401924523306998e-05, "loss": 0.518, "step": 1578 }, { "epoch": 0.35221949587329915, "grad_norm": 0.15996624529361725, "learning_rate": 1.9401122582034664e-05, "loss": 0.4949, "step": 1579 }, { "epoch": 0.3524425607851885, "grad_norm": 0.17822036147117615, "learning_rate": 1.940032012006903e-05, "loss": 0.5012, "step": 1580 }, { "epoch": 0.3526656256970778, "grad_norm": 0.18252968788146973, "learning_rate": 1.9399517137454534e-05, "loss": 0.54, "step": 1581 }, { "epoch": 0.3528886906089672, "grad_norm": 0.1667163223028183, "learning_rate": 1.939871363423566e-05, "loss": 0.5176, "step": 1582 }, { "epoch": 0.35311175552085655, "grad_norm": 0.17513629794120789, "learning_rate": 1.9397909610456897e-05, "loss": 0.5182, "step": 1583 }, { "epoch": 0.35333482043274594, "grad_norm": 0.16966688632965088, "learning_rate": 1.939710506616279e-05, "loss": 0.5154, "step": 1584 }, { "epoch": 0.3535578853446353, "grad_norm": 0.17325864732265472, "learning_rate": 1.9396300001397888e-05, "loss": 0.5321, "step": 1585 }, { "epoch": 0.3537809502565247, "grad_norm": 0.17322127521038055, "learning_rate": 1.939549441620679e-05, "loss": 0.5206, "step": 1586 }, { "epoch": 0.354004015168414, "grad_norm": 0.16276715695858002, "learning_rate": 1.9394688310634114e-05, "loss": 0.4874, "step": 1587 }, { "epoch": 0.35422708008030335, "grad_norm": 0.1607387810945511, "learning_rate": 1.93938816847245e-05, "loss": 0.4813, "step": 1588 }, { "epoch": 0.35445014499219274, "grad_norm": 0.16621682047843933, "learning_rate": 1.939307453852263e-05, "loss": 0.533, "step": 1589 }, { "epoch": 0.3546732099040821, "grad_norm": 0.165096253156662, "learning_rate": 1.9392266872073207e-05, "loss": 0.5115, "step": 1590 }, { "epoch": 0.35489627481597147, "grad_norm": 0.17987790703773499, "learning_rate": 1.9391458685420966e-05, "loss": 0.5439, "step": 1591 }, { "epoch": 0.3551193397278608, "grad_norm": 0.1641106754541397, "learning_rate": 1.939064997861067e-05, "loss": 0.462, "step": 1592 }, { "epoch": 0.35534240463975014, "grad_norm": 0.23700258135795593, "learning_rate": 1.9389840751687105e-05, "loss": 0.4945, "step": 1593 }, { "epoch": 0.35556546955163953, "grad_norm": 0.1738569736480713, "learning_rate": 1.9389031004695095e-05, "loss": 0.5247, "step": 1594 }, { "epoch": 0.35578853446352887, "grad_norm": 0.16523152589797974, "learning_rate": 1.9388220737679493e-05, "loss": 0.5231, "step": 1595 }, { "epoch": 0.35601159937541826, "grad_norm": 0.1748376339673996, "learning_rate": 1.9387409950685167e-05, "loss": 0.5048, "step": 1596 }, { "epoch": 0.3562346642873076, "grad_norm": 0.20013664662837982, "learning_rate": 1.938659864375703e-05, "loss": 0.5284, "step": 1597 }, { "epoch": 0.356457729199197, "grad_norm": 0.18472377955913544, "learning_rate": 1.938578681694002e-05, "loss": 0.5063, "step": 1598 }, { "epoch": 0.35668079411108633, "grad_norm": 0.17718833684921265, "learning_rate": 1.9384974470279093e-05, "loss": 0.5156, "step": 1599 }, { "epoch": 0.35690385902297567, "grad_norm": 0.16550204157829285, "learning_rate": 1.938416160381925e-05, "loss": 0.487, "step": 1600 }, { "epoch": 0.35712692393486506, "grad_norm": 0.17269675433635712, "learning_rate": 1.93833482176055e-05, "loss": 0.4989, "step": 1601 }, { "epoch": 0.3573499888467544, "grad_norm": 0.17201650142669678, "learning_rate": 1.938253431168291e-05, "loss": 0.5075, "step": 1602 }, { "epoch": 0.3575730537586438, "grad_norm": 0.17147138714790344, "learning_rate": 1.938171988609655e-05, "loss": 0.4967, "step": 1603 }, { "epoch": 0.3577961186705331, "grad_norm": 0.20010094344615936, "learning_rate": 1.938090494089153e-05, "loss": 0.5335, "step": 1604 }, { "epoch": 0.35801918358242246, "grad_norm": 0.19643662869930267, "learning_rate": 1.9380089476112985e-05, "loss": 0.586, "step": 1605 }, { "epoch": 0.35824224849431185, "grad_norm": 0.16737660765647888, "learning_rate": 1.937927349180608e-05, "loss": 0.5203, "step": 1606 }, { "epoch": 0.3584653134062012, "grad_norm": 0.1626511663198471, "learning_rate": 1.9378456988016015e-05, "loss": 0.5057, "step": 1607 }, { "epoch": 0.3586883783180906, "grad_norm": 0.18112361431121826, "learning_rate": 1.9377639964788005e-05, "loss": 0.5709, "step": 1608 }, { "epoch": 0.3589114432299799, "grad_norm": 0.1701660305261612, "learning_rate": 1.937682242216731e-05, "loss": 0.5427, "step": 1609 }, { "epoch": 0.35913450814186926, "grad_norm": 0.16982769966125488, "learning_rate": 1.9376004360199202e-05, "loss": 0.5414, "step": 1610 }, { "epoch": 0.35935757305375865, "grad_norm": 0.1677013635635376, "learning_rate": 1.9375185778928997e-05, "loss": 0.5189, "step": 1611 }, { "epoch": 0.359580637965648, "grad_norm": 0.16802898049354553, "learning_rate": 1.9374366678402032e-05, "loss": 0.5359, "step": 1612 }, { "epoch": 0.3598037028775374, "grad_norm": 0.1629040390253067, "learning_rate": 1.9373547058663674e-05, "loss": 0.5239, "step": 1613 }, { "epoch": 0.3600267677894267, "grad_norm": 0.15942612290382385, "learning_rate": 1.9372726919759318e-05, "loss": 0.4927, "step": 1614 }, { "epoch": 0.3602498327013161, "grad_norm": 0.16228771209716797, "learning_rate": 1.9371906261734387e-05, "loss": 0.4795, "step": 1615 }, { "epoch": 0.36047289761320545, "grad_norm": 0.18663759529590607, "learning_rate": 1.9371085084634337e-05, "loss": 0.4523, "step": 1616 }, { "epoch": 0.3606959625250948, "grad_norm": 0.1616244614124298, "learning_rate": 1.9370263388504647e-05, "loss": 0.5221, "step": 1617 }, { "epoch": 0.3609190274369842, "grad_norm": 0.16261187195777893, "learning_rate": 1.936944117339083e-05, "loss": 0.507, "step": 1618 }, { "epoch": 0.3611420923488735, "grad_norm": 0.16541439294815063, "learning_rate": 1.9368618439338424e-05, "loss": 0.5318, "step": 1619 }, { "epoch": 0.3613651572607629, "grad_norm": 0.17135834693908691, "learning_rate": 1.9367795186392996e-05, "loss": 0.5071, "step": 1620 }, { "epoch": 0.36158822217265224, "grad_norm": 0.1599961221218109, "learning_rate": 1.936697141460015e-05, "loss": 0.5031, "step": 1621 }, { "epoch": 0.3618112870845416, "grad_norm": 0.17239384353160858, "learning_rate": 1.9366147124005504e-05, "loss": 0.5138, "step": 1622 }, { "epoch": 0.36203435199643097, "grad_norm": 0.1636582612991333, "learning_rate": 1.9365322314654714e-05, "loss": 0.5197, "step": 1623 }, { "epoch": 0.3622574169083203, "grad_norm": 0.17068350315093994, "learning_rate": 1.9364496986593463e-05, "loss": 0.4886, "step": 1624 }, { "epoch": 0.3624804818202097, "grad_norm": 0.17033697664737701, "learning_rate": 1.9363671139867467e-05, "loss": 0.5232, "step": 1625 }, { "epoch": 0.36270354673209904, "grad_norm": 0.16417935490608215, "learning_rate": 1.936284477452246e-05, "loss": 0.5384, "step": 1626 }, { "epoch": 0.36292661164398843, "grad_norm": 0.1705051213502884, "learning_rate": 1.9362017890604215e-05, "loss": 0.517, "step": 1627 }, { "epoch": 0.36314967655587777, "grad_norm": 0.17601191997528076, "learning_rate": 1.9361190488158535e-05, "loss": 0.5141, "step": 1628 }, { "epoch": 0.3633727414677671, "grad_norm": 0.173202782869339, "learning_rate": 1.936036256723124e-05, "loss": 0.5133, "step": 1629 }, { "epoch": 0.3635958063796565, "grad_norm": 0.1655133217573166, "learning_rate": 1.935953412786818e-05, "loss": 0.5278, "step": 1630 }, { "epoch": 0.36381887129154583, "grad_norm": 0.1649170219898224, "learning_rate": 1.9358705170115253e-05, "loss": 0.5101, "step": 1631 }, { "epoch": 0.3640419362034352, "grad_norm": 0.4769546389579773, "learning_rate": 1.9357875694018364e-05, "loss": 0.5325, "step": 1632 }, { "epoch": 0.36426500111532456, "grad_norm": 0.16675199568271637, "learning_rate": 1.9357045699623452e-05, "loss": 0.5373, "step": 1633 }, { "epoch": 0.3644880660272139, "grad_norm": 0.18537083268165588, "learning_rate": 1.9356215186976496e-05, "loss": 0.5077, "step": 1634 }, { "epoch": 0.3647111309391033, "grad_norm": 0.1646578013896942, "learning_rate": 1.935538415612349e-05, "loss": 0.5163, "step": 1635 }, { "epoch": 0.3649341958509926, "grad_norm": 0.17558851838111877, "learning_rate": 1.935455260711046e-05, "loss": 0.5269, "step": 1636 }, { "epoch": 0.365157260762882, "grad_norm": 0.1750616431236267, "learning_rate": 1.9353720539983462e-05, "loss": 0.5185, "step": 1637 }, { "epoch": 0.36538032567477136, "grad_norm": 0.1686229407787323, "learning_rate": 1.9352887954788583e-05, "loss": 0.5033, "step": 1638 }, { "epoch": 0.3656033905866607, "grad_norm": 0.17237605154514313, "learning_rate": 1.935205485157194e-05, "loss": 0.5071, "step": 1639 }, { "epoch": 0.3658264554985501, "grad_norm": 0.17209577560424805, "learning_rate": 1.9351221230379673e-05, "loss": 0.5049, "step": 1640 }, { "epoch": 0.3660495204104394, "grad_norm": 0.1662733256816864, "learning_rate": 1.9350387091257952e-05, "loss": 0.5399, "step": 1641 }, { "epoch": 0.3662725853223288, "grad_norm": 0.17135699093341827, "learning_rate": 1.9349552434252976e-05, "loss": 0.5175, "step": 1642 }, { "epoch": 0.36649565023421815, "grad_norm": 0.17738810181617737, "learning_rate": 1.9348717259410975e-05, "loss": 0.5464, "step": 1643 }, { "epoch": 0.36671871514610754, "grad_norm": 0.17101503908634186, "learning_rate": 1.9347881566778208e-05, "loss": 0.5162, "step": 1644 }, { "epoch": 0.3669417800579969, "grad_norm": 0.16841377317905426, "learning_rate": 1.934704535640096e-05, "loss": 0.5244, "step": 1645 }, { "epoch": 0.3671648449698862, "grad_norm": 0.17288081347942352, "learning_rate": 1.9346208628325543e-05, "loss": 0.5487, "step": 1646 }, { "epoch": 0.3673879098817756, "grad_norm": 0.1731618493795395, "learning_rate": 1.93453713825983e-05, "loss": 0.4924, "step": 1647 }, { "epoch": 0.36761097479366495, "grad_norm": 0.18122443556785583, "learning_rate": 1.934453361926561e-05, "loss": 0.5105, "step": 1648 }, { "epoch": 0.36783403970555434, "grad_norm": 0.17434357106685638, "learning_rate": 1.9343695338373866e-05, "loss": 0.5185, "step": 1649 }, { "epoch": 0.3680571046174437, "grad_norm": 0.19195139408111572, "learning_rate": 1.93428565399695e-05, "loss": 0.5074, "step": 1650 }, { "epoch": 0.368280169529333, "grad_norm": 0.17529140412807465, "learning_rate": 1.9342017224098974e-05, "loss": 0.5355, "step": 1651 }, { "epoch": 0.3685032344412224, "grad_norm": 0.16739797592163086, "learning_rate": 1.9341177390808768e-05, "loss": 0.5128, "step": 1652 }, { "epoch": 0.36872629935311174, "grad_norm": 0.17157401144504547, "learning_rate": 1.9340337040145397e-05, "loss": 0.5117, "step": 1653 }, { "epoch": 0.36894936426500113, "grad_norm": 0.166087806224823, "learning_rate": 1.933949617215541e-05, "loss": 0.5182, "step": 1654 }, { "epoch": 0.36917242917689047, "grad_norm": 0.17426824569702148, "learning_rate": 1.9338654786885377e-05, "loss": 0.5141, "step": 1655 }, { "epoch": 0.3693954940887798, "grad_norm": 0.18022432923316956, "learning_rate": 1.93378128843819e-05, "loss": 0.5097, "step": 1656 }, { "epoch": 0.3696185590006692, "grad_norm": 0.1715417206287384, "learning_rate": 1.933697046469161e-05, "loss": 0.525, "step": 1657 }, { "epoch": 0.36984162391255854, "grad_norm": 0.17939765751361847, "learning_rate": 1.9336127527861158e-05, "loss": 0.5166, "step": 1658 }, { "epoch": 0.37006468882444793, "grad_norm": 0.1856374740600586, "learning_rate": 1.9335284073937242e-05, "loss": 0.5078, "step": 1659 }, { "epoch": 0.37028775373633727, "grad_norm": 0.16677772998809814, "learning_rate": 1.9334440102966567e-05, "loss": 0.5427, "step": 1660 }, { "epoch": 0.37051081864822666, "grad_norm": 0.17052651941776276, "learning_rate": 1.933359561499589e-05, "loss": 0.4976, "step": 1661 }, { "epoch": 0.370733883560116, "grad_norm": 0.180454283952713, "learning_rate": 1.9332750610071972e-05, "loss": 0.4807, "step": 1662 }, { "epoch": 0.37095694847200533, "grad_norm": 0.16270607709884644, "learning_rate": 1.9331905088241623e-05, "loss": 0.5079, "step": 1663 }, { "epoch": 0.3711800133838947, "grad_norm": 0.1687788963317871, "learning_rate": 1.9331059049551668e-05, "loss": 0.5208, "step": 1664 }, { "epoch": 0.37140307829578406, "grad_norm": 0.16753818094730377, "learning_rate": 1.933021249404897e-05, "loss": 0.4991, "step": 1665 }, { "epoch": 0.37162614320767345, "grad_norm": 0.1763564944267273, "learning_rate": 1.9329365421780414e-05, "loss": 0.5321, "step": 1666 }, { "epoch": 0.3718492081195628, "grad_norm": 0.1733461618423462, "learning_rate": 1.932851783279292e-05, "loss": 0.5331, "step": 1667 }, { "epoch": 0.3720722730314521, "grad_norm": 0.1711835116147995, "learning_rate": 1.9327669727133424e-05, "loss": 0.4764, "step": 1668 }, { "epoch": 0.3722953379433415, "grad_norm": 0.190887913107872, "learning_rate": 1.932682110484891e-05, "loss": 0.5254, "step": 1669 }, { "epoch": 0.37251840285523086, "grad_norm": 0.16770336031913757, "learning_rate": 1.9325971965986373e-05, "loss": 0.4989, "step": 1670 }, { "epoch": 0.37274146776712025, "grad_norm": 0.2075570970773697, "learning_rate": 1.9325122310592846e-05, "loss": 0.515, "step": 1671 }, { "epoch": 0.3729645326790096, "grad_norm": 0.17692680656909943, "learning_rate": 1.9324272138715388e-05, "loss": 0.5442, "step": 1672 }, { "epoch": 0.373187597590899, "grad_norm": 0.17124991118907928, "learning_rate": 1.932342145040109e-05, "loss": 0.5066, "step": 1673 }, { "epoch": 0.3734106625027883, "grad_norm": 0.16621904075145721, "learning_rate": 1.932257024569706e-05, "loss": 0.5066, "step": 1674 }, { "epoch": 0.37363372741467765, "grad_norm": 0.17721469700336456, "learning_rate": 1.932171852465045e-05, "loss": 0.4789, "step": 1675 }, { "epoch": 0.37385679232656704, "grad_norm": 0.16972602903842926, "learning_rate": 1.9320866287308433e-05, "loss": 0.4922, "step": 1676 }, { "epoch": 0.3740798572384564, "grad_norm": 0.17205440998077393, "learning_rate": 1.9320013533718208e-05, "loss": 0.4909, "step": 1677 }, { "epoch": 0.3743029221503458, "grad_norm": 0.16761687397956848, "learning_rate": 1.9319160263927013e-05, "loss": 0.513, "step": 1678 }, { "epoch": 0.3745259870622351, "grad_norm": 0.16278418898582458, "learning_rate": 1.93183064779821e-05, "loss": 0.508, "step": 1679 }, { "epoch": 0.37474905197412445, "grad_norm": 0.17359983921051025, "learning_rate": 1.931745217593076e-05, "loss": 0.5472, "step": 1680 }, { "epoch": 0.37497211688601384, "grad_norm": 0.16624867916107178, "learning_rate": 1.931659735782031e-05, "loss": 0.53, "step": 1681 }, { "epoch": 0.3751951817979032, "grad_norm": 0.16128243505954742, "learning_rate": 1.9315742023698095e-05, "loss": 0.474, "step": 1682 }, { "epoch": 0.37541824670979257, "grad_norm": 0.19452917575836182, "learning_rate": 1.9314886173611487e-05, "loss": 0.5527, "step": 1683 }, { "epoch": 0.3756413116216819, "grad_norm": 0.18000726401805878, "learning_rate": 1.931402980760789e-05, "loss": 0.5185, "step": 1684 }, { "epoch": 0.37586437653357124, "grad_norm": 0.1593390852212906, "learning_rate": 1.9313172925734736e-05, "loss": 0.503, "step": 1685 }, { "epoch": 0.37608744144546064, "grad_norm": 0.1637982428073883, "learning_rate": 1.931231552803948e-05, "loss": 0.5011, "step": 1686 }, { "epoch": 0.37631050635734997, "grad_norm": 0.18159234523773193, "learning_rate": 1.931145761456962e-05, "loss": 0.501, "step": 1687 }, { "epoch": 0.37653357126923936, "grad_norm": 0.17507293820381165, "learning_rate": 1.9310599185372657e-05, "loss": 0.5219, "step": 1688 }, { "epoch": 0.3767566361811287, "grad_norm": 0.16377007961273193, "learning_rate": 1.9309740240496152e-05, "loss": 0.5096, "step": 1689 }, { "epoch": 0.3769797010930181, "grad_norm": 0.171824112534523, "learning_rate": 1.930888077998767e-05, "loss": 0.5366, "step": 1690 }, { "epoch": 0.37720276600490743, "grad_norm": 0.18775586783885956, "learning_rate": 1.9308020803894813e-05, "loss": 0.5018, "step": 1691 }, { "epoch": 0.37742583091679677, "grad_norm": 0.1640700101852417, "learning_rate": 1.9307160312265216e-05, "loss": 0.5159, "step": 1692 }, { "epoch": 0.37764889582868616, "grad_norm": 0.1758948117494583, "learning_rate": 1.9306299305146535e-05, "loss": 0.5171, "step": 1693 }, { "epoch": 0.3778719607405755, "grad_norm": 0.1625639647245407, "learning_rate": 1.9305437782586463e-05, "loss": 0.4989, "step": 1694 }, { "epoch": 0.3780950256524649, "grad_norm": 0.15426993370056152, "learning_rate": 1.9304575744632708e-05, "loss": 0.4781, "step": 1695 }, { "epoch": 0.3783180905643542, "grad_norm": 0.18176917731761932, "learning_rate": 1.9303713191333025e-05, "loss": 0.5307, "step": 1696 }, { "epoch": 0.37854115547624356, "grad_norm": 0.16359803080558777, "learning_rate": 1.930285012273518e-05, "loss": 0.5067, "step": 1697 }, { "epoch": 0.37876422038813296, "grad_norm": 0.16793343424797058, "learning_rate": 1.930198653888698e-05, "loss": 0.5234, "step": 1698 }, { "epoch": 0.3789872853000223, "grad_norm": 0.17032268643379211, "learning_rate": 1.930112243983625e-05, "loss": 0.5521, "step": 1699 }, { "epoch": 0.3792103502119117, "grad_norm": 0.18838275969028473, "learning_rate": 1.930025782563086e-05, "loss": 0.5203, "step": 1700 }, { "epoch": 0.379433415123801, "grad_norm": 0.1813000738620758, "learning_rate": 1.9299392696318683e-05, "loss": 0.5299, "step": 1701 }, { "epoch": 0.3796564800356904, "grad_norm": 0.15768998861312866, "learning_rate": 1.9298527051947645e-05, "loss": 0.4802, "step": 1702 }, { "epoch": 0.37987954494757975, "grad_norm": 0.15893371403217316, "learning_rate": 1.9297660892565692e-05, "loss": 0.4939, "step": 1703 }, { "epoch": 0.3801026098594691, "grad_norm": 0.16448427736759186, "learning_rate": 1.929679421822079e-05, "loss": 0.503, "step": 1704 }, { "epoch": 0.3803256747713585, "grad_norm": 0.1799025982618332, "learning_rate": 1.9295927028960947e-05, "loss": 0.5146, "step": 1705 }, { "epoch": 0.3805487396832478, "grad_norm": 0.17270612716674805, "learning_rate": 1.9295059324834193e-05, "loss": 0.5472, "step": 1706 }, { "epoch": 0.3807718045951372, "grad_norm": 0.1640714704990387, "learning_rate": 1.9294191105888586e-05, "loss": 0.4847, "step": 1707 }, { "epoch": 0.38099486950702655, "grad_norm": 0.16522546112537384, "learning_rate": 1.9293322372172207e-05, "loss": 0.4752, "step": 1708 }, { "epoch": 0.3812179344189159, "grad_norm": 0.17741802334785461, "learning_rate": 1.9292453123733184e-05, "loss": 0.5246, "step": 1709 }, { "epoch": 0.3814409993308053, "grad_norm": 0.3004036545753479, "learning_rate": 1.9291583360619653e-05, "loss": 0.5032, "step": 1710 }, { "epoch": 0.3816640642426946, "grad_norm": 0.17262016236782074, "learning_rate": 1.9290713082879786e-05, "loss": 0.5208, "step": 1711 }, { "epoch": 0.381887129154584, "grad_norm": 0.1745826005935669, "learning_rate": 1.928984229056179e-05, "loss": 0.5245, "step": 1712 }, { "epoch": 0.38211019406647334, "grad_norm": 0.19315177202224731, "learning_rate": 1.9288970983713893e-05, "loss": 0.5029, "step": 1713 }, { "epoch": 0.3823332589783627, "grad_norm": 0.17079593241214752, "learning_rate": 1.9288099162384354e-05, "loss": 0.5204, "step": 1714 }, { "epoch": 0.38255632389025207, "grad_norm": 0.17363616824150085, "learning_rate": 1.9287226826621457e-05, "loss": 0.5195, "step": 1715 }, { "epoch": 0.3827793888021414, "grad_norm": 0.1668912172317505, "learning_rate": 1.928635397647352e-05, "loss": 0.5259, "step": 1716 }, { "epoch": 0.3830024537140308, "grad_norm": 0.162687286734581, "learning_rate": 1.9285480611988886e-05, "loss": 0.5135, "step": 1717 }, { "epoch": 0.38322551862592014, "grad_norm": 0.1736724078655243, "learning_rate": 1.9284606733215925e-05, "loss": 0.4855, "step": 1718 }, { "epoch": 0.38344858353780953, "grad_norm": 0.16885711252689362, "learning_rate": 1.9283732340203045e-05, "loss": 0.4934, "step": 1719 }, { "epoch": 0.38367164844969887, "grad_norm": 0.17074720561504364, "learning_rate": 1.928285743299867e-05, "loss": 0.5167, "step": 1720 }, { "epoch": 0.3838947133615882, "grad_norm": 0.17049984633922577, "learning_rate": 1.9281982011651257e-05, "loss": 0.498, "step": 1721 }, { "epoch": 0.3841177782734776, "grad_norm": 0.21770413219928741, "learning_rate": 1.9281106076209296e-05, "loss": 0.4869, "step": 1722 }, { "epoch": 0.38434084318536693, "grad_norm": 0.17190971970558167, "learning_rate": 1.9280229626721302e-05, "loss": 0.5476, "step": 1723 }, { "epoch": 0.3845639080972563, "grad_norm": 0.20358806848526, "learning_rate": 1.9279352663235813e-05, "loss": 0.5074, "step": 1724 }, { "epoch": 0.38478697300914566, "grad_norm": 0.19548457860946655, "learning_rate": 1.9278475185801404e-05, "loss": 0.5246, "step": 1725 }, { "epoch": 0.385010037921035, "grad_norm": 0.18699929118156433, "learning_rate": 1.9277597194466674e-05, "loss": 0.5267, "step": 1726 }, { "epoch": 0.3852331028329244, "grad_norm": 0.16484257578849792, "learning_rate": 1.9276718689280258e-05, "loss": 0.5108, "step": 1727 }, { "epoch": 0.3854561677448137, "grad_norm": 0.16136226058006287, "learning_rate": 1.9275839670290804e-05, "loss": 0.4973, "step": 1728 }, { "epoch": 0.3856792326567031, "grad_norm": 0.16811032593250275, "learning_rate": 1.9274960137547002e-05, "loss": 0.4971, "step": 1729 }, { "epoch": 0.38590229756859246, "grad_norm": 0.17258323729038239, "learning_rate": 1.9274080091097568e-05, "loss": 0.5513, "step": 1730 }, { "epoch": 0.3861253624804818, "grad_norm": 0.16782884299755096, "learning_rate": 1.927319953099124e-05, "loss": 0.5032, "step": 1731 }, { "epoch": 0.3863484273923712, "grad_norm": 0.1599043756723404, "learning_rate": 1.9272318457276792e-05, "loss": 0.5117, "step": 1732 }, { "epoch": 0.3865714923042605, "grad_norm": 0.18118049204349518, "learning_rate": 1.9271436870003022e-05, "loss": 0.5476, "step": 1733 }, { "epoch": 0.3867945572161499, "grad_norm": 0.16125431656837463, "learning_rate": 1.927055476921876e-05, "loss": 0.496, "step": 1734 }, { "epoch": 0.38701762212803925, "grad_norm": 0.17001016438007355, "learning_rate": 1.9269672154972863e-05, "loss": 0.5274, "step": 1735 }, { "epoch": 0.38724068703992864, "grad_norm": 0.18982788920402527, "learning_rate": 1.9268789027314208e-05, "loss": 0.5423, "step": 1736 }, { "epoch": 0.387463751951818, "grad_norm": 0.20157238841056824, "learning_rate": 1.9267905386291716e-05, "loss": 0.5204, "step": 1737 }, { "epoch": 0.3876868168637073, "grad_norm": 0.16580817103385925, "learning_rate": 1.926702123195433e-05, "loss": 0.523, "step": 1738 }, { "epoch": 0.3879098817755967, "grad_norm": 0.1722583770751953, "learning_rate": 1.926613656435101e-05, "loss": 0.4689, "step": 1739 }, { "epoch": 0.38813294668748605, "grad_norm": 0.16248932480812073, "learning_rate": 1.9265251383530765e-05, "loss": 0.4838, "step": 1740 }, { "epoch": 0.38835601159937544, "grad_norm": 0.16789276897907257, "learning_rate": 1.9264365689542616e-05, "loss": 0.5191, "step": 1741 }, { "epoch": 0.3885790765112648, "grad_norm": 0.16488412022590637, "learning_rate": 1.926347948243562e-05, "loss": 0.5124, "step": 1742 }, { "epoch": 0.3888021414231541, "grad_norm": 0.17527227103710175, "learning_rate": 1.926259276225886e-05, "loss": 0.5042, "step": 1743 }, { "epoch": 0.3890252063350435, "grad_norm": 0.16100718080997467, "learning_rate": 1.926170552906145e-05, "loss": 0.5103, "step": 1744 }, { "epoch": 0.38924827124693284, "grad_norm": 0.1704183667898178, "learning_rate": 1.926081778289253e-05, "loss": 0.5252, "step": 1745 }, { "epoch": 0.38947133615882223, "grad_norm": 0.15948736667633057, "learning_rate": 1.9259929523801266e-05, "loss": 0.4972, "step": 1746 }, { "epoch": 0.38969440107071157, "grad_norm": 0.15783725678920746, "learning_rate": 1.9259040751836858e-05, "loss": 0.4862, "step": 1747 }, { "epoch": 0.38991746598260096, "grad_norm": 0.22207237780094147, "learning_rate": 1.9258151467048533e-05, "loss": 0.5058, "step": 1748 }, { "epoch": 0.3901405308944903, "grad_norm": 0.18270133435726166, "learning_rate": 1.9257261669485544e-05, "loss": 0.4872, "step": 1749 }, { "epoch": 0.39036359580637964, "grad_norm": 0.15779659152030945, "learning_rate": 1.925637135919717e-05, "loss": 0.4972, "step": 1750 }, { "epoch": 0.39058666071826903, "grad_norm": 0.16197752952575684, "learning_rate": 1.9255480536232728e-05, "loss": 0.5001, "step": 1751 }, { "epoch": 0.39080972563015837, "grad_norm": 0.1598835587501526, "learning_rate": 1.9254589200641556e-05, "loss": 0.4991, "step": 1752 }, { "epoch": 0.39103279054204776, "grad_norm": 0.1754835844039917, "learning_rate": 1.925369735247302e-05, "loss": 0.4858, "step": 1753 }, { "epoch": 0.3912558554539371, "grad_norm": 0.17721553146839142, "learning_rate": 1.9252804991776513e-05, "loss": 0.4954, "step": 1754 }, { "epoch": 0.39147892036582643, "grad_norm": 0.16600117087364197, "learning_rate": 1.9251912118601466e-05, "loss": 0.5006, "step": 1755 }, { "epoch": 0.3917019852777158, "grad_norm": 0.18240424990653992, "learning_rate": 1.925101873299733e-05, "loss": 0.5563, "step": 1756 }, { "epoch": 0.39192505018960516, "grad_norm": 0.2154020071029663, "learning_rate": 1.9250124835013583e-05, "loss": 0.5213, "step": 1757 }, { "epoch": 0.39214811510149455, "grad_norm": 0.17148853838443756, "learning_rate": 1.9249230424699735e-05, "loss": 0.5378, "step": 1758 }, { "epoch": 0.3923711800133839, "grad_norm": 0.17164389789104462, "learning_rate": 1.9248335502105328e-05, "loss": 0.5272, "step": 1759 }, { "epoch": 0.39259424492527323, "grad_norm": 0.16211992502212524, "learning_rate": 1.924744006727993e-05, "loss": 0.5543, "step": 1760 }, { "epoch": 0.3928173098371626, "grad_norm": 0.17015716433525085, "learning_rate": 1.924654412027313e-05, "loss": 0.5281, "step": 1761 }, { "epoch": 0.39304037474905196, "grad_norm": 0.15979216992855072, "learning_rate": 1.924564766113455e-05, "loss": 0.4938, "step": 1762 }, { "epoch": 0.39326343966094135, "grad_norm": 0.16782401502132416, "learning_rate": 1.924475068991385e-05, "loss": 0.4876, "step": 1763 }, { "epoch": 0.3934865045728307, "grad_norm": 0.1688111275434494, "learning_rate": 1.9243853206660703e-05, "loss": 0.503, "step": 1764 }, { "epoch": 0.3937095694847201, "grad_norm": 0.16509543359279633, "learning_rate": 1.924295521142482e-05, "loss": 0.4955, "step": 1765 }, { "epoch": 0.3939326343966094, "grad_norm": 0.17914824187755585, "learning_rate": 1.9242056704255935e-05, "loss": 0.5334, "step": 1766 }, { "epoch": 0.39415569930849875, "grad_norm": 0.18346786499023438, "learning_rate": 1.9241157685203817e-05, "loss": 0.5135, "step": 1767 }, { "epoch": 0.39437876422038814, "grad_norm": 0.18024152517318726, "learning_rate": 1.9240258154318257e-05, "loss": 0.5284, "step": 1768 }, { "epoch": 0.3946018291322775, "grad_norm": 0.17404595017433167, "learning_rate": 1.923935811164908e-05, "loss": 0.5069, "step": 1769 }, { "epoch": 0.3948248940441669, "grad_norm": 0.15942560136318207, "learning_rate": 1.9238457557246128e-05, "loss": 0.5034, "step": 1770 }, { "epoch": 0.3950479589560562, "grad_norm": 0.15909984707832336, "learning_rate": 1.9237556491159285e-05, "loss": 0.4762, "step": 1771 }, { "epoch": 0.39527102386794555, "grad_norm": 0.16344551742076874, "learning_rate": 1.9236654913438456e-05, "loss": 0.5243, "step": 1772 }, { "epoch": 0.39549408877983494, "grad_norm": 0.1672784835100174, "learning_rate": 1.923575282413358e-05, "loss": 0.5057, "step": 1773 }, { "epoch": 0.3957171536917243, "grad_norm": 0.1585850566625595, "learning_rate": 1.9234850223294613e-05, "loss": 0.5074, "step": 1774 }, { "epoch": 0.39594021860361367, "grad_norm": 0.17053529620170593, "learning_rate": 1.9233947110971556e-05, "loss": 0.5556, "step": 1775 }, { "epoch": 0.396163283515503, "grad_norm": 0.16313889622688293, "learning_rate": 1.9233043487214423e-05, "loss": 0.491, "step": 1776 }, { "epoch": 0.3963863484273924, "grad_norm": 0.15941911935806274, "learning_rate": 1.9232139352073265e-05, "loss": 0.4862, "step": 1777 }, { "epoch": 0.39660941333928174, "grad_norm": 0.1699916571378708, "learning_rate": 1.9231234705598153e-05, "loss": 0.542, "step": 1778 }, { "epoch": 0.3968324782511711, "grad_norm": 0.17430098354816437, "learning_rate": 1.9230329547839196e-05, "loss": 0.5006, "step": 1779 }, { "epoch": 0.39705554316306046, "grad_norm": 0.16042235493659973, "learning_rate": 1.9229423878846535e-05, "loss": 0.5087, "step": 1780 }, { "epoch": 0.3972786080749498, "grad_norm": 0.1637001633644104, "learning_rate": 1.9228517698670316e-05, "loss": 0.4966, "step": 1781 }, { "epoch": 0.3975016729868392, "grad_norm": 0.1627058982849121, "learning_rate": 1.922761100736074e-05, "loss": 0.4882, "step": 1782 }, { "epoch": 0.39772473789872853, "grad_norm": 0.1667163074016571, "learning_rate": 1.9226703804968022e-05, "loss": 0.5295, "step": 1783 }, { "epoch": 0.39794780281061787, "grad_norm": 0.16532334685325623, "learning_rate": 1.9225796091542412e-05, "loss": 0.5062, "step": 1784 }, { "epoch": 0.39817086772250726, "grad_norm": 0.16788703203201294, "learning_rate": 1.9224887867134178e-05, "loss": 0.5276, "step": 1785 }, { "epoch": 0.3983939326343966, "grad_norm": 0.17480318248271942, "learning_rate": 1.9223979131793627e-05, "loss": 0.52, "step": 1786 }, { "epoch": 0.398616997546286, "grad_norm": 0.1601472944021225, "learning_rate": 1.9223069885571094e-05, "loss": 0.5263, "step": 1787 }, { "epoch": 0.3988400624581753, "grad_norm": 0.16812476515769958, "learning_rate": 1.9222160128516932e-05, "loss": 0.4831, "step": 1788 }, { "epoch": 0.39906312737006466, "grad_norm": 0.1809302121400833, "learning_rate": 1.9221249860681537e-05, "loss": 0.4944, "step": 1789 }, { "epoch": 0.39928619228195406, "grad_norm": 0.16555048525333405, "learning_rate": 1.9220339082115317e-05, "loss": 0.4851, "step": 1790 }, { "epoch": 0.3995092571938434, "grad_norm": 0.16044898331165314, "learning_rate": 1.9219427792868722e-05, "loss": 0.5377, "step": 1791 }, { "epoch": 0.3997323221057328, "grad_norm": 0.16331616044044495, "learning_rate": 1.921851599299222e-05, "loss": 0.4978, "step": 1792 }, { "epoch": 0.3999553870176221, "grad_norm": 0.16315311193466187, "learning_rate": 1.9217603682536315e-05, "loss": 0.51, "step": 1793 }, { "epoch": 0.4001784519295115, "grad_norm": 0.16615985333919525, "learning_rate": 1.9216690861551544e-05, "loss": 0.5343, "step": 1794 }, { "epoch": 0.40040151684140085, "grad_norm": 0.1581142395734787, "learning_rate": 1.9215777530088452e-05, "loss": 0.5276, "step": 1795 }, { "epoch": 0.4006245817532902, "grad_norm": 0.25778672099113464, "learning_rate": 1.9214863688197634e-05, "loss": 0.5265, "step": 1796 }, { "epoch": 0.4008476466651796, "grad_norm": 0.21218818426132202, "learning_rate": 1.92139493359297e-05, "loss": 0.4929, "step": 1797 }, { "epoch": 0.4010707115770689, "grad_norm": 0.1664579063653946, "learning_rate": 1.9213034473335293e-05, "loss": 0.5298, "step": 1798 }, { "epoch": 0.4012937764889583, "grad_norm": 0.158608078956604, "learning_rate": 1.9212119100465084e-05, "loss": 0.513, "step": 1799 }, { "epoch": 0.40151684140084765, "grad_norm": 0.15767832100391388, "learning_rate": 1.9211203217369774e-05, "loss": 0.5037, "step": 1800 }, { "epoch": 0.401739906312737, "grad_norm": 0.1640487164258957, "learning_rate": 1.921028682410009e-05, "loss": 0.5102, "step": 1801 }, { "epoch": 0.4019629712246264, "grad_norm": 0.1980050951242447, "learning_rate": 1.9209369920706783e-05, "loss": 0.4926, "step": 1802 }, { "epoch": 0.4021860361365157, "grad_norm": 0.16443881392478943, "learning_rate": 1.9208452507240642e-05, "loss": 0.5165, "step": 1803 }, { "epoch": 0.4024091010484051, "grad_norm": 0.17739985883235931, "learning_rate": 1.920753458375248e-05, "loss": 0.5141, "step": 1804 }, { "epoch": 0.40263216596029444, "grad_norm": 0.1755398064851761, "learning_rate": 1.9206616150293132e-05, "loss": 0.5279, "step": 1805 }, { "epoch": 0.4028552308721838, "grad_norm": 0.16583852469921112, "learning_rate": 1.9205697206913473e-05, "loss": 0.4838, "step": 1806 }, { "epoch": 0.40307829578407317, "grad_norm": 0.1888515055179596, "learning_rate": 1.9204777753664397e-05, "loss": 0.5381, "step": 1807 }, { "epoch": 0.4033013606959625, "grad_norm": 0.17295043170452118, "learning_rate": 1.9203857790596826e-05, "loss": 0.5062, "step": 1808 }, { "epoch": 0.4035244256078519, "grad_norm": 0.16970674693584442, "learning_rate": 1.9202937317761713e-05, "loss": 0.5138, "step": 1809 }, { "epoch": 0.40374749051974124, "grad_norm": 0.16952745616436005, "learning_rate": 1.9202016335210047e-05, "loss": 0.4829, "step": 1810 }, { "epoch": 0.40397055543163063, "grad_norm": 0.16743586957454681, "learning_rate": 1.9201094842992832e-05, "loss": 0.5085, "step": 1811 }, { "epoch": 0.40419362034351997, "grad_norm": 0.16140130162239075, "learning_rate": 1.9200172841161108e-05, "loss": 0.4983, "step": 1812 }, { "epoch": 0.4044166852554093, "grad_norm": 0.1676797717809677, "learning_rate": 1.9199250329765943e-05, "loss": 0.5055, "step": 1813 }, { "epoch": 0.4046397501672987, "grad_norm": 0.1718011051416397, "learning_rate": 1.9198327308858427e-05, "loss": 0.5021, "step": 1814 }, { "epoch": 0.40486281507918803, "grad_norm": 0.1699821949005127, "learning_rate": 1.9197403778489684e-05, "loss": 0.5312, "step": 1815 }, { "epoch": 0.4050858799910774, "grad_norm": 0.16701120138168335, "learning_rate": 1.9196479738710865e-05, "loss": 0.5028, "step": 1816 }, { "epoch": 0.40530894490296676, "grad_norm": 0.17370650172233582, "learning_rate": 1.9195555189573153e-05, "loss": 0.5075, "step": 1817 }, { "epoch": 0.4055320098148561, "grad_norm": 0.16744117438793182, "learning_rate": 1.919463013112775e-05, "loss": 0.4972, "step": 1818 }, { "epoch": 0.4057550747267455, "grad_norm": 0.1703552007675171, "learning_rate": 1.9193704563425896e-05, "loss": 0.5215, "step": 1819 }, { "epoch": 0.4059781396386348, "grad_norm": 0.1808227002620697, "learning_rate": 1.919277848651885e-05, "loss": 0.5048, "step": 1820 }, { "epoch": 0.4062012045505242, "grad_norm": 0.17191959917545319, "learning_rate": 1.9191851900457905e-05, "loss": 0.5293, "step": 1821 }, { "epoch": 0.40642426946241356, "grad_norm": 0.16464729607105255, "learning_rate": 1.9190924805294388e-05, "loss": 0.531, "step": 1822 }, { "epoch": 0.40664733437430295, "grad_norm": 0.17217296361923218, "learning_rate": 1.9189997201079638e-05, "loss": 0.5221, "step": 1823 }, { "epoch": 0.4068703992861923, "grad_norm": 0.15952859818935394, "learning_rate": 1.918906908786504e-05, "loss": 0.4898, "step": 1824 }, { "epoch": 0.4070934641980816, "grad_norm": 0.15873339772224426, "learning_rate": 1.9188140465701987e-05, "loss": 0.4771, "step": 1825 }, { "epoch": 0.407316529109971, "grad_norm": 0.16631244122982025, "learning_rate": 1.9187211334641923e-05, "loss": 0.5216, "step": 1826 }, { "epoch": 0.40753959402186035, "grad_norm": 0.19430842995643616, "learning_rate": 1.918628169473631e-05, "loss": 0.532, "step": 1827 }, { "epoch": 0.40776265893374974, "grad_norm": 0.1647733449935913, "learning_rate": 1.9185351546036625e-05, "loss": 0.4991, "step": 1828 }, { "epoch": 0.4079857238456391, "grad_norm": 0.17242863774299622, "learning_rate": 1.9184420888594398e-05, "loss": 0.5095, "step": 1829 }, { "epoch": 0.4082087887575284, "grad_norm": 0.15754511952400208, "learning_rate": 1.9183489722461167e-05, "loss": 0.5185, "step": 1830 }, { "epoch": 0.4084318536694178, "grad_norm": 0.16371309757232666, "learning_rate": 1.918255804768851e-05, "loss": 0.5146, "step": 1831 }, { "epoch": 0.40865491858130715, "grad_norm": 0.17599989473819733, "learning_rate": 1.918162586432803e-05, "loss": 0.5509, "step": 1832 }, { "epoch": 0.40887798349319654, "grad_norm": 0.17950280010700226, "learning_rate": 1.9180693172431353e-05, "loss": 0.5236, "step": 1833 }, { "epoch": 0.4091010484050859, "grad_norm": 0.1845085769891739, "learning_rate": 1.917975997205014e-05, "loss": 0.5155, "step": 1834 }, { "epoch": 0.4093241133169752, "grad_norm": 0.16801810264587402, "learning_rate": 1.9178826263236076e-05, "loss": 0.5265, "step": 1835 }, { "epoch": 0.4095471782288646, "grad_norm": 0.16314861178398132, "learning_rate": 1.9177892046040875e-05, "loss": 0.4922, "step": 1836 }, { "epoch": 0.40977024314075394, "grad_norm": 0.16442064940929413, "learning_rate": 1.9176957320516287e-05, "loss": 0.5004, "step": 1837 }, { "epoch": 0.40999330805264333, "grad_norm": 0.16156339645385742, "learning_rate": 1.917602208671407e-05, "loss": 0.5172, "step": 1838 }, { "epoch": 0.41021637296453267, "grad_norm": 0.17251324653625488, "learning_rate": 1.9175086344686035e-05, "loss": 0.5432, "step": 1839 }, { "epoch": 0.41043943787642206, "grad_norm": 0.1891004890203476, "learning_rate": 1.9174150094484e-05, "loss": 0.5276, "step": 1840 }, { "epoch": 0.4106625027883114, "grad_norm": 0.1750495731830597, "learning_rate": 1.917321333615983e-05, "loss": 0.548, "step": 1841 }, { "epoch": 0.41088556770020074, "grad_norm": 0.1698615849018097, "learning_rate": 1.91722760697654e-05, "loss": 0.5276, "step": 1842 }, { "epoch": 0.41110863261209013, "grad_norm": 0.16396887600421906, "learning_rate": 1.917133829535263e-05, "loss": 0.5211, "step": 1843 }, { "epoch": 0.41133169752397947, "grad_norm": 0.18432539701461792, "learning_rate": 1.917040001297345e-05, "loss": 0.5276, "step": 1844 }, { "epoch": 0.41155476243586886, "grad_norm": 0.17017246782779694, "learning_rate": 1.9169461222679836e-05, "loss": 0.5312, "step": 1845 }, { "epoch": 0.4117778273477582, "grad_norm": 0.16993820667266846, "learning_rate": 1.9168521924523782e-05, "loss": 0.4981, "step": 1846 }, { "epoch": 0.41200089225964753, "grad_norm": 0.1594426929950714, "learning_rate": 1.916758211855731e-05, "loss": 0.5162, "step": 1847 }, { "epoch": 0.4122239571715369, "grad_norm": 0.16228000819683075, "learning_rate": 1.9166641804832474e-05, "loss": 0.5086, "step": 1848 }, { "epoch": 0.41244702208342626, "grad_norm": 0.2065616101026535, "learning_rate": 1.9165700983401354e-05, "loss": 0.4916, "step": 1849 }, { "epoch": 0.41267008699531565, "grad_norm": 0.16846513748168945, "learning_rate": 1.916475965431606e-05, "loss": 0.5246, "step": 1850 }, { "epoch": 0.412893151907205, "grad_norm": 0.19557945430278778, "learning_rate": 1.9163817817628728e-05, "loss": 0.5208, "step": 1851 }, { "epoch": 0.4131162168190944, "grad_norm": 0.17081156373023987, "learning_rate": 1.916287547339152e-05, "loss": 0.5534, "step": 1852 }, { "epoch": 0.4133392817309837, "grad_norm": 0.16580162942409515, "learning_rate": 1.9161932621656634e-05, "loss": 0.5152, "step": 1853 }, { "epoch": 0.41356234664287306, "grad_norm": 0.18416714668273926, "learning_rate": 1.9160989262476288e-05, "loss": 0.5064, "step": 1854 }, { "epoch": 0.41378541155476245, "grad_norm": 0.16161711513996124, "learning_rate": 1.916004539590273e-05, "loss": 0.5008, "step": 1855 }, { "epoch": 0.4140084764666518, "grad_norm": 0.17786547541618347, "learning_rate": 1.9159101021988244e-05, "loss": 0.5168, "step": 1856 }, { "epoch": 0.4142315413785412, "grad_norm": 0.15692083537578583, "learning_rate": 1.9158156140785125e-05, "loss": 0.5026, "step": 1857 }, { "epoch": 0.4144546062904305, "grad_norm": 0.22050227224826813, "learning_rate": 1.9157210752345713e-05, "loss": 0.4936, "step": 1858 }, { "epoch": 0.41467767120231985, "grad_norm": 0.20180164277553558, "learning_rate": 1.915626485672237e-05, "loss": 0.5347, "step": 1859 }, { "epoch": 0.41490073611420925, "grad_norm": 0.156357541680336, "learning_rate": 1.9155318453967483e-05, "loss": 0.4816, "step": 1860 }, { "epoch": 0.4151238010260986, "grad_norm": 0.16347983479499817, "learning_rate": 1.9154371544133472e-05, "loss": 0.5032, "step": 1861 }, { "epoch": 0.415346865937988, "grad_norm": 0.16571176052093506, "learning_rate": 1.9153424127272783e-05, "loss": 0.4875, "step": 1862 }, { "epoch": 0.4155699308498773, "grad_norm": 0.16103731095790863, "learning_rate": 1.9152476203437884e-05, "loss": 0.5266, "step": 1863 }, { "epoch": 0.41579299576176665, "grad_norm": 0.17397384345531464, "learning_rate": 1.915152777268128e-05, "loss": 0.5114, "step": 1864 }, { "epoch": 0.41601606067365604, "grad_norm": 0.16579222679138184, "learning_rate": 1.9150578835055507e-05, "loss": 0.5219, "step": 1865 }, { "epoch": 0.4162391255855454, "grad_norm": 0.1724756360054016, "learning_rate": 1.914962939061312e-05, "loss": 0.4942, "step": 1866 }, { "epoch": 0.41646219049743477, "grad_norm": 0.16709351539611816, "learning_rate": 1.9148679439406704e-05, "loss": 0.514, "step": 1867 }, { "epoch": 0.4166852554093241, "grad_norm": 0.15628811717033386, "learning_rate": 1.914772898148887e-05, "loss": 0.5153, "step": 1868 }, { "epoch": 0.4169083203212135, "grad_norm": 0.16881687939167023, "learning_rate": 1.914677801691226e-05, "loss": 0.4924, "step": 1869 }, { "epoch": 0.41713138523310284, "grad_norm": 0.16054266691207886, "learning_rate": 1.9145826545729555e-05, "loss": 0.5238, "step": 1870 }, { "epoch": 0.4173544501449922, "grad_norm": 0.1636224240064621, "learning_rate": 1.9144874567993446e-05, "loss": 0.4813, "step": 1871 }, { "epoch": 0.41757751505688157, "grad_norm": 0.16189956665039062, "learning_rate": 1.9143922083756656e-05, "loss": 0.5211, "step": 1872 }, { "epoch": 0.4178005799687709, "grad_norm": 0.16065020859241486, "learning_rate": 1.9142969093071944e-05, "loss": 0.5116, "step": 1873 }, { "epoch": 0.4180236448806603, "grad_norm": 0.17001986503601074, "learning_rate": 1.9142015595992096e-05, "loss": 0.5202, "step": 1874 }, { "epoch": 0.41824670979254963, "grad_norm": 0.16956806182861328, "learning_rate": 1.9141061592569913e-05, "loss": 0.4941, "step": 1875 }, { "epoch": 0.41846977470443897, "grad_norm": 0.16168902814388275, "learning_rate": 1.9140107082858243e-05, "loss": 0.5134, "step": 1876 }, { "epoch": 0.41869283961632836, "grad_norm": 0.16588400304317474, "learning_rate": 1.9139152066909948e-05, "loss": 0.4899, "step": 1877 }, { "epoch": 0.4189159045282177, "grad_norm": 0.16289415955543518, "learning_rate": 1.9138196544777925e-05, "loss": 0.482, "step": 1878 }, { "epoch": 0.4191389694401071, "grad_norm": 0.17892742156982422, "learning_rate": 1.9137240516515094e-05, "loss": 0.5178, "step": 1879 }, { "epoch": 0.4193620343519964, "grad_norm": 0.17155860364437103, "learning_rate": 1.913628398217441e-05, "loss": 0.5099, "step": 1880 }, { "epoch": 0.41958509926388576, "grad_norm": 0.1801706701517105, "learning_rate": 1.913532694180885e-05, "loss": 0.4966, "step": 1881 }, { "epoch": 0.41980816417577516, "grad_norm": 0.16115552186965942, "learning_rate": 1.9134369395471416e-05, "loss": 0.4839, "step": 1882 }, { "epoch": 0.4200312290876645, "grad_norm": 0.16582123935222626, "learning_rate": 1.913341134321515e-05, "loss": 0.4719, "step": 1883 }, { "epoch": 0.4202542939995539, "grad_norm": 0.16009938716888428, "learning_rate": 1.9132452785093113e-05, "loss": 0.5096, "step": 1884 }, { "epoch": 0.4204773589114432, "grad_norm": 0.17301099002361298, "learning_rate": 1.9131493721158395e-05, "loss": 0.5295, "step": 1885 }, { "epoch": 0.4207004238233326, "grad_norm": 0.16522350907325745, "learning_rate": 1.9130534151464116e-05, "loss": 0.5386, "step": 1886 }, { "epoch": 0.42092348873522195, "grad_norm": 0.1624733954668045, "learning_rate": 1.9129574076063423e-05, "loss": 0.5226, "step": 1887 }, { "epoch": 0.4211465536471113, "grad_norm": 0.16734281182289124, "learning_rate": 1.9128613495009487e-05, "loss": 0.5367, "step": 1888 }, { "epoch": 0.4213696185590007, "grad_norm": 0.15870822966098785, "learning_rate": 1.912765240835552e-05, "loss": 0.4597, "step": 1889 }, { "epoch": 0.42159268347089, "grad_norm": 0.16024811565876007, "learning_rate": 1.912669081615474e-05, "loss": 0.4794, "step": 1890 }, { "epoch": 0.4218157483827794, "grad_norm": 0.16861672699451447, "learning_rate": 1.912572871846042e-05, "loss": 0.4918, "step": 1891 }, { "epoch": 0.42203881329466875, "grad_norm": 0.16772149503231049, "learning_rate": 1.9124766115325837e-05, "loss": 0.5262, "step": 1892 }, { "epoch": 0.4222618782065581, "grad_norm": 0.16915294528007507, "learning_rate": 1.912380300680431e-05, "loss": 0.4926, "step": 1893 }, { "epoch": 0.4224849431184475, "grad_norm": 0.16086746752262115, "learning_rate": 1.912283939294918e-05, "loss": 0.5091, "step": 1894 }, { "epoch": 0.4227080080303368, "grad_norm": 0.18025851249694824, "learning_rate": 1.912187527381382e-05, "loss": 0.5147, "step": 1895 }, { "epoch": 0.4229310729422262, "grad_norm": 0.15781170129776, "learning_rate": 1.9120910649451632e-05, "loss": 0.501, "step": 1896 }, { "epoch": 0.42315413785411554, "grad_norm": 0.15790554881095886, "learning_rate": 1.9119945519916036e-05, "loss": 0.4963, "step": 1897 }, { "epoch": 0.42337720276600493, "grad_norm": 0.16987471282482147, "learning_rate": 1.9118979885260493e-05, "loss": 0.5148, "step": 1898 }, { "epoch": 0.42360026767789427, "grad_norm": 0.16438226401805878, "learning_rate": 1.9118013745538483e-05, "loss": 0.5348, "step": 1899 }, { "epoch": 0.4238233325897836, "grad_norm": 0.18299153447151184, "learning_rate": 1.9117047100803513e-05, "loss": 0.5055, "step": 1900 }, { "epoch": 0.424046397501673, "grad_norm": 0.18117006123065948, "learning_rate": 1.911607995110913e-05, "loss": 0.517, "step": 1901 }, { "epoch": 0.42426946241356234, "grad_norm": 0.19496211409568787, "learning_rate": 1.9115112296508896e-05, "loss": 0.5216, "step": 1902 }, { "epoch": 0.42449252732545173, "grad_norm": 0.17908519506454468, "learning_rate": 1.9114144137056406e-05, "loss": 0.5386, "step": 1903 }, { "epoch": 0.42471559223734107, "grad_norm": 0.18684661388397217, "learning_rate": 1.9113175472805284e-05, "loss": 0.5341, "step": 1904 }, { "epoch": 0.4249386571492304, "grad_norm": 0.1662161499261856, "learning_rate": 1.9112206303809183e-05, "loss": 0.4824, "step": 1905 }, { "epoch": 0.4251617220611198, "grad_norm": 0.1687028408050537, "learning_rate": 1.9111236630121775e-05, "loss": 0.5026, "step": 1906 }, { "epoch": 0.42538478697300913, "grad_norm": 0.20177949965000153, "learning_rate": 1.9110266451796772e-05, "loss": 0.5112, "step": 1907 }, { "epoch": 0.4256078518848985, "grad_norm": 0.16266430914402008, "learning_rate": 1.9109295768887907e-05, "loss": 0.5011, "step": 1908 }, { "epoch": 0.42583091679678786, "grad_norm": 0.1639285683631897, "learning_rate": 1.910832458144894e-05, "loss": 0.5147, "step": 1909 }, { "epoch": 0.4260539817086772, "grad_norm": 0.16959701478481293, "learning_rate": 1.9107352889533667e-05, "loss": 0.513, "step": 1910 }, { "epoch": 0.4262770466205666, "grad_norm": 0.16716282069683075, "learning_rate": 1.9106380693195903e-05, "loss": 0.5023, "step": 1911 }, { "epoch": 0.42650011153245593, "grad_norm": 0.16967961192131042, "learning_rate": 1.9105407992489495e-05, "loss": 0.5185, "step": 1912 }, { "epoch": 0.4267231764443453, "grad_norm": 0.16723939776420593, "learning_rate": 1.9104434787468316e-05, "loss": 0.5013, "step": 1913 }, { "epoch": 0.42694624135623466, "grad_norm": 0.16064821183681488, "learning_rate": 1.9103461078186268e-05, "loss": 0.5109, "step": 1914 }, { "epoch": 0.42716930626812405, "grad_norm": 0.17482173442840576, "learning_rate": 1.9102486864697285e-05, "loss": 0.5148, "step": 1915 }, { "epoch": 0.4273923711800134, "grad_norm": 0.1638958603143692, "learning_rate": 1.910151214705532e-05, "loss": 0.527, "step": 1916 }, { "epoch": 0.4276154360919027, "grad_norm": 0.18999075889587402, "learning_rate": 1.9100536925314363e-05, "loss": 0.5, "step": 1917 }, { "epoch": 0.4278385010037921, "grad_norm": 0.18148307502269745, "learning_rate": 1.9099561199528425e-05, "loss": 0.4862, "step": 1918 }, { "epoch": 0.42806156591568145, "grad_norm": 0.17788200080394745, "learning_rate": 1.909858496975155e-05, "loss": 0.4991, "step": 1919 }, { "epoch": 0.42828463082757084, "grad_norm": 0.1629716157913208, "learning_rate": 1.9097608236037813e-05, "loss": 0.5292, "step": 1920 }, { "epoch": 0.4285076957394602, "grad_norm": 0.1686851680278778, "learning_rate": 1.9096630998441298e-05, "loss": 0.5156, "step": 1921 }, { "epoch": 0.4287307606513495, "grad_norm": 0.1803978681564331, "learning_rate": 1.909565325701614e-05, "loss": 0.5193, "step": 1922 }, { "epoch": 0.4289538255632389, "grad_norm": 0.17601296305656433, "learning_rate": 1.9094675011816496e-05, "loss": 0.5193, "step": 1923 }, { "epoch": 0.42917689047512825, "grad_norm": 0.17354577779769897, "learning_rate": 1.9093696262896535e-05, "loss": 0.5055, "step": 1924 }, { "epoch": 0.42939995538701764, "grad_norm": 0.1659156084060669, "learning_rate": 1.9092717010310476e-05, "loss": 0.5232, "step": 1925 }, { "epoch": 0.429623020298907, "grad_norm": 0.1605677753686905, "learning_rate": 1.909173725411255e-05, "loss": 0.4916, "step": 1926 }, { "epoch": 0.42984608521079637, "grad_norm": 0.17374806106090546, "learning_rate": 1.9090756994357035e-05, "loss": 0.5259, "step": 1927 }, { "epoch": 0.4300691501226857, "grad_norm": 0.16589613258838654, "learning_rate": 1.9089776231098204e-05, "loss": 0.5192, "step": 1928 }, { "epoch": 0.43029221503457504, "grad_norm": 0.16348575055599213, "learning_rate": 1.9088794964390395e-05, "loss": 0.4963, "step": 1929 }, { "epoch": 0.43051527994646444, "grad_norm": 0.16494914889335632, "learning_rate": 1.9087813194287948e-05, "loss": 0.5359, "step": 1930 }, { "epoch": 0.43073834485835377, "grad_norm": 0.16134381294250488, "learning_rate": 1.9086830920845242e-05, "loss": 0.5087, "step": 1931 }, { "epoch": 0.43096140977024316, "grad_norm": 0.16159029304981232, "learning_rate": 1.908584814411668e-05, "loss": 0.5242, "step": 1932 }, { "epoch": 0.4311844746821325, "grad_norm": 0.1648871749639511, "learning_rate": 1.9084864864156696e-05, "loss": 0.5069, "step": 1933 }, { "epoch": 0.43140753959402184, "grad_norm": 0.17623218894004822, "learning_rate": 1.9083881081019752e-05, "loss": 0.5049, "step": 1934 }, { "epoch": 0.43163060450591123, "grad_norm": 0.166509211063385, "learning_rate": 1.9082896794760327e-05, "loss": 0.4966, "step": 1935 }, { "epoch": 0.43185366941780057, "grad_norm": 0.15902282297611237, "learning_rate": 1.908191200543295e-05, "loss": 0.5343, "step": 1936 }, { "epoch": 0.43207673432968996, "grad_norm": 0.16570866107940674, "learning_rate": 1.908092671309216e-05, "loss": 0.4797, "step": 1937 }, { "epoch": 0.4322997992415793, "grad_norm": 0.15843339264392853, "learning_rate": 1.9079940917792524e-05, "loss": 0.5359, "step": 1938 }, { "epoch": 0.43252286415346863, "grad_norm": 0.16115322709083557, "learning_rate": 1.9078954619588645e-05, "loss": 0.5032, "step": 1939 }, { "epoch": 0.432745929065358, "grad_norm": 0.16374099254608154, "learning_rate": 1.9077967818535153e-05, "loss": 0.4998, "step": 1940 }, { "epoch": 0.43296899397724736, "grad_norm": 0.17378082871437073, "learning_rate": 1.9076980514686695e-05, "loss": 0.5403, "step": 1941 }, { "epoch": 0.43319205888913676, "grad_norm": 0.16688857972621918, "learning_rate": 1.9075992708097965e-05, "loss": 0.5103, "step": 1942 }, { "epoch": 0.4334151238010261, "grad_norm": 0.16233094036579132, "learning_rate": 1.9075004398823665e-05, "loss": 0.5045, "step": 1943 }, { "epoch": 0.4336381887129155, "grad_norm": 0.16693229973316193, "learning_rate": 1.907401558691854e-05, "loss": 0.5167, "step": 1944 }, { "epoch": 0.4338612536248048, "grad_norm": 0.16341574490070343, "learning_rate": 1.9073026272437353e-05, "loss": 0.5001, "step": 1945 }, { "epoch": 0.43408431853669416, "grad_norm": 0.16043853759765625, "learning_rate": 1.90720364554349e-05, "loss": 0.5109, "step": 1946 }, { "epoch": 0.43430738344858355, "grad_norm": 0.15996608138084412, "learning_rate": 1.9071046135966e-05, "loss": 0.4826, "step": 1947 }, { "epoch": 0.4345304483604729, "grad_norm": 0.15902268886566162, "learning_rate": 1.9070055314085508e-05, "loss": 0.4989, "step": 1948 }, { "epoch": 0.4347535132723623, "grad_norm": 0.16526652872562408, "learning_rate": 1.9069063989848298e-05, "loss": 0.5209, "step": 1949 }, { "epoch": 0.4349765781842516, "grad_norm": 0.16759642958641052, "learning_rate": 1.9068072163309282e-05, "loss": 0.5246, "step": 1950 }, { "epoch": 0.43519964309614095, "grad_norm": 0.17135794460773468, "learning_rate": 1.9067079834523387e-05, "loss": 0.5198, "step": 1951 }, { "epoch": 0.43542270800803035, "grad_norm": 0.16336077451705933, "learning_rate": 1.9066087003545576e-05, "loss": 0.5098, "step": 1952 }, { "epoch": 0.4356457729199197, "grad_norm": 0.1512858122587204, "learning_rate": 1.9065093670430836e-05, "loss": 0.4843, "step": 1953 }, { "epoch": 0.4358688378318091, "grad_norm": 0.17053045332431793, "learning_rate": 1.9064099835234188e-05, "loss": 0.527, "step": 1954 }, { "epoch": 0.4360919027436984, "grad_norm": 0.16126702725887299, "learning_rate": 1.9063105498010678e-05, "loss": 0.5222, "step": 1955 }, { "epoch": 0.43631496765558775, "grad_norm": 0.1633397489786148, "learning_rate": 1.9062110658815375e-05, "loss": 0.5124, "step": 1956 }, { "epoch": 0.43653803256747714, "grad_norm": 0.18700194358825684, "learning_rate": 1.9061115317703384e-05, "loss": 0.5123, "step": 1957 }, { "epoch": 0.4367610974793665, "grad_norm": 0.1907283067703247, "learning_rate": 1.9060119474729826e-05, "loss": 0.5223, "step": 1958 }, { "epoch": 0.43698416239125587, "grad_norm": 0.1634737253189087, "learning_rate": 1.9059123129949865e-05, "loss": 0.4879, "step": 1959 }, { "epoch": 0.4372072273031452, "grad_norm": 0.18314914405345917, "learning_rate": 1.9058126283418675e-05, "loss": 0.5018, "step": 1960 }, { "epoch": 0.4374302922150346, "grad_norm": 0.1680321991443634, "learning_rate": 1.9057128935191477e-05, "loss": 0.4975, "step": 1961 }, { "epoch": 0.43765335712692394, "grad_norm": 0.1635400801897049, "learning_rate": 1.9056131085323506e-05, "loss": 0.4895, "step": 1962 }, { "epoch": 0.4378764220388133, "grad_norm": 0.15011341869831085, "learning_rate": 1.905513273387003e-05, "loss": 0.4765, "step": 1963 }, { "epoch": 0.43809948695070267, "grad_norm": 0.16991405189037323, "learning_rate": 1.9054133880886348e-05, "loss": 0.518, "step": 1964 }, { "epoch": 0.438322551862592, "grad_norm": 0.17523062229156494, "learning_rate": 1.9053134526427777e-05, "loss": 0.4764, "step": 1965 }, { "epoch": 0.4385456167744814, "grad_norm": 0.1695917248725891, "learning_rate": 1.905213467054967e-05, "loss": 0.5111, "step": 1966 }, { "epoch": 0.43876868168637073, "grad_norm": 0.16954153776168823, "learning_rate": 1.90511343133074e-05, "loss": 0.505, "step": 1967 }, { "epoch": 0.43899174659826007, "grad_norm": 0.3032020628452301, "learning_rate": 1.905013345475638e-05, "loss": 0.5037, "step": 1968 }, { "epoch": 0.43921481151014946, "grad_norm": 0.16867168247699738, "learning_rate": 1.9049132094952046e-05, "loss": 0.5596, "step": 1969 }, { "epoch": 0.4394378764220388, "grad_norm": 0.16780449450016022, "learning_rate": 1.904813023394985e-05, "loss": 0.4937, "step": 1970 }, { "epoch": 0.4396609413339282, "grad_norm": 0.19413304328918457, "learning_rate": 1.904712787180529e-05, "loss": 0.4823, "step": 1971 }, { "epoch": 0.4398840062458175, "grad_norm": 0.16119445860385895, "learning_rate": 1.9046125008573876e-05, "loss": 0.5451, "step": 1972 }, { "epoch": 0.4401070711577069, "grad_norm": 0.1607835441827774, "learning_rate": 1.904512164431116e-05, "loss": 0.5066, "step": 1973 }, { "epoch": 0.44033013606959626, "grad_norm": 0.1697167158126831, "learning_rate": 1.9044117779072708e-05, "loss": 0.5166, "step": 1974 }, { "epoch": 0.4405532009814856, "grad_norm": 0.16189232468605042, "learning_rate": 1.9043113412914128e-05, "loss": 0.4968, "step": 1975 }, { "epoch": 0.440776265893375, "grad_norm": 0.1717143952846527, "learning_rate": 1.904210854589104e-05, "loss": 0.481, "step": 1976 }, { "epoch": 0.4409993308052643, "grad_norm": 0.17401723563671112, "learning_rate": 1.9041103178059107e-05, "loss": 0.526, "step": 1977 }, { "epoch": 0.4412223957171537, "grad_norm": 0.16598494350910187, "learning_rate": 1.9040097309474007e-05, "loss": 0.5195, "step": 1978 }, { "epoch": 0.44144546062904305, "grad_norm": 0.21129997074604034, "learning_rate": 1.9039090940191455e-05, "loss": 0.5096, "step": 1979 }, { "epoch": 0.4416685255409324, "grad_norm": 0.17546051740646362, "learning_rate": 1.9038084070267186e-05, "loss": 0.5313, "step": 1980 }, { "epoch": 0.4418915904528218, "grad_norm": 0.15967412292957306, "learning_rate": 1.9037076699756973e-05, "loss": 0.5183, "step": 1981 }, { "epoch": 0.4421146553647111, "grad_norm": 0.17214882373809814, "learning_rate": 1.9036068828716603e-05, "loss": 0.5091, "step": 1982 }, { "epoch": 0.4423377202766005, "grad_norm": 0.184258371591568, "learning_rate": 1.9035060457201904e-05, "loss": 0.4785, "step": 1983 }, { "epoch": 0.44256078518848985, "grad_norm": 0.1570109874010086, "learning_rate": 1.9034051585268725e-05, "loss": 0.515, "step": 1984 }, { "epoch": 0.4427838501003792, "grad_norm": 0.17051473259925842, "learning_rate": 1.903304221297294e-05, "loss": 0.5031, "step": 1985 }, { "epoch": 0.4430069150122686, "grad_norm": 0.19998317956924438, "learning_rate": 1.903203234037046e-05, "loss": 0.499, "step": 1986 }, { "epoch": 0.4432299799241579, "grad_norm": 0.16017475724220276, "learning_rate": 1.9031021967517213e-05, "loss": 0.5237, "step": 1987 }, { "epoch": 0.4434530448360473, "grad_norm": 0.18142235279083252, "learning_rate": 1.9030011094469164e-05, "loss": 0.5373, "step": 1988 }, { "epoch": 0.44367610974793664, "grad_norm": 0.16957718133926392, "learning_rate": 1.90289997212823e-05, "loss": 0.5038, "step": 1989 }, { "epoch": 0.44389917465982603, "grad_norm": 0.16631929576396942, "learning_rate": 1.9027987848012635e-05, "loss": 0.4873, "step": 1990 }, { "epoch": 0.44412223957171537, "grad_norm": 0.16362012922763824, "learning_rate": 1.9026975474716215e-05, "loss": 0.5204, "step": 1991 }, { "epoch": 0.4443453044836047, "grad_norm": 0.39765313267707825, "learning_rate": 1.902596260144911e-05, "loss": 0.4736, "step": 1992 }, { "epoch": 0.4445683693954941, "grad_norm": 0.17083793878555298, "learning_rate": 1.9024949228267423e-05, "loss": 0.5152, "step": 1993 }, { "epoch": 0.44479143430738344, "grad_norm": 0.1833147555589676, "learning_rate": 1.902393535522728e-05, "loss": 0.536, "step": 1994 }, { "epoch": 0.44501449921927283, "grad_norm": 0.17427003383636475, "learning_rate": 1.902292098238483e-05, "loss": 0.514, "step": 1995 }, { "epoch": 0.44523756413116217, "grad_norm": 0.15868180990219116, "learning_rate": 1.902190610979626e-05, "loss": 0.4869, "step": 1996 }, { "epoch": 0.4454606290430515, "grad_norm": 0.17407076060771942, "learning_rate": 1.9020890737517783e-05, "loss": 0.5134, "step": 1997 }, { "epoch": 0.4456836939549409, "grad_norm": 0.16776099801063538, "learning_rate": 1.901987486560563e-05, "loss": 0.5079, "step": 1998 }, { "epoch": 0.44590675886683023, "grad_norm": 0.2472338080406189, "learning_rate": 1.9018858494116074e-05, "loss": 0.5128, "step": 1999 }, { "epoch": 0.4461298237787196, "grad_norm": 0.16303667426109314, "learning_rate": 1.90178416231054e-05, "loss": 0.5225, "step": 2000 }, { "epoch": 0.44635288869060896, "grad_norm": 0.17127907276153564, "learning_rate": 1.901682425262993e-05, "loss": 0.5092, "step": 2001 }, { "epoch": 0.44657595360249835, "grad_norm": 0.17813526093959808, "learning_rate": 1.9015806382746018e-05, "loss": 0.5, "step": 2002 }, { "epoch": 0.4467990185143877, "grad_norm": 0.1653776615858078, "learning_rate": 1.901478801351004e-05, "loss": 0.5051, "step": 2003 }, { "epoch": 0.44702208342627703, "grad_norm": 0.16295598447322845, "learning_rate": 1.9013769144978392e-05, "loss": 0.4955, "step": 2004 }, { "epoch": 0.4472451483381664, "grad_norm": 0.18259268999099731, "learning_rate": 1.901274977720751e-05, "loss": 0.5504, "step": 2005 }, { "epoch": 0.44746821325005576, "grad_norm": 0.19138063490390778, "learning_rate": 1.9011729910253856e-05, "loss": 0.5129, "step": 2006 }, { "epoch": 0.44769127816194515, "grad_norm": 0.1588236689567566, "learning_rate": 1.9010709544173913e-05, "loss": 0.5078, "step": 2007 }, { "epoch": 0.4479143430738345, "grad_norm": 0.162478968501091, "learning_rate": 1.900968867902419e-05, "loss": 0.5121, "step": 2008 }, { "epoch": 0.4481374079857238, "grad_norm": 0.17752927541732788, "learning_rate": 1.900866731486124e-05, "loss": 0.4983, "step": 2009 }, { "epoch": 0.4483604728976132, "grad_norm": 0.17127980291843414, "learning_rate": 1.900764545174163e-05, "loss": 0.5347, "step": 2010 }, { "epoch": 0.44858353780950255, "grad_norm": 0.1686343103647232, "learning_rate": 1.900662308972195e-05, "loss": 0.5107, "step": 2011 }, { "epoch": 0.44880660272139195, "grad_norm": 0.16008520126342773, "learning_rate": 1.9005600228858832e-05, "loss": 0.5036, "step": 2012 }, { "epoch": 0.4490296676332813, "grad_norm": 0.16494229435920715, "learning_rate": 1.9004576869208922e-05, "loss": 0.5135, "step": 2013 }, { "epoch": 0.4492527325451706, "grad_norm": 0.17318038642406464, "learning_rate": 1.9003553010828906e-05, "loss": 0.5279, "step": 2014 }, { "epoch": 0.44947579745706, "grad_norm": 0.252083420753479, "learning_rate": 1.9002528653775492e-05, "loss": 0.5176, "step": 2015 }, { "epoch": 0.44969886236894935, "grad_norm": 0.1737111508846283, "learning_rate": 1.900150379810541e-05, "loss": 0.5051, "step": 2016 }, { "epoch": 0.44992192728083874, "grad_norm": 0.172331303358078, "learning_rate": 1.9000478443875427e-05, "loss": 0.5161, "step": 2017 }, { "epoch": 0.4501449921927281, "grad_norm": 0.17037436366081238, "learning_rate": 1.899945259114233e-05, "loss": 0.5282, "step": 2018 }, { "epoch": 0.45036805710461747, "grad_norm": 0.16774620115756989, "learning_rate": 1.8998426239962945e-05, "loss": 0.5056, "step": 2019 }, { "epoch": 0.4505911220165068, "grad_norm": 0.17856718599796295, "learning_rate": 1.899739939039411e-05, "loss": 0.514, "step": 2020 }, { "epoch": 0.45081418692839614, "grad_norm": 0.18140466511249542, "learning_rate": 1.89963720424927e-05, "loss": 0.5372, "step": 2021 }, { "epoch": 0.45103725184028554, "grad_norm": 0.1641564816236496, "learning_rate": 1.8995344196315618e-05, "loss": 0.5205, "step": 2022 }, { "epoch": 0.4512603167521749, "grad_norm": 0.16866321861743927, "learning_rate": 1.899431585191979e-05, "loss": 0.4906, "step": 2023 }, { "epoch": 0.45148338166406426, "grad_norm": 0.16418704390525818, "learning_rate": 1.8993287009362175e-05, "loss": 0.5275, "step": 2024 }, { "epoch": 0.4517064465759536, "grad_norm": 0.15996740758419037, "learning_rate": 1.8992257668699756e-05, "loss": 0.5124, "step": 2025 }, { "epoch": 0.45192951148784294, "grad_norm": 0.16520540416240692, "learning_rate": 1.899122782998954e-05, "loss": 0.5045, "step": 2026 }, { "epoch": 0.45215257639973233, "grad_norm": 0.16536462306976318, "learning_rate": 1.8990197493288575e-05, "loss": 0.5003, "step": 2027 }, { "epoch": 0.45237564131162167, "grad_norm": 0.16777758300304413, "learning_rate": 1.8989166658653916e-05, "loss": 0.5443, "step": 2028 }, { "epoch": 0.45259870622351106, "grad_norm": 0.16215604543685913, "learning_rate": 1.8988135326142668e-05, "loss": 0.4965, "step": 2029 }, { "epoch": 0.4528217711354004, "grad_norm": 0.1622023582458496, "learning_rate": 1.8987103495811947e-05, "loss": 0.4606, "step": 2030 }, { "epoch": 0.45304483604728973, "grad_norm": 0.16461379826068878, "learning_rate": 1.8986071167718902e-05, "loss": 0.5273, "step": 2031 }, { "epoch": 0.4532679009591791, "grad_norm": 0.16976390779018402, "learning_rate": 1.8985038341920715e-05, "loss": 0.4889, "step": 2032 }, { "epoch": 0.45349096587106846, "grad_norm": 0.1569637656211853, "learning_rate": 1.898400501847458e-05, "loss": 0.4828, "step": 2033 }, { "epoch": 0.45371403078295786, "grad_norm": 0.16640910506248474, "learning_rate": 1.898297119743774e-05, "loss": 0.5351, "step": 2034 }, { "epoch": 0.4539370956948472, "grad_norm": 0.1644599735736847, "learning_rate": 1.898193687886745e-05, "loss": 0.4869, "step": 2035 }, { "epoch": 0.4541601606067366, "grad_norm": 0.16732025146484375, "learning_rate": 1.8980902062820997e-05, "loss": 0.4893, "step": 2036 }, { "epoch": 0.4543832255186259, "grad_norm": 0.16019105911254883, "learning_rate": 1.8979866749355694e-05, "loss": 0.5038, "step": 2037 }, { "epoch": 0.45460629043051526, "grad_norm": 0.17239058017730713, "learning_rate": 1.8978830938528884e-05, "loss": 0.5106, "step": 2038 }, { "epoch": 0.45482935534240465, "grad_norm": 0.15983308851718903, "learning_rate": 1.8977794630397942e-05, "loss": 0.5163, "step": 2039 }, { "epoch": 0.455052420254294, "grad_norm": 0.16848912835121155, "learning_rate": 1.8976757825020255e-05, "loss": 0.5462, "step": 2040 }, { "epoch": 0.4552754851661834, "grad_norm": 0.15303179621696472, "learning_rate": 1.897572052245326e-05, "loss": 0.4594, "step": 2041 }, { "epoch": 0.4554985500780727, "grad_norm": 0.17675776779651642, "learning_rate": 1.8974682722754397e-05, "loss": 0.5385, "step": 2042 }, { "epoch": 0.45572161498996205, "grad_norm": 0.17349328100681305, "learning_rate": 1.8973644425981154e-05, "loss": 0.4953, "step": 2043 }, { "epoch": 0.45594467990185145, "grad_norm": 0.15753738582134247, "learning_rate": 1.897260563219104e-05, "loss": 0.4851, "step": 2044 }, { "epoch": 0.4561677448137408, "grad_norm": 0.16278420388698578, "learning_rate": 1.897156634144158e-05, "loss": 0.4994, "step": 2045 }, { "epoch": 0.4563908097256302, "grad_norm": 0.1592012643814087, "learning_rate": 1.8970526553790346e-05, "loss": 0.513, "step": 2046 }, { "epoch": 0.4566138746375195, "grad_norm": 0.18133483827114105, "learning_rate": 1.8969486269294922e-05, "loss": 0.4734, "step": 2047 }, { "epoch": 0.4568369395494089, "grad_norm": 0.170905202627182, "learning_rate": 1.8968445488012933e-05, "loss": 0.5401, "step": 2048 }, { "epoch": 0.45706000446129824, "grad_norm": 0.16522808372974396, "learning_rate": 1.8967404210002014e-05, "loss": 0.5008, "step": 2049 }, { "epoch": 0.4572830693731876, "grad_norm": 0.18945086002349854, "learning_rate": 1.8966362435319845e-05, "loss": 0.5064, "step": 2050 }, { "epoch": 0.45750613428507697, "grad_norm": 0.17545610666275024, "learning_rate": 1.8965320164024123e-05, "loss": 0.5105, "step": 2051 }, { "epoch": 0.4577291991969663, "grad_norm": 0.1666799783706665, "learning_rate": 1.8964277396172577e-05, "loss": 0.5233, "step": 2052 }, { "epoch": 0.4579522641088557, "grad_norm": 0.16869474947452545, "learning_rate": 1.896323413182296e-05, "loss": 0.4921, "step": 2053 }, { "epoch": 0.45817532902074504, "grad_norm": 0.17108272016048431, "learning_rate": 1.8962190371033057e-05, "loss": 0.5358, "step": 2054 }, { "epoch": 0.4583983939326344, "grad_norm": 0.16527149081230164, "learning_rate": 1.8961146113860676e-05, "loss": 0.5033, "step": 2055 }, { "epoch": 0.45862145884452377, "grad_norm": 0.17335473001003265, "learning_rate": 1.8960101360363656e-05, "loss": 0.5156, "step": 2056 }, { "epoch": 0.4588445237564131, "grad_norm": 0.17064844071865082, "learning_rate": 1.895905611059986e-05, "loss": 0.5118, "step": 2057 }, { "epoch": 0.4590675886683025, "grad_norm": 0.16565892100334167, "learning_rate": 1.8958010364627183e-05, "loss": 0.5152, "step": 2058 }, { "epoch": 0.45929065358019183, "grad_norm": 0.1769896149635315, "learning_rate": 1.8956964122503546e-05, "loss": 0.5483, "step": 2059 }, { "epoch": 0.45951371849208117, "grad_norm": 0.1574466973543167, "learning_rate": 1.895591738428689e-05, "loss": 0.5205, "step": 2060 }, { "epoch": 0.45973678340397056, "grad_norm": 0.16593651473522186, "learning_rate": 1.8954870150035195e-05, "loss": 0.4863, "step": 2061 }, { "epoch": 0.4599598483158599, "grad_norm": 0.16003793478012085, "learning_rate": 1.8953822419806468e-05, "loss": 0.5051, "step": 2062 }, { "epoch": 0.4601829132277493, "grad_norm": 0.17091605067253113, "learning_rate": 1.895277419365873e-05, "loss": 0.5037, "step": 2063 }, { "epoch": 0.4604059781396386, "grad_norm": 0.17469583451747894, "learning_rate": 1.895172547165004e-05, "loss": 0.4878, "step": 2064 }, { "epoch": 0.460629043051528, "grad_norm": 0.16121211647987366, "learning_rate": 1.895067625383849e-05, "loss": 0.4768, "step": 2065 }, { "epoch": 0.46085210796341736, "grad_norm": 0.1591392159461975, "learning_rate": 1.894962654028218e-05, "loss": 0.5138, "step": 2066 }, { "epoch": 0.4610751728753067, "grad_norm": 1.2021454572677612, "learning_rate": 1.8948576331039264e-05, "loss": 0.5438, "step": 2067 }, { "epoch": 0.4612982377871961, "grad_norm": 0.1606753021478653, "learning_rate": 1.8947525626167896e-05, "loss": 0.5376, "step": 2068 }, { "epoch": 0.4615213026990854, "grad_norm": 0.17104879021644592, "learning_rate": 1.894647442572628e-05, "loss": 0.5164, "step": 2069 }, { "epoch": 0.4617443676109748, "grad_norm": 0.16374671459197998, "learning_rate": 1.8945422729772633e-05, "loss": 0.516, "step": 2070 }, { "epoch": 0.46196743252286415, "grad_norm": 0.16128143668174744, "learning_rate": 1.8944370538365206e-05, "loss": 0.5157, "step": 2071 }, { "epoch": 0.4621904974347535, "grad_norm": 0.16720320284366608, "learning_rate": 1.8943317851562278e-05, "loss": 0.5291, "step": 2072 }, { "epoch": 0.4624135623466429, "grad_norm": 0.1879906952381134, "learning_rate": 1.8942264669422154e-05, "loss": 0.5145, "step": 2073 }, { "epoch": 0.4626366272585322, "grad_norm": 0.17528733611106873, "learning_rate": 1.894121099200316e-05, "loss": 0.5235, "step": 2074 }, { "epoch": 0.4628596921704216, "grad_norm": 0.16449770331382751, "learning_rate": 1.894015681936366e-05, "loss": 0.5165, "step": 2075 }, { "epoch": 0.46308275708231095, "grad_norm": 0.16646511852741241, "learning_rate": 1.8939102151562036e-05, "loss": 0.5093, "step": 2076 }, { "epoch": 0.46330582199420034, "grad_norm": 0.17490622401237488, "learning_rate": 1.893804698865671e-05, "loss": 0.5385, "step": 2077 }, { "epoch": 0.4635288869060897, "grad_norm": 0.2023608237504959, "learning_rate": 1.893699133070612e-05, "loss": 0.4972, "step": 2078 }, { "epoch": 0.463751951817979, "grad_norm": 0.16244065761566162, "learning_rate": 1.893593517776873e-05, "loss": 0.5269, "step": 2079 }, { "epoch": 0.4639750167298684, "grad_norm": 0.16335059702396393, "learning_rate": 1.8934878529903043e-05, "loss": 0.5107, "step": 2080 }, { "epoch": 0.46419808164175774, "grad_norm": 0.16115811467170715, "learning_rate": 1.8933821387167582e-05, "loss": 0.5214, "step": 2081 }, { "epoch": 0.46442114655364714, "grad_norm": 0.1833171248435974, "learning_rate": 1.8932763749620894e-05, "loss": 0.4664, "step": 2082 }, { "epoch": 0.46464421146553647, "grad_norm": 0.1721174716949463, "learning_rate": 1.893170561732156e-05, "loss": 0.5224, "step": 2083 }, { "epoch": 0.4648672763774258, "grad_norm": 0.16398632526397705, "learning_rate": 1.8930646990328188e-05, "loss": 0.4968, "step": 2084 }, { "epoch": 0.4650903412893152, "grad_norm": 0.17112642526626587, "learning_rate": 1.892958786869941e-05, "loss": 0.5241, "step": 2085 }, { "epoch": 0.46531340620120454, "grad_norm": 0.18161650002002716, "learning_rate": 1.8928528252493884e-05, "loss": 0.5047, "step": 2086 }, { "epoch": 0.46553647111309393, "grad_norm": 0.1646048128604889, "learning_rate": 1.8927468141770304e-05, "loss": 0.5271, "step": 2087 }, { "epoch": 0.46575953602498327, "grad_norm": 0.16542619466781616, "learning_rate": 1.8926407536587378e-05, "loss": 0.5338, "step": 2088 }, { "epoch": 0.4659826009368726, "grad_norm": 0.17052631080150604, "learning_rate": 1.8925346437003856e-05, "loss": 0.5276, "step": 2089 }, { "epoch": 0.466205665848762, "grad_norm": 0.17085763812065125, "learning_rate": 1.8924284843078503e-05, "loss": 0.5277, "step": 2090 }, { "epoch": 0.46642873076065133, "grad_norm": 0.1689939796924591, "learning_rate": 1.8923222754870124e-05, "loss": 0.4889, "step": 2091 }, { "epoch": 0.4666517956725407, "grad_norm": 0.16189821064472198, "learning_rate": 1.8922160172437535e-05, "loss": 0.5009, "step": 2092 }, { "epoch": 0.46687486058443006, "grad_norm": 0.16525843739509583, "learning_rate": 1.89210970958396e-05, "loss": 0.5073, "step": 2093 }, { "epoch": 0.46709792549631945, "grad_norm": 0.159906804561615, "learning_rate": 1.8920033525135184e-05, "loss": 0.5107, "step": 2094 }, { "epoch": 0.4673209904082088, "grad_norm": 0.15631826221942902, "learning_rate": 1.8918969460383205e-05, "loss": 0.4839, "step": 2095 }, { "epoch": 0.46754405532009813, "grad_norm": 0.1734917014837265, "learning_rate": 1.8917904901642593e-05, "loss": 0.5121, "step": 2096 }, { "epoch": 0.4677671202319875, "grad_norm": 0.17475415766239166, "learning_rate": 1.8916839848972315e-05, "loss": 0.5198, "step": 2097 }, { "epoch": 0.46799018514387686, "grad_norm": 0.15618795156478882, "learning_rate": 1.8915774302431357e-05, "loss": 0.4733, "step": 2098 }, { "epoch": 0.46821325005576625, "grad_norm": 0.17231950163841248, "learning_rate": 1.8914708262078735e-05, "loss": 0.4975, "step": 2099 }, { "epoch": 0.4684363149676556, "grad_norm": 0.1630786657333374, "learning_rate": 1.891364172797349e-05, "loss": 0.4906, "step": 2100 }, { "epoch": 0.4686593798795449, "grad_norm": 0.20263253152370453, "learning_rate": 1.89125747001747e-05, "loss": 0.4961, "step": 2101 }, { "epoch": 0.4688824447914343, "grad_norm": 0.1711629182100296, "learning_rate": 1.891150717874146e-05, "loss": 0.5197, "step": 2102 }, { "epoch": 0.46910550970332365, "grad_norm": 0.15791350603103638, "learning_rate": 1.89104391637329e-05, "loss": 0.5151, "step": 2103 }, { "epoch": 0.46932857461521305, "grad_norm": 0.1648074984550476, "learning_rate": 1.890937065520817e-05, "loss": 0.5206, "step": 2104 }, { "epoch": 0.4695516395271024, "grad_norm": 0.16917914152145386, "learning_rate": 1.8908301653226448e-05, "loss": 0.5308, "step": 2105 }, { "epoch": 0.4697747044389917, "grad_norm": 0.16751410067081451, "learning_rate": 1.8907232157846946e-05, "loss": 0.512, "step": 2106 }, { "epoch": 0.4699977693508811, "grad_norm": 0.16895310580730438, "learning_rate": 1.89061621691289e-05, "loss": 0.5134, "step": 2107 }, { "epoch": 0.47022083426277045, "grad_norm": 0.1659642457962036, "learning_rate": 1.8905091687131567e-05, "loss": 0.5125, "step": 2108 }, { "epoch": 0.47044389917465984, "grad_norm": 0.18306776881217957, "learning_rate": 1.8904020711914243e-05, "loss": 0.5404, "step": 2109 }, { "epoch": 0.4706669640865492, "grad_norm": 0.15722155570983887, "learning_rate": 1.8902949243536245e-05, "loss": 0.4929, "step": 2110 }, { "epoch": 0.47089002899843857, "grad_norm": 0.17024749517440796, "learning_rate": 1.8901877282056916e-05, "loss": 0.5129, "step": 2111 }, { "epoch": 0.4711130939103279, "grad_norm": 0.1668287068605423, "learning_rate": 1.8900804827535626e-05, "loss": 0.5386, "step": 2112 }, { "epoch": 0.47133615882221724, "grad_norm": 0.17657139897346497, "learning_rate": 1.8899731880031778e-05, "loss": 0.4682, "step": 2113 }, { "epoch": 0.47155922373410664, "grad_norm": 0.17689798772335052, "learning_rate": 1.8898658439604798e-05, "loss": 0.4814, "step": 2114 }, { "epoch": 0.471782288645996, "grad_norm": 0.16755974292755127, "learning_rate": 1.8897584506314137e-05, "loss": 0.4956, "step": 2115 }, { "epoch": 0.47200535355788537, "grad_norm": 0.16352280974388123, "learning_rate": 1.8896510080219277e-05, "loss": 0.5034, "step": 2116 }, { "epoch": 0.4722284184697747, "grad_norm": 0.16078925132751465, "learning_rate": 1.889543516137973e-05, "loss": 0.5017, "step": 2117 }, { "epoch": 0.47245148338166404, "grad_norm": 0.17038793861865997, "learning_rate": 1.8894359749855027e-05, "loss": 0.5128, "step": 2118 }, { "epoch": 0.47267454829355343, "grad_norm": 0.16477574408054352, "learning_rate": 1.8893283845704733e-05, "loss": 0.4765, "step": 2119 }, { "epoch": 0.47289761320544277, "grad_norm": 0.18243131041526794, "learning_rate": 1.889220744898844e-05, "loss": 0.5266, "step": 2120 }, { "epoch": 0.47312067811733216, "grad_norm": 0.15715399384498596, "learning_rate": 1.8891130559765763e-05, "loss": 0.5092, "step": 2121 }, { "epoch": 0.4733437430292215, "grad_norm": 0.16996978223323822, "learning_rate": 1.8890053178096353e-05, "loss": 0.5166, "step": 2122 }, { "epoch": 0.4735668079411109, "grad_norm": 0.16575470566749573, "learning_rate": 1.888897530403987e-05, "loss": 0.5212, "step": 2123 }, { "epoch": 0.4737898728530002, "grad_norm": 0.1607031673192978, "learning_rate": 1.8887896937656028e-05, "loss": 0.5067, "step": 2124 }, { "epoch": 0.47401293776488956, "grad_norm": 0.157185897231102, "learning_rate": 1.8886818079004545e-05, "loss": 0.4801, "step": 2125 }, { "epoch": 0.47423600267677896, "grad_norm": 0.1661345213651657, "learning_rate": 1.8885738728145173e-05, "loss": 0.5212, "step": 2126 }, { "epoch": 0.4744590675886683, "grad_norm": 0.17212405800819397, "learning_rate": 1.8884658885137698e-05, "loss": 0.4872, "step": 2127 }, { "epoch": 0.4746821325005577, "grad_norm": 0.1681511402130127, "learning_rate": 1.8883578550041925e-05, "loss": 0.5266, "step": 2128 }, { "epoch": 0.474905197412447, "grad_norm": 0.15793408453464508, "learning_rate": 1.8882497722917697e-05, "loss": 0.505, "step": 2129 }, { "epoch": 0.47512826232433636, "grad_norm": 0.1688978672027588, "learning_rate": 1.8881416403824867e-05, "loss": 0.5005, "step": 2130 }, { "epoch": 0.47535132723622575, "grad_norm": 0.17982251942157745, "learning_rate": 1.8880334592823333e-05, "loss": 0.5177, "step": 2131 }, { "epoch": 0.4755743921481151, "grad_norm": 0.15668730437755585, "learning_rate": 1.8879252289973008e-05, "loss": 0.5076, "step": 2132 }, { "epoch": 0.4757974570600045, "grad_norm": 0.15427713096141815, "learning_rate": 1.8878169495333843e-05, "loss": 0.4916, "step": 2133 }, { "epoch": 0.4760205219718938, "grad_norm": 0.1573878973722458, "learning_rate": 1.88770862089658e-05, "loss": 0.4851, "step": 2134 }, { "epoch": 0.47624358688378315, "grad_norm": 0.16141542792320251, "learning_rate": 1.887600243092889e-05, "loss": 0.5065, "step": 2135 }, { "epoch": 0.47646665179567255, "grad_norm": 0.1874970644712448, "learning_rate": 1.8874918161283127e-05, "loss": 0.4987, "step": 2136 }, { "epoch": 0.4766897167075619, "grad_norm": 0.1556268036365509, "learning_rate": 1.887383340008857e-05, "loss": 0.5001, "step": 2137 }, { "epoch": 0.4769127816194513, "grad_norm": 0.16177254915237427, "learning_rate": 1.8872748147405303e-05, "loss": 0.5145, "step": 2138 }, { "epoch": 0.4771358465313406, "grad_norm": 0.16272678971290588, "learning_rate": 1.8871662403293434e-05, "loss": 0.5111, "step": 2139 }, { "epoch": 0.47735891144323, "grad_norm": 0.157213494181633, "learning_rate": 1.8870576167813096e-05, "loss": 0.4975, "step": 2140 }, { "epoch": 0.47758197635511934, "grad_norm": 0.16583600640296936, "learning_rate": 1.886948944102445e-05, "loss": 0.4835, "step": 2141 }, { "epoch": 0.4778050412670087, "grad_norm": 0.15528440475463867, "learning_rate": 1.8868402222987687e-05, "loss": 0.4821, "step": 2142 }, { "epoch": 0.47802810617889807, "grad_norm": 0.1678980439901352, "learning_rate": 1.8867314513763023e-05, "loss": 0.5126, "step": 2143 }, { "epoch": 0.4782511710907874, "grad_norm": 0.1719445437192917, "learning_rate": 1.886622631341071e-05, "loss": 0.5278, "step": 2144 }, { "epoch": 0.4784742360026768, "grad_norm": 0.16177958250045776, "learning_rate": 1.886513762199101e-05, "loss": 0.4716, "step": 2145 }, { "epoch": 0.47869730091456614, "grad_norm": 0.17055903375148773, "learning_rate": 1.886404843956422e-05, "loss": 0.5155, "step": 2146 }, { "epoch": 0.4789203658264555, "grad_norm": 0.16013142466545105, "learning_rate": 1.8862958766190673e-05, "loss": 0.4882, "step": 2147 }, { "epoch": 0.47914343073834487, "grad_norm": 0.15844453871250153, "learning_rate": 1.886186860193072e-05, "loss": 0.5125, "step": 2148 }, { "epoch": 0.4793664956502342, "grad_norm": 0.16520345211029053, "learning_rate": 1.886077794684474e-05, "loss": 0.5355, "step": 2149 }, { "epoch": 0.4795895605621236, "grad_norm": 0.16560572385787964, "learning_rate": 1.885968680099314e-05, "loss": 0.5011, "step": 2150 }, { "epoch": 0.47981262547401293, "grad_norm": 0.16460593044757843, "learning_rate": 1.885859516443636e-05, "loss": 0.5186, "step": 2151 }, { "epoch": 0.4800356903859023, "grad_norm": 0.16416500508785248, "learning_rate": 1.885750303723485e-05, "loss": 0.5121, "step": 2152 }, { "epoch": 0.48025875529779166, "grad_norm": 0.1615086942911148, "learning_rate": 1.8856410419449108e-05, "loss": 0.5291, "step": 2153 }, { "epoch": 0.480481820209681, "grad_norm": 0.16628290712833405, "learning_rate": 1.885531731113965e-05, "loss": 0.5166, "step": 2154 }, { "epoch": 0.4807048851215704, "grad_norm": 0.212614044547081, "learning_rate": 1.8854223712367017e-05, "loss": 0.4687, "step": 2155 }, { "epoch": 0.48092795003345973, "grad_norm": 0.16417112946510315, "learning_rate": 1.8853129623191775e-05, "loss": 0.4839, "step": 2156 }, { "epoch": 0.4811510149453491, "grad_norm": 0.16616222262382507, "learning_rate": 1.8852035043674534e-05, "loss": 0.5013, "step": 2157 }, { "epoch": 0.48137407985723846, "grad_norm": 0.16397857666015625, "learning_rate": 1.8850939973875907e-05, "loss": 0.5163, "step": 2158 }, { "epoch": 0.4815971447691278, "grad_norm": 0.1688777208328247, "learning_rate": 1.8849844413856548e-05, "loss": 0.5154, "step": 2159 }, { "epoch": 0.4818202096810172, "grad_norm": 0.16248802840709686, "learning_rate": 1.884874836367714e-05, "loss": 0.49, "step": 2160 }, { "epoch": 0.4820432745929065, "grad_norm": 0.17419035732746124, "learning_rate": 1.8847651823398385e-05, "loss": 0.5105, "step": 2161 }, { "epoch": 0.4822663395047959, "grad_norm": 0.16489480435848236, "learning_rate": 1.884655479308102e-05, "loss": 0.4967, "step": 2162 }, { "epoch": 0.48248940441668525, "grad_norm": 0.16695787012577057, "learning_rate": 1.8845457272785802e-05, "loss": 0.5316, "step": 2163 }, { "epoch": 0.4827124693285746, "grad_norm": 0.1630323827266693, "learning_rate": 1.884435926257352e-05, "loss": 0.5144, "step": 2164 }, { "epoch": 0.482935534240464, "grad_norm": 0.16861893236637115, "learning_rate": 1.8843260762504985e-05, "loss": 0.5402, "step": 2165 }, { "epoch": 0.4831585991523533, "grad_norm": 0.16794002056121826, "learning_rate": 1.884216177264105e-05, "loss": 0.5581, "step": 2166 }, { "epoch": 0.4833816640642427, "grad_norm": 0.16783180832862854, "learning_rate": 1.8841062293042572e-05, "loss": 0.4884, "step": 2167 }, { "epoch": 0.48360472897613205, "grad_norm": 0.16696153581142426, "learning_rate": 1.8839962323770455e-05, "loss": 0.4893, "step": 2168 }, { "epoch": 0.48382779388802144, "grad_norm": 0.17243477702140808, "learning_rate": 1.8838861864885617e-05, "loss": 0.5134, "step": 2169 }, { "epoch": 0.4840508587999108, "grad_norm": 0.16844888031482697, "learning_rate": 1.883776091644901e-05, "loss": 0.5203, "step": 2170 }, { "epoch": 0.4842739237118001, "grad_norm": 0.16011419892311096, "learning_rate": 1.8836659478521614e-05, "loss": 0.4837, "step": 2171 }, { "epoch": 0.4844969886236895, "grad_norm": 0.16117213666439056, "learning_rate": 1.883555755116443e-05, "loss": 0.5124, "step": 2172 }, { "epoch": 0.48472005353557884, "grad_norm": 0.15577594935894012, "learning_rate": 1.883445513443849e-05, "loss": 0.4939, "step": 2173 }, { "epoch": 0.48494311844746824, "grad_norm": 0.17128697037696838, "learning_rate": 1.883335222840485e-05, "loss": 0.5331, "step": 2174 }, { "epoch": 0.4851661833593576, "grad_norm": 0.16078589856624603, "learning_rate": 1.8832248833124606e-05, "loss": 0.5348, "step": 2175 }, { "epoch": 0.4853892482712469, "grad_norm": 0.17158381640911102, "learning_rate": 1.8831144948658863e-05, "loss": 0.5016, "step": 2176 }, { "epoch": 0.4856123131831363, "grad_norm": 0.17395475506782532, "learning_rate": 1.883004057506876e-05, "loss": 0.5145, "step": 2177 }, { "epoch": 0.48583537809502564, "grad_norm": 0.15187759697437286, "learning_rate": 1.882893571241547e-05, "loss": 0.4747, "step": 2178 }, { "epoch": 0.48605844300691503, "grad_norm": 0.1744384616613388, "learning_rate": 1.8827830360760184e-05, "loss": 0.488, "step": 2179 }, { "epoch": 0.48628150791880437, "grad_norm": 0.16714079678058624, "learning_rate": 1.8826724520164118e-05, "loss": 0.5117, "step": 2180 }, { "epoch": 0.4865045728306937, "grad_norm": 0.1541585475206375, "learning_rate": 1.8825618190688534e-05, "loss": 0.4808, "step": 2181 }, { "epoch": 0.4867276377425831, "grad_norm": 0.16642670333385468, "learning_rate": 1.8824511372394694e-05, "loss": 0.5223, "step": 2182 }, { "epoch": 0.48695070265447243, "grad_norm": 0.16418053209781647, "learning_rate": 1.8823404065343904e-05, "loss": 0.5022, "step": 2183 }, { "epoch": 0.4871737675663618, "grad_norm": 0.1558452993631363, "learning_rate": 1.88222962695975e-05, "loss": 0.5009, "step": 2184 }, { "epoch": 0.48739683247825116, "grad_norm": 0.16112567484378815, "learning_rate": 1.8821187985216835e-05, "loss": 0.4851, "step": 2185 }, { "epoch": 0.48761989739014056, "grad_norm": 0.15951795876026154, "learning_rate": 1.8820079212263287e-05, "loss": 0.5364, "step": 2186 }, { "epoch": 0.4878429623020299, "grad_norm": 0.15660803020000458, "learning_rate": 1.8818969950798274e-05, "loss": 0.4687, "step": 2187 }, { "epoch": 0.48806602721391923, "grad_norm": 0.16717955470085144, "learning_rate": 1.881786020088323e-05, "loss": 0.5184, "step": 2188 }, { "epoch": 0.4882890921258086, "grad_norm": 0.4413386583328247, "learning_rate": 1.8816749962579625e-05, "loss": 0.49, "step": 2189 }, { "epoch": 0.48851215703769796, "grad_norm": 0.18018092215061188, "learning_rate": 1.8815639235948945e-05, "loss": 0.4948, "step": 2190 }, { "epoch": 0.48873522194958735, "grad_norm": 0.17363835871219635, "learning_rate": 1.881452802105271e-05, "loss": 0.5255, "step": 2191 }, { "epoch": 0.4889582868614767, "grad_norm": 0.16417300701141357, "learning_rate": 1.8813416317952474e-05, "loss": 0.4984, "step": 2192 }, { "epoch": 0.489181351773366, "grad_norm": 0.1698407083749771, "learning_rate": 1.8812304126709797e-05, "loss": 0.5155, "step": 2193 }, { "epoch": 0.4894044166852554, "grad_norm": 0.16547353565692902, "learning_rate": 1.881119144738629e-05, "loss": 0.504, "step": 2194 }, { "epoch": 0.48962748159714475, "grad_norm": 0.15916848182678223, "learning_rate": 1.8810078280043574e-05, "loss": 0.5158, "step": 2195 }, { "epoch": 0.48985054650903415, "grad_norm": 0.16655400395393372, "learning_rate": 1.8808964624743303e-05, "loss": 0.521, "step": 2196 }, { "epoch": 0.4900736114209235, "grad_norm": 0.16791512072086334, "learning_rate": 1.8807850481547165e-05, "loss": 0.5327, "step": 2197 }, { "epoch": 0.4902966763328129, "grad_norm": 0.1612214297056198, "learning_rate": 1.880673585051686e-05, "loss": 0.5074, "step": 2198 }, { "epoch": 0.4905197412447022, "grad_norm": 0.33009010553359985, "learning_rate": 1.880562073171413e-05, "loss": 0.4991, "step": 2199 }, { "epoch": 0.49074280615659155, "grad_norm": 0.17632457613945007, "learning_rate": 1.8804505125200732e-05, "loss": 0.5094, "step": 2200 }, { "epoch": 0.49096587106848094, "grad_norm": 0.17380337417125702, "learning_rate": 1.8803389031038462e-05, "loss": 0.5132, "step": 2201 }, { "epoch": 0.4911889359803703, "grad_norm": 0.16013850271701813, "learning_rate": 1.880227244928913e-05, "loss": 0.4987, "step": 2202 }, { "epoch": 0.49141200089225967, "grad_norm": 0.18471701443195343, "learning_rate": 1.8801155380014578e-05, "loss": 0.5203, "step": 2203 }, { "epoch": 0.491635065804149, "grad_norm": 0.16928328573703766, "learning_rate": 1.8800037823276683e-05, "loss": 0.4868, "step": 2204 }, { "epoch": 0.49185813071603834, "grad_norm": 0.1749187409877777, "learning_rate": 1.8798919779137337e-05, "loss": 0.5049, "step": 2205 }, { "epoch": 0.49208119562792774, "grad_norm": 0.173324853181839, "learning_rate": 1.8797801247658465e-05, "loss": 0.4963, "step": 2206 }, { "epoch": 0.4923042605398171, "grad_norm": 0.16650435328483582, "learning_rate": 1.8796682228902024e-05, "loss": 0.5232, "step": 2207 }, { "epoch": 0.49252732545170647, "grad_norm": 0.16675227880477905, "learning_rate": 1.8795562722929986e-05, "loss": 0.4887, "step": 2208 }, { "epoch": 0.4927503903635958, "grad_norm": 0.20501257479190826, "learning_rate": 1.8794442729804356e-05, "loss": 0.5045, "step": 2209 }, { "epoch": 0.49297345527548514, "grad_norm": 0.15837359428405762, "learning_rate": 1.879332224958717e-05, "loss": 0.4704, "step": 2210 }, { "epoch": 0.49319652018737453, "grad_norm": 0.15580782294273376, "learning_rate": 1.8792201282340485e-05, "loss": 0.4753, "step": 2211 }, { "epoch": 0.49341958509926387, "grad_norm": 0.16155748069286346, "learning_rate": 1.879107982812639e-05, "loss": 0.5178, "step": 2212 }, { "epoch": 0.49364265001115326, "grad_norm": 0.17078126966953278, "learning_rate": 1.8789957887006994e-05, "loss": 0.5208, "step": 2213 }, { "epoch": 0.4938657149230426, "grad_norm": 0.16327013075351715, "learning_rate": 1.8788835459044438e-05, "loss": 0.5446, "step": 2214 }, { "epoch": 0.494088779834932, "grad_norm": 0.1604888141155243, "learning_rate": 1.878771254430089e-05, "loss": 0.5142, "step": 2215 }, { "epoch": 0.4943118447468213, "grad_norm": 0.16108398139476776, "learning_rate": 1.8786589142838548e-05, "loss": 0.517, "step": 2216 }, { "epoch": 0.49453490965871066, "grad_norm": 0.15508361160755157, "learning_rate": 1.8785465254719625e-05, "loss": 0.4782, "step": 2217 }, { "epoch": 0.49475797457060006, "grad_norm": 0.1612912118434906, "learning_rate": 1.878434088000638e-05, "loss": 0.5104, "step": 2218 }, { "epoch": 0.4949810394824894, "grad_norm": 0.1641056090593338, "learning_rate": 1.8783216018761075e-05, "loss": 0.5227, "step": 2219 }, { "epoch": 0.4952041043943788, "grad_norm": 0.16892564296722412, "learning_rate": 1.878209067104602e-05, "loss": 0.5142, "step": 2220 }, { "epoch": 0.4954271693062681, "grad_norm": 0.18777111172676086, "learning_rate": 1.8780964836923545e-05, "loss": 0.5091, "step": 2221 }, { "epoch": 0.49565023421815746, "grad_norm": 0.15524406731128693, "learning_rate": 1.8779838516455998e-05, "loss": 0.5155, "step": 2222 }, { "epoch": 0.49587329913004685, "grad_norm": 0.1688852459192276, "learning_rate": 1.877871170970577e-05, "loss": 0.5266, "step": 2223 }, { "epoch": 0.4960963640419362, "grad_norm": 0.2029954195022583, "learning_rate": 1.8777584416735268e-05, "loss": 0.4751, "step": 2224 }, { "epoch": 0.4963194289538256, "grad_norm": 0.168426051735878, "learning_rate": 1.8776456637606926e-05, "loss": 0.5263, "step": 2225 }, { "epoch": 0.4965424938657149, "grad_norm": 0.18985538184642792, "learning_rate": 1.877532837238321e-05, "loss": 0.528, "step": 2226 }, { "epoch": 0.4967655587776043, "grad_norm": 0.15908297896385193, "learning_rate": 1.8774199621126605e-05, "loss": 0.5226, "step": 2227 }, { "epoch": 0.49698862368949365, "grad_norm": 0.15700525045394897, "learning_rate": 1.8773070383899638e-05, "loss": 0.4857, "step": 2228 }, { "epoch": 0.497211688601383, "grad_norm": 0.17015953361988068, "learning_rate": 1.877194066076485e-05, "loss": 0.488, "step": 2229 }, { "epoch": 0.4974347535132724, "grad_norm": 0.21846015751361847, "learning_rate": 1.8770810451784806e-05, "loss": 0.5305, "step": 2230 }, { "epoch": 0.4976578184251617, "grad_norm": 0.16297823190689087, "learning_rate": 1.8769679757022114e-05, "loss": 0.494, "step": 2231 }, { "epoch": 0.4978808833370511, "grad_norm": 0.1758735030889511, "learning_rate": 1.876854857653939e-05, "loss": 0.5394, "step": 2232 }, { "epoch": 0.49810394824894044, "grad_norm": 0.1673547923564911, "learning_rate": 1.876741691039929e-05, "loss": 0.5035, "step": 2233 }, { "epoch": 0.4983270131608298, "grad_norm": 0.16478639841079712, "learning_rate": 1.8766284758664487e-05, "loss": 0.5, "step": 2234 }, { "epoch": 0.49855007807271917, "grad_norm": 0.16910696029663086, "learning_rate": 1.8765152121397697e-05, "loss": 0.5087, "step": 2235 }, { "epoch": 0.4987731429846085, "grad_norm": 0.16947272419929504, "learning_rate": 1.876401899866165e-05, "loss": 0.5064, "step": 2236 }, { "epoch": 0.4989962078964979, "grad_norm": 0.16172346472740173, "learning_rate": 1.87628853905191e-05, "loss": 0.4901, "step": 2237 }, { "epoch": 0.49921927280838724, "grad_norm": 0.16061736643314362, "learning_rate": 1.8761751297032838e-05, "loss": 0.5038, "step": 2238 }, { "epoch": 0.4994423377202766, "grad_norm": 0.15990641713142395, "learning_rate": 1.8760616718265676e-05, "loss": 0.4854, "step": 2239 }, { "epoch": 0.49966540263216597, "grad_norm": 0.16735920310020447, "learning_rate": 1.875948165428045e-05, "loss": 0.5027, "step": 2240 }, { "epoch": 0.4998884675440553, "grad_norm": 0.16122539341449738, "learning_rate": 1.8758346105140033e-05, "loss": 0.4922, "step": 2241 }, { "epoch": 0.5001115324559446, "grad_norm": 0.1718963235616684, "learning_rate": 1.8757210070907315e-05, "loss": 0.5138, "step": 2242 }, { "epoch": 0.500334597367834, "grad_norm": 0.17143313586711884, "learning_rate": 1.875607355164522e-05, "loss": 0.5308, "step": 2243 }, { "epoch": 0.5005576622797234, "grad_norm": 0.15493686497211456, "learning_rate": 1.875493654741669e-05, "loss": 0.5243, "step": 2244 }, { "epoch": 0.5007807271916127, "grad_norm": 0.162288635969162, "learning_rate": 1.8753799058284707e-05, "loss": 0.5052, "step": 2245 }, { "epoch": 0.5010037921035021, "grad_norm": 0.17301948368549347, "learning_rate": 1.8752661084312268e-05, "loss": 0.5119, "step": 2246 }, { "epoch": 0.5012268570153915, "grad_norm": 0.16729313135147095, "learning_rate": 1.8751522625562405e-05, "loss": 0.5372, "step": 2247 }, { "epoch": 0.5014499219272809, "grad_norm": 0.16277892887592316, "learning_rate": 1.8750383682098166e-05, "loss": 0.4847, "step": 2248 }, { "epoch": 0.5016729868391702, "grad_norm": 0.16257484257221222, "learning_rate": 1.8749244253982633e-05, "loss": 0.5105, "step": 2249 }, { "epoch": 0.5018960517510596, "grad_norm": 0.16304615139961243, "learning_rate": 1.8748104341278924e-05, "loss": 0.5073, "step": 2250 }, { "epoch": 0.502119116662949, "grad_norm": 0.1586243063211441, "learning_rate": 1.874696394405017e-05, "loss": 0.5096, "step": 2251 }, { "epoch": 0.5023421815748382, "grad_norm": 0.16425400972366333, "learning_rate": 1.874582306235953e-05, "loss": 0.4817, "step": 2252 }, { "epoch": 0.5025652464867276, "grad_norm": 0.16808679699897766, "learning_rate": 1.874468169627019e-05, "loss": 0.4866, "step": 2253 }, { "epoch": 0.502788311398617, "grad_norm": 0.1656448394060135, "learning_rate": 1.8743539845845378e-05, "loss": 0.4903, "step": 2254 }, { "epoch": 0.5030113763105064, "grad_norm": 0.16811887919902802, "learning_rate": 1.8742397511148328e-05, "loss": 0.4954, "step": 2255 }, { "epoch": 0.5032344412223957, "grad_norm": 0.17424499988555908, "learning_rate": 1.8741254692242315e-05, "loss": 0.5301, "step": 2256 }, { "epoch": 0.5034575061342851, "grad_norm": 0.17697711288928986, "learning_rate": 1.874011138919063e-05, "loss": 0.5103, "step": 2257 }, { "epoch": 0.5036805710461745, "grad_norm": 0.16454057395458221, "learning_rate": 1.8738967602056597e-05, "loss": 0.499, "step": 2258 }, { "epoch": 0.5039036359580638, "grad_norm": 0.16936853528022766, "learning_rate": 1.873782333090357e-05, "loss": 0.5029, "step": 2259 }, { "epoch": 0.5041267008699531, "grad_norm": 0.16321606934070587, "learning_rate": 1.873667857579492e-05, "loss": 0.5043, "step": 2260 }, { "epoch": 0.5043497657818425, "grad_norm": 0.17789475619792938, "learning_rate": 1.873553333679406e-05, "loss": 0.5206, "step": 2261 }, { "epoch": 0.5045728306937318, "grad_norm": 0.17053046822547913, "learning_rate": 1.8734387613964414e-05, "loss": 0.5241, "step": 2262 }, { "epoch": 0.5047958956056212, "grad_norm": 0.16408482193946838, "learning_rate": 1.8733241407369438e-05, "loss": 0.4999, "step": 2263 }, { "epoch": 0.5050189605175106, "grad_norm": 0.1780211627483368, "learning_rate": 1.873209471707262e-05, "loss": 0.5014, "step": 2264 }, { "epoch": 0.5052420254294, "grad_norm": 0.17059922218322754, "learning_rate": 1.873094754313747e-05, "loss": 0.5315, "step": 2265 }, { "epoch": 0.5054650903412893, "grad_norm": 0.17030328512191772, "learning_rate": 1.8729799885627528e-05, "loss": 0.5295, "step": 2266 }, { "epoch": 0.5056881552531787, "grad_norm": 0.1972958743572235, "learning_rate": 1.872865174460635e-05, "loss": 0.5094, "step": 2267 }, { "epoch": 0.5059112201650681, "grad_norm": 0.1663295030593872, "learning_rate": 1.8727503120137537e-05, "loss": 0.5023, "step": 2268 }, { "epoch": 0.5061342850769573, "grad_norm": 0.1621520221233368, "learning_rate": 1.87263540122847e-05, "loss": 0.5141, "step": 2269 }, { "epoch": 0.5063573499888467, "grad_norm": 0.35805073380470276, "learning_rate": 1.872520442111149e-05, "loss": 0.539, "step": 2270 }, { "epoch": 0.5065804149007361, "grad_norm": 0.16250286996364594, "learning_rate": 1.8724054346681573e-05, "loss": 0.5173, "step": 2271 }, { "epoch": 0.5068034798126255, "grad_norm": 0.16895325481891632, "learning_rate": 1.872290378905865e-05, "loss": 0.5041, "step": 2272 }, { "epoch": 0.5070265447245148, "grad_norm": 0.1690344661474228, "learning_rate": 1.872175274830645e-05, "loss": 0.5337, "step": 2273 }, { "epoch": 0.5072496096364042, "grad_norm": 0.15611621737480164, "learning_rate": 1.8720601224488716e-05, "loss": 0.4819, "step": 2274 }, { "epoch": 0.5074726745482936, "grad_norm": 0.18844662606716156, "learning_rate": 1.871944921766923e-05, "loss": 0.5082, "step": 2275 }, { "epoch": 0.5076957394601829, "grad_norm": 0.16826370358467102, "learning_rate": 1.8718296727911803e-05, "loss": 0.4914, "step": 2276 }, { "epoch": 0.5079188043720723, "grad_norm": 0.15561175346374512, "learning_rate": 1.871714375528026e-05, "loss": 0.5147, "step": 2277 }, { "epoch": 0.5081418692839617, "grad_norm": 0.1746017336845398, "learning_rate": 1.8715990299838463e-05, "loss": 0.5224, "step": 2278 }, { "epoch": 0.508364934195851, "grad_norm": 0.1721627563238144, "learning_rate": 1.8714836361650303e-05, "loss": 0.509, "step": 2279 }, { "epoch": 0.5085879991077403, "grad_norm": 0.15273146331310272, "learning_rate": 1.871368194077968e-05, "loss": 0.4933, "step": 2280 }, { "epoch": 0.5088110640196297, "grad_norm": 0.17017610371112823, "learning_rate": 1.8712527037290546e-05, "loss": 0.518, "step": 2281 }, { "epoch": 0.5090341289315191, "grad_norm": 0.16407892107963562, "learning_rate": 1.8711371651246854e-05, "loss": 0.5152, "step": 2282 }, { "epoch": 0.5092571938434084, "grad_norm": 0.18274806439876556, "learning_rate": 1.8710215782712606e-05, "loss": 0.4959, "step": 2283 }, { "epoch": 0.5094802587552978, "grad_norm": 0.17501875758171082, "learning_rate": 1.870905943175182e-05, "loss": 0.5372, "step": 2284 }, { "epoch": 0.5097033236671872, "grad_norm": 0.1640838235616684, "learning_rate": 1.870790259842854e-05, "loss": 0.5102, "step": 2285 }, { "epoch": 0.5099263885790765, "grad_norm": 0.15836408734321594, "learning_rate": 1.870674528280684e-05, "loss": 0.5194, "step": 2286 }, { "epoch": 0.5101494534909659, "grad_norm": 0.16688112914562225, "learning_rate": 1.8705587484950815e-05, "loss": 0.5077, "step": 2287 }, { "epoch": 0.5103725184028552, "grad_norm": 0.16494499146938324, "learning_rate": 1.8704429204924598e-05, "loss": 0.4954, "step": 2288 }, { "epoch": 0.5105955833147446, "grad_norm": 0.1580001264810562, "learning_rate": 1.8703270442792337e-05, "loss": 0.507, "step": 2289 }, { "epoch": 0.5108186482266339, "grad_norm": 0.17867796123027802, "learning_rate": 1.8702111198618213e-05, "loss": 0.5081, "step": 2290 }, { "epoch": 0.5110417131385233, "grad_norm": 0.28131747245788574, "learning_rate": 1.8700951472466435e-05, "loss": 0.5188, "step": 2291 }, { "epoch": 0.5112647780504127, "grad_norm": 0.1570468544960022, "learning_rate": 1.869979126440123e-05, "loss": 0.4992, "step": 2292 }, { "epoch": 0.511487842962302, "grad_norm": 0.1686072200536728, "learning_rate": 1.8698630574486862e-05, "loss": 0.4951, "step": 2293 }, { "epoch": 0.5117109078741914, "grad_norm": 0.1702122986316681, "learning_rate": 1.869746940278762e-05, "loss": 0.4965, "step": 2294 }, { "epoch": 0.5119339727860808, "grad_norm": 0.1632847636938095, "learning_rate": 1.8696307749367807e-05, "loss": 0.48, "step": 2295 }, { "epoch": 0.5121570376979702, "grad_norm": 0.20230402052402496, "learning_rate": 1.8695145614291773e-05, "loss": 0.5262, "step": 2296 }, { "epoch": 0.5123801026098594, "grad_norm": 0.17471125721931458, "learning_rate": 1.8693982997623877e-05, "loss": 0.5207, "step": 2297 }, { "epoch": 0.5126031675217488, "grad_norm": 0.17141500115394592, "learning_rate": 1.869281989942852e-05, "loss": 0.4995, "step": 2298 }, { "epoch": 0.5128262324336382, "grad_norm": 0.1641875058412552, "learning_rate": 1.8691656319770112e-05, "loss": 0.4773, "step": 2299 }, { "epoch": 0.5130492973455275, "grad_norm": 0.1655300408601761, "learning_rate": 1.8690492258713107e-05, "loss": 0.5171, "step": 2300 }, { "epoch": 0.5132723622574169, "grad_norm": 0.15635208785533905, "learning_rate": 1.8689327716321975e-05, "loss": 0.5005, "step": 2301 }, { "epoch": 0.5134954271693063, "grad_norm": 0.1540510356426239, "learning_rate": 1.8688162692661214e-05, "loss": 0.4925, "step": 2302 }, { "epoch": 0.5137184920811956, "grad_norm": 0.16273614764213562, "learning_rate": 1.8686997187795354e-05, "loss": 0.5153, "step": 2303 }, { "epoch": 0.513941556993085, "grad_norm": 0.16257858276367188, "learning_rate": 1.8685831201788945e-05, "loss": 0.495, "step": 2304 }, { "epoch": 0.5141646219049744, "grad_norm": 0.1771703064441681, "learning_rate": 1.8684664734706572e-05, "loss": 0.5499, "step": 2305 }, { "epoch": 0.5143876868168638, "grad_norm": 0.17417144775390625, "learning_rate": 1.8683497786612834e-05, "loss": 0.4777, "step": 2306 }, { "epoch": 0.514610751728753, "grad_norm": 0.2703874707221985, "learning_rate": 1.8682330357572368e-05, "loss": 0.5032, "step": 2307 }, { "epoch": 0.5148338166406424, "grad_norm": 0.16738004982471466, "learning_rate": 1.8681162447649834e-05, "loss": 0.514, "step": 2308 }, { "epoch": 0.5150568815525318, "grad_norm": 0.17614148557186127, "learning_rate": 1.8679994056909915e-05, "loss": 0.5193, "step": 2309 }, { "epoch": 0.5152799464644211, "grad_norm": 0.1655927300453186, "learning_rate": 1.8678825185417328e-05, "loss": 0.5253, "step": 2310 }, { "epoch": 0.5155030113763105, "grad_norm": 0.16509908437728882, "learning_rate": 1.867765583323681e-05, "loss": 0.5108, "step": 2311 }, { "epoch": 0.5157260762881999, "grad_norm": 0.17742401361465454, "learning_rate": 1.8676486000433123e-05, "loss": 0.5024, "step": 2312 }, { "epoch": 0.5159491412000893, "grad_norm": 0.1574956476688385, "learning_rate": 1.8675315687071068e-05, "loss": 0.4944, "step": 2313 }, { "epoch": 0.5161722061119786, "grad_norm": 0.16360540688037872, "learning_rate": 1.867414489321546e-05, "loss": 0.5054, "step": 2314 }, { "epoch": 0.516395271023868, "grad_norm": 0.1697058230638504, "learning_rate": 1.8672973618931144e-05, "loss": 0.5198, "step": 2315 }, { "epoch": 0.5166183359357573, "grad_norm": 0.16391722857952118, "learning_rate": 1.8671801864282996e-05, "loss": 0.5138, "step": 2316 }, { "epoch": 0.5168414008476466, "grad_norm": 0.1696222871541977, "learning_rate": 1.867062962933591e-05, "loss": 0.4929, "step": 2317 }, { "epoch": 0.517064465759536, "grad_norm": 0.16633786261081696, "learning_rate": 1.8669456914154817e-05, "loss": 0.4723, "step": 2318 }, { "epoch": 0.5172875306714254, "grad_norm": 0.16019189357757568, "learning_rate": 1.8668283718804664e-05, "loss": 0.5093, "step": 2319 }, { "epoch": 0.5175105955833147, "grad_norm": 0.1600850522518158, "learning_rate": 1.8667110043350435e-05, "loss": 0.4998, "step": 2320 }, { "epoch": 0.5177336604952041, "grad_norm": 0.1658647060394287, "learning_rate": 1.8665935887857136e-05, "loss": 0.5201, "step": 2321 }, { "epoch": 0.5179567254070935, "grad_norm": 0.16224202513694763, "learning_rate": 1.8664761252389795e-05, "loss": 0.4979, "step": 2322 }, { "epoch": 0.5181797903189829, "grad_norm": 0.16525104641914368, "learning_rate": 1.866358613701347e-05, "loss": 0.5168, "step": 2323 }, { "epoch": 0.5184028552308722, "grad_norm": 0.16819415986537933, "learning_rate": 1.866241054179325e-05, "loss": 0.5189, "step": 2324 }, { "epoch": 0.5186259201427615, "grad_norm": 0.1534615010023117, "learning_rate": 1.8661234466794246e-05, "loss": 0.4935, "step": 2325 }, { "epoch": 0.5188489850546509, "grad_norm": 0.15495775640010834, "learning_rate": 1.8660057912081598e-05, "loss": 0.5039, "step": 2326 }, { "epoch": 0.5190720499665402, "grad_norm": 0.16145430505275726, "learning_rate": 1.8658880877720467e-05, "loss": 0.4758, "step": 2327 }, { "epoch": 0.5192951148784296, "grad_norm": 0.16488440334796906, "learning_rate": 1.8657703363776044e-05, "loss": 0.5183, "step": 2328 }, { "epoch": 0.519518179790319, "grad_norm": 0.16661033034324646, "learning_rate": 1.8656525370313553e-05, "loss": 0.519, "step": 2329 }, { "epoch": 0.5197412447022084, "grad_norm": 0.15961341559886932, "learning_rate": 1.8655346897398234e-05, "loss": 0.478, "step": 2330 }, { "epoch": 0.5199643096140977, "grad_norm": 0.15573342144489288, "learning_rate": 1.865416794509536e-05, "loss": 0.4843, "step": 2331 }, { "epoch": 0.5201873745259871, "grad_norm": 0.16535112261772156, "learning_rate": 1.8652988513470227e-05, "loss": 0.4943, "step": 2332 }, { "epoch": 0.5204104394378765, "grad_norm": 0.16046664118766785, "learning_rate": 1.865180860258816e-05, "loss": 0.5445, "step": 2333 }, { "epoch": 0.5206335043497657, "grad_norm": 0.16381436586380005, "learning_rate": 1.8650628212514516e-05, "loss": 0.4956, "step": 2334 }, { "epoch": 0.5208565692616551, "grad_norm": 0.17565996944904327, "learning_rate": 1.864944734331466e-05, "loss": 0.5186, "step": 2335 }, { "epoch": 0.5210796341735445, "grad_norm": 0.15597450733184814, "learning_rate": 1.8648265995054005e-05, "loss": 0.4993, "step": 2336 }, { "epoch": 0.5213026990854338, "grad_norm": 0.22045518457889557, "learning_rate": 1.8647084167797982e-05, "loss": 0.4938, "step": 2337 }, { "epoch": 0.5215257639973232, "grad_norm": 0.14948628842830658, "learning_rate": 1.8645901861612044e-05, "loss": 0.4637, "step": 2338 }, { "epoch": 0.5217488289092126, "grad_norm": 0.16708412766456604, "learning_rate": 1.8644719076561675e-05, "loss": 0.5093, "step": 2339 }, { "epoch": 0.521971893821102, "grad_norm": 0.1535186767578125, "learning_rate": 1.8643535812712386e-05, "loss": 0.4894, "step": 2340 }, { "epoch": 0.5221949587329913, "grad_norm": 0.19612246751785278, "learning_rate": 1.8642352070129715e-05, "loss": 0.4719, "step": 2341 }, { "epoch": 0.5224180236448807, "grad_norm": 0.1691182553768158, "learning_rate": 1.8641167848879225e-05, "loss": 0.5119, "step": 2342 }, { "epoch": 0.52264108855677, "grad_norm": 0.16633757948875427, "learning_rate": 1.86399831490265e-05, "loss": 0.4941, "step": 2343 }, { "epoch": 0.5228641534686593, "grad_norm": 0.15868477523326874, "learning_rate": 1.8638797970637162e-05, "loss": 0.5114, "step": 2344 }, { "epoch": 0.5230872183805487, "grad_norm": 0.1649598479270935, "learning_rate": 1.8637612313776856e-05, "loss": 0.5446, "step": 2345 }, { "epoch": 0.5233102832924381, "grad_norm": 0.1706569939851761, "learning_rate": 1.8636426178511246e-05, "loss": 0.509, "step": 2346 }, { "epoch": 0.5235333482043275, "grad_norm": 0.16140028834342957, "learning_rate": 1.8635239564906026e-05, "loss": 0.4963, "step": 2347 }, { "epoch": 0.5237564131162168, "grad_norm": 0.16061624884605408, "learning_rate": 1.8634052473026925e-05, "loss": 0.5149, "step": 2348 }, { "epoch": 0.5239794780281062, "grad_norm": 0.1699933260679245, "learning_rate": 1.8632864902939684e-05, "loss": 0.5436, "step": 2349 }, { "epoch": 0.5242025429399956, "grad_norm": 0.16812016069889069, "learning_rate": 1.8631676854710082e-05, "loss": 0.5179, "step": 2350 }, { "epoch": 0.5244256078518849, "grad_norm": 0.1655721366405487, "learning_rate": 1.8630488328403924e-05, "loss": 0.5069, "step": 2351 }, { "epoch": 0.5246486727637742, "grad_norm": 0.28273990750312805, "learning_rate": 1.8629299324087032e-05, "loss": 0.5207, "step": 2352 }, { "epoch": 0.5248717376756636, "grad_norm": 0.167250394821167, "learning_rate": 1.8628109841825263e-05, "loss": 0.4995, "step": 2353 }, { "epoch": 0.525094802587553, "grad_norm": 0.15988002717494965, "learning_rate": 1.8626919881684497e-05, "loss": 0.5331, "step": 2354 }, { "epoch": 0.5253178674994423, "grad_norm": 0.18735967576503754, "learning_rate": 1.8625729443730643e-05, "loss": 0.508, "step": 2355 }, { "epoch": 0.5255409324113317, "grad_norm": 0.16121627390384674, "learning_rate": 1.8624538528029638e-05, "loss": 0.4926, "step": 2356 }, { "epoch": 0.5257639973232211, "grad_norm": 0.16314919292926788, "learning_rate": 1.8623347134647437e-05, "loss": 0.4936, "step": 2357 }, { "epoch": 0.5259870622351104, "grad_norm": 0.16220073401927948, "learning_rate": 1.862215526365003e-05, "loss": 0.4866, "step": 2358 }, { "epoch": 0.5262101271469998, "grad_norm": 0.18198350071907043, "learning_rate": 1.8620962915103425e-05, "loss": 0.4969, "step": 2359 }, { "epoch": 0.5264331920588892, "grad_norm": 0.16916455328464508, "learning_rate": 1.8619770089073665e-05, "loss": 0.4872, "step": 2360 }, { "epoch": 0.5266562569707784, "grad_norm": 0.19399681687355042, "learning_rate": 1.861857678562682e-05, "loss": 0.5037, "step": 2361 }, { "epoch": 0.5268793218826678, "grad_norm": 0.16876184940338135, "learning_rate": 1.8617383004828978e-05, "loss": 0.5265, "step": 2362 }, { "epoch": 0.5271023867945572, "grad_norm": 0.17060886323451996, "learning_rate": 1.8616188746746262e-05, "loss": 0.5247, "step": 2363 }, { "epoch": 0.5273254517064466, "grad_norm": 0.17065522074699402, "learning_rate": 1.8614994011444812e-05, "loss": 0.5258, "step": 2364 }, { "epoch": 0.5275485166183359, "grad_norm": 0.41077789664268494, "learning_rate": 1.8613798798990806e-05, "loss": 0.4971, "step": 2365 }, { "epoch": 0.5277715815302253, "grad_norm": 0.1597769409418106, "learning_rate": 1.8612603109450437e-05, "loss": 0.4971, "step": 2366 }, { "epoch": 0.5279946464421147, "grad_norm": 0.15872445702552795, "learning_rate": 1.8611406942889934e-05, "loss": 0.4987, "step": 2367 }, { "epoch": 0.528217711354004, "grad_norm": 0.1601647585630417, "learning_rate": 1.861021029937555e-05, "loss": 0.4791, "step": 2368 }, { "epoch": 0.5284407762658934, "grad_norm": 0.1533002108335495, "learning_rate": 1.8609013178973555e-05, "loss": 0.4842, "step": 2369 }, { "epoch": 0.5286638411777828, "grad_norm": 0.16213175654411316, "learning_rate": 1.8607815581750257e-05, "loss": 0.5063, "step": 2370 }, { "epoch": 0.5288869060896721, "grad_norm": 0.1895373910665512, "learning_rate": 1.860661750777199e-05, "loss": 0.4885, "step": 2371 }, { "epoch": 0.5291099710015614, "grad_norm": 0.1578717827796936, "learning_rate": 1.8605418957105105e-05, "loss": 0.4541, "step": 2372 }, { "epoch": 0.5293330359134508, "grad_norm": 0.17269465327262878, "learning_rate": 1.8604219929815987e-05, "loss": 0.5234, "step": 2373 }, { "epoch": 0.5295561008253402, "grad_norm": 0.16912835836410522, "learning_rate": 1.860302042597105e-05, "loss": 0.5295, "step": 2374 }, { "epoch": 0.5297791657372295, "grad_norm": 0.16457562148571014, "learning_rate": 1.8601820445636722e-05, "loss": 0.5096, "step": 2375 }, { "epoch": 0.5300022306491189, "grad_norm": 0.17353565990924835, "learning_rate": 1.860061998887947e-05, "loss": 0.5086, "step": 2376 }, { "epoch": 0.5302252955610083, "grad_norm": 0.1719389110803604, "learning_rate": 1.859941905576579e-05, "loss": 0.5045, "step": 2377 }, { "epoch": 0.5304483604728976, "grad_norm": 0.15863987803459167, "learning_rate": 1.8598217646362183e-05, "loss": 0.5007, "step": 2378 }, { "epoch": 0.530671425384787, "grad_norm": 0.16614408791065216, "learning_rate": 1.85970157607352e-05, "loss": 0.5123, "step": 2379 }, { "epoch": 0.5308944902966763, "grad_norm": 0.1657867580652237, "learning_rate": 1.859581339895141e-05, "loss": 0.5237, "step": 2380 }, { "epoch": 0.5311175552085657, "grad_norm": 0.1731244921684265, "learning_rate": 1.85946105610774e-05, "loss": 0.5407, "step": 2381 }, { "epoch": 0.531340620120455, "grad_norm": 0.15216992795467377, "learning_rate": 1.85934072471798e-05, "loss": 0.4709, "step": 2382 }, { "epoch": 0.5315636850323444, "grad_norm": 0.1583755612373352, "learning_rate": 1.8592203457325248e-05, "loss": 0.491, "step": 2383 }, { "epoch": 0.5317867499442338, "grad_norm": 0.16302180290222168, "learning_rate": 1.859099919158042e-05, "loss": 0.4984, "step": 2384 }, { "epoch": 0.5320098148561231, "grad_norm": 0.3964548110961914, "learning_rate": 1.858979445001202e-05, "loss": 0.5208, "step": 2385 }, { "epoch": 0.5322328797680125, "grad_norm": 0.1690683513879776, "learning_rate": 1.8588589232686768e-05, "loss": 0.4932, "step": 2386 }, { "epoch": 0.5324559446799019, "grad_norm": 0.18451258540153503, "learning_rate": 1.8587383539671424e-05, "loss": 0.5261, "step": 2387 }, { "epoch": 0.5326790095917913, "grad_norm": 0.18741649389266968, "learning_rate": 1.858617737103276e-05, "loss": 0.499, "step": 2388 }, { "epoch": 0.5329020745036805, "grad_norm": 0.16601239144802094, "learning_rate": 1.8584970726837587e-05, "loss": 0.5176, "step": 2389 }, { "epoch": 0.5331251394155699, "grad_norm": 0.16434918344020844, "learning_rate": 1.858376360715273e-05, "loss": 0.497, "step": 2390 }, { "epoch": 0.5333482043274593, "grad_norm": 0.16978149116039276, "learning_rate": 1.8582556012045053e-05, "loss": 0.4946, "step": 2391 }, { "epoch": 0.5335712692393486, "grad_norm": 0.17810384929180145, "learning_rate": 1.8581347941581438e-05, "loss": 0.5223, "step": 2392 }, { "epoch": 0.533794334151238, "grad_norm": 0.1722012311220169, "learning_rate": 1.8580139395828795e-05, "loss": 0.5082, "step": 2393 }, { "epoch": 0.5340173990631274, "grad_norm": 0.16442382335662842, "learning_rate": 1.857893037485406e-05, "loss": 0.4838, "step": 2394 }, { "epoch": 0.5342404639750167, "grad_norm": 0.17343981564044952, "learning_rate": 1.8577720878724195e-05, "loss": 0.5024, "step": 2395 }, { "epoch": 0.5344635288869061, "grad_norm": 0.1579180806875229, "learning_rate": 1.8576510907506192e-05, "loss": 0.5168, "step": 2396 }, { "epoch": 0.5346865937987955, "grad_norm": 0.1650589406490326, "learning_rate": 1.8575300461267073e-05, "loss": 0.5063, "step": 2397 }, { "epoch": 0.5349096587106849, "grad_norm": 0.17452824115753174, "learning_rate": 1.8574089540073868e-05, "loss": 0.5272, "step": 2398 }, { "epoch": 0.5351327236225741, "grad_norm": 0.16344332695007324, "learning_rate": 1.8572878143993652e-05, "loss": 0.4876, "step": 2399 }, { "epoch": 0.5353557885344635, "grad_norm": 0.16399069130420685, "learning_rate": 1.857166627309352e-05, "loss": 0.4929, "step": 2400 }, { "epoch": 0.5355788534463529, "grad_norm": 0.1590677946805954, "learning_rate": 1.857045392744059e-05, "loss": 0.4672, "step": 2401 }, { "epoch": 0.5358019183582422, "grad_norm": 0.15974955260753632, "learning_rate": 1.8569241107102014e-05, "loss": 0.504, "step": 2402 }, { "epoch": 0.5360249832701316, "grad_norm": 0.15898270905017853, "learning_rate": 1.856802781214496e-05, "loss": 0.5031, "step": 2403 }, { "epoch": 0.536248048182021, "grad_norm": 0.15949919819831848, "learning_rate": 1.856681404263663e-05, "loss": 0.4957, "step": 2404 }, { "epoch": 0.5364711130939104, "grad_norm": 0.1708550751209259, "learning_rate": 1.8565599798644253e-05, "loss": 0.5325, "step": 2405 }, { "epoch": 0.5366941780057997, "grad_norm": 0.1631123423576355, "learning_rate": 1.856438508023508e-05, "loss": 0.5044, "step": 2406 }, { "epoch": 0.536917242917689, "grad_norm": 0.17329680919647217, "learning_rate": 1.8563169887476386e-05, "loss": 0.4905, "step": 2407 }, { "epoch": 0.5371403078295784, "grad_norm": 0.1555010825395584, "learning_rate": 1.8561954220435483e-05, "loss": 0.4896, "step": 2408 }, { "epoch": 0.5373633727414677, "grad_norm": 0.1617012768983841, "learning_rate": 1.85607380791797e-05, "loss": 0.5199, "step": 2409 }, { "epoch": 0.5375864376533571, "grad_norm": 0.16019612550735474, "learning_rate": 1.8559521463776388e-05, "loss": 0.5331, "step": 2410 }, { "epoch": 0.5378095025652465, "grad_norm": 0.15812602639198303, "learning_rate": 1.855830437429294e-05, "loss": 0.4946, "step": 2411 }, { "epoch": 0.5380325674771358, "grad_norm": 0.16577614843845367, "learning_rate": 1.8557086810796756e-05, "loss": 0.5049, "step": 2412 }, { "epoch": 0.5382556323890252, "grad_norm": 3.4729576110839844, "learning_rate": 1.8555868773355283e-05, "loss": 0.5572, "step": 2413 }, { "epoch": 0.5384786973009146, "grad_norm": 0.177010640501976, "learning_rate": 1.8554650262035975e-05, "loss": 0.4871, "step": 2414 }, { "epoch": 0.538701762212804, "grad_norm": 0.16966521739959717, "learning_rate": 1.8553431276906328e-05, "loss": 0.5043, "step": 2415 }, { "epoch": 0.5389248271246933, "grad_norm": 0.16035056114196777, "learning_rate": 1.855221181803385e-05, "loss": 0.4951, "step": 2416 }, { "epoch": 0.5391478920365826, "grad_norm": 0.1640123724937439, "learning_rate": 1.8550991885486093e-05, "loss": 0.5175, "step": 2417 }, { "epoch": 0.539370956948472, "grad_norm": 0.16451038420200348, "learning_rate": 1.8549771479330612e-05, "loss": 0.5192, "step": 2418 }, { "epoch": 0.5395940218603613, "grad_norm": 0.16037864983081818, "learning_rate": 1.8548550599635007e-05, "loss": 0.5174, "step": 2419 }, { "epoch": 0.5398170867722507, "grad_norm": 0.1533394753932953, "learning_rate": 1.85473292464669e-05, "loss": 0.502, "step": 2420 }, { "epoch": 0.5400401516841401, "grad_norm": 0.16923432052135468, "learning_rate": 1.854610741989393e-05, "loss": 0.4917, "step": 2421 }, { "epoch": 0.5402632165960295, "grad_norm": 0.16241200268268585, "learning_rate": 1.8544885119983774e-05, "loss": 0.4922, "step": 2422 }, { "epoch": 0.5404862815079188, "grad_norm": 0.1572006493806839, "learning_rate": 1.8543662346804138e-05, "loss": 0.4771, "step": 2423 }, { "epoch": 0.5407093464198082, "grad_norm": 0.18193307518959045, "learning_rate": 1.8542439100422733e-05, "loss": 0.512, "step": 2424 }, { "epoch": 0.5409324113316976, "grad_norm": 0.1868167519569397, "learning_rate": 1.8541215380907317e-05, "loss": 0.5112, "step": 2425 }, { "epoch": 0.5411554762435868, "grad_norm": 0.1707518994808197, "learning_rate": 1.8539991188325664e-05, "loss": 0.4923, "step": 2426 }, { "epoch": 0.5413785411554762, "grad_norm": 0.16489239037036896, "learning_rate": 1.8538766522745587e-05, "loss": 0.4787, "step": 2427 }, { "epoch": 0.5416016060673656, "grad_norm": 0.16489636898040771, "learning_rate": 1.8537541384234906e-05, "loss": 0.4966, "step": 2428 }, { "epoch": 0.541824670979255, "grad_norm": 0.17048819363117218, "learning_rate": 1.853631577286148e-05, "loss": 0.5079, "step": 2429 }, { "epoch": 0.5420477358911443, "grad_norm": 0.16793492436408997, "learning_rate": 1.853508968869319e-05, "loss": 0.5122, "step": 2430 }, { "epoch": 0.5422708008030337, "grad_norm": 0.16388513147830963, "learning_rate": 1.8533863131797948e-05, "loss": 0.4653, "step": 2431 }, { "epoch": 0.5424938657149231, "grad_norm": 0.16700832545757294, "learning_rate": 1.853263610224368e-05, "loss": 0.5208, "step": 2432 }, { "epoch": 0.5427169306268124, "grad_norm": 0.17072373628616333, "learning_rate": 1.8531408600098356e-05, "loss": 0.5032, "step": 2433 }, { "epoch": 0.5429399955387018, "grad_norm": 0.16046811640262604, "learning_rate": 1.8530180625429958e-05, "loss": 0.515, "step": 2434 }, { "epoch": 0.5431630604505912, "grad_norm": 0.16855685412883759, "learning_rate": 1.8528952178306504e-05, "loss": 0.4915, "step": 2435 }, { "epoch": 0.5433861253624804, "grad_norm": 0.17628896236419678, "learning_rate": 1.8527723258796025e-05, "loss": 0.5221, "step": 2436 }, { "epoch": 0.5436091902743698, "grad_norm": 0.1593201607465744, "learning_rate": 1.852649386696659e-05, "loss": 0.5027, "step": 2437 }, { "epoch": 0.5438322551862592, "grad_norm": 0.18933804333209991, "learning_rate": 1.852526400288629e-05, "loss": 0.5187, "step": 2438 }, { "epoch": 0.5440553200981486, "grad_norm": 0.16613252460956573, "learning_rate": 1.852403366662325e-05, "loss": 0.4939, "step": 2439 }, { "epoch": 0.5442783850100379, "grad_norm": 0.16360655426979065, "learning_rate": 1.85228028582456e-05, "loss": 0.4835, "step": 2440 }, { "epoch": 0.5445014499219273, "grad_norm": 0.1624102145433426, "learning_rate": 1.8521571577821522e-05, "loss": 0.5308, "step": 2441 }, { "epoch": 0.5447245148338167, "grad_norm": 0.16573922336101532, "learning_rate": 1.8520339825419204e-05, "loss": 0.5148, "step": 2442 }, { "epoch": 0.544947579745706, "grad_norm": 0.17061501741409302, "learning_rate": 1.8519107601106875e-05, "loss": 0.5025, "step": 2443 }, { "epoch": 0.5451706446575953, "grad_norm": 0.16382652521133423, "learning_rate": 1.851787490495278e-05, "loss": 0.5214, "step": 2444 }, { "epoch": 0.5453937095694847, "grad_norm": 0.16731955111026764, "learning_rate": 1.8516641737025187e-05, "loss": 0.4915, "step": 2445 }, { "epoch": 0.5456167744813741, "grad_norm": 0.1665150374174118, "learning_rate": 1.8515408097392408e-05, "loss": 0.4585, "step": 2446 }, { "epoch": 0.5458398393932634, "grad_norm": 0.16348014771938324, "learning_rate": 1.851417398612276e-05, "loss": 0.5042, "step": 2447 }, { "epoch": 0.5460629043051528, "grad_norm": 0.1641550064086914, "learning_rate": 1.85129394032846e-05, "loss": 0.5103, "step": 2448 }, { "epoch": 0.5462859692170422, "grad_norm": 0.16449302434921265, "learning_rate": 1.8511704348946314e-05, "loss": 0.5043, "step": 2449 }, { "epoch": 0.5465090341289315, "grad_norm": 0.1623837649822235, "learning_rate": 1.85104688231763e-05, "loss": 0.4831, "step": 2450 }, { "epoch": 0.5467320990408209, "grad_norm": 0.16712717711925507, "learning_rate": 1.8509232826042983e-05, "loss": 0.5146, "step": 2451 }, { "epoch": 0.5469551639527103, "grad_norm": 0.14933447539806366, "learning_rate": 1.850799635761483e-05, "loss": 0.5011, "step": 2452 }, { "epoch": 0.5471782288645995, "grad_norm": 0.17392055690288544, "learning_rate": 1.8506759417960322e-05, "loss": 0.482, "step": 2453 }, { "epoch": 0.5474012937764889, "grad_norm": 0.1616058647632599, "learning_rate": 1.850552200714797e-05, "loss": 0.5152, "step": 2454 }, { "epoch": 0.5476243586883783, "grad_norm": 0.16296276450157166, "learning_rate": 1.8504284125246304e-05, "loss": 0.5073, "step": 2455 }, { "epoch": 0.5478474236002677, "grad_norm": 0.16247029602527618, "learning_rate": 1.850304577232389e-05, "loss": 0.5194, "step": 2456 }, { "epoch": 0.548070488512157, "grad_norm": 0.16784507036209106, "learning_rate": 1.8501806948449316e-05, "loss": 0.5212, "step": 2457 }, { "epoch": 0.5482935534240464, "grad_norm": 0.1538800299167633, "learning_rate": 1.8500567653691192e-05, "loss": 0.4907, "step": 2458 }, { "epoch": 0.5485166183359358, "grad_norm": 0.1586543768644333, "learning_rate": 1.8499327888118163e-05, "loss": 0.4873, "step": 2459 }, { "epoch": 0.5487396832478251, "grad_norm": 0.15387628972530365, "learning_rate": 1.8498087651798893e-05, "loss": 0.5102, "step": 2460 }, { "epoch": 0.5489627481597145, "grad_norm": 0.15858915448188782, "learning_rate": 1.8496846944802072e-05, "loss": 0.4983, "step": 2461 }, { "epoch": 0.5491858130716039, "grad_norm": 0.15983784198760986, "learning_rate": 1.849560576719642e-05, "loss": 0.4728, "step": 2462 }, { "epoch": 0.5494088779834932, "grad_norm": 0.1605166792869568, "learning_rate": 1.849436411905068e-05, "loss": 0.5083, "step": 2463 }, { "epoch": 0.5496319428953825, "grad_norm": 0.15921124815940857, "learning_rate": 1.8493122000433628e-05, "loss": 0.5054, "step": 2464 }, { "epoch": 0.5498550078072719, "grad_norm": 0.15854284167289734, "learning_rate": 1.849187941141405e-05, "loss": 0.5111, "step": 2465 }, { "epoch": 0.5500780727191613, "grad_norm": 0.16846506297588348, "learning_rate": 1.8490636352060778e-05, "loss": 0.526, "step": 2466 }, { "epoch": 0.5503011376310506, "grad_norm": 0.1619185209274292, "learning_rate": 1.8489392822442657e-05, "loss": 0.4965, "step": 2467 }, { "epoch": 0.55052420254294, "grad_norm": 0.17305007576942444, "learning_rate": 1.8488148822628557e-05, "loss": 0.5171, "step": 2468 }, { "epoch": 0.5507472674548294, "grad_norm": 0.16538777947425842, "learning_rate": 1.8486904352687384e-05, "loss": 0.5187, "step": 2469 }, { "epoch": 0.5509703323667187, "grad_norm": 0.1914980560541153, "learning_rate": 1.8485659412688065e-05, "loss": 0.5105, "step": 2470 }, { "epoch": 0.551193397278608, "grad_norm": 0.16565662622451782, "learning_rate": 1.8484414002699552e-05, "loss": 0.4949, "step": 2471 }, { "epoch": 0.5514164621904974, "grad_norm": 0.17171315848827362, "learning_rate": 1.848316812279082e-05, "loss": 0.507, "step": 2472 }, { "epoch": 0.5516395271023868, "grad_norm": 0.1598774790763855, "learning_rate": 1.8481921773030878e-05, "loss": 0.5101, "step": 2473 }, { "epoch": 0.5518625920142761, "grad_norm": 0.16791561245918274, "learning_rate": 1.8480674953488752e-05, "loss": 0.4902, "step": 2474 }, { "epoch": 0.5520856569261655, "grad_norm": 0.17411862313747406, "learning_rate": 1.8479427664233505e-05, "loss": 0.5017, "step": 2475 }, { "epoch": 0.5523087218380549, "grad_norm": 0.16108182072639465, "learning_rate": 1.8478179905334213e-05, "loss": 0.4886, "step": 2476 }, { "epoch": 0.5525317867499442, "grad_norm": 0.1858782172203064, "learning_rate": 1.847693167685999e-05, "loss": 0.5047, "step": 2477 }, { "epoch": 0.5527548516618336, "grad_norm": 0.17761607468128204, "learning_rate": 1.847568297887997e-05, "loss": 0.5233, "step": 2478 }, { "epoch": 0.552977916573723, "grad_norm": 0.1658019870519638, "learning_rate": 1.8474433811463307e-05, "loss": 0.5263, "step": 2479 }, { "epoch": 0.5532009814856124, "grad_norm": 1.4999321699142456, "learning_rate": 1.84731841746792e-05, "loss": 0.5359, "step": 2480 }, { "epoch": 0.5534240463975016, "grad_norm": 0.17992182075977325, "learning_rate": 1.847193406859685e-05, "loss": 0.5033, "step": 2481 }, { "epoch": 0.553647111309391, "grad_norm": 0.17310257256031036, "learning_rate": 1.8470683493285503e-05, "loss": 0.4905, "step": 2482 }, { "epoch": 0.5538701762212804, "grad_norm": 0.16551616787910461, "learning_rate": 1.846943244881442e-05, "loss": 0.4925, "step": 2483 }, { "epoch": 0.5540932411331697, "grad_norm": 0.17117410898208618, "learning_rate": 1.846818093525289e-05, "loss": 0.5128, "step": 2484 }, { "epoch": 0.5543163060450591, "grad_norm": 0.15640629827976227, "learning_rate": 1.8466928952670242e-05, "loss": 0.5054, "step": 2485 }, { "epoch": 0.5545393709569485, "grad_norm": 0.16013699769973755, "learning_rate": 1.8465676501135804e-05, "loss": 0.4966, "step": 2486 }, { "epoch": 0.5547624358688378, "grad_norm": 0.16586218774318695, "learning_rate": 1.846442358071895e-05, "loss": 0.506, "step": 2487 }, { "epoch": 0.5549855007807272, "grad_norm": 0.1749541014432907, "learning_rate": 1.8463170191489075e-05, "loss": 0.4733, "step": 2488 }, { "epoch": 0.5552085656926166, "grad_norm": 0.16143982112407684, "learning_rate": 1.84619163335156e-05, "loss": 0.5014, "step": 2489 }, { "epoch": 0.555431630604506, "grad_norm": 0.1761699765920639, "learning_rate": 1.846066200686797e-05, "loss": 0.5305, "step": 2490 }, { "epoch": 0.5556546955163952, "grad_norm": 0.25962182879447937, "learning_rate": 1.8459407211615658e-05, "loss": 0.5047, "step": 2491 }, { "epoch": 0.5558777604282846, "grad_norm": 0.15808863937854767, "learning_rate": 1.8458151947828165e-05, "loss": 0.5079, "step": 2492 }, { "epoch": 0.556100825340174, "grad_norm": 0.1679726392030716, "learning_rate": 1.8456896215575013e-05, "loss": 0.5097, "step": 2493 }, { "epoch": 0.5563238902520633, "grad_norm": 0.15282638370990753, "learning_rate": 1.845564001492575e-05, "loss": 0.5196, "step": 2494 }, { "epoch": 0.5565469551639527, "grad_norm": 0.1774146854877472, "learning_rate": 1.8454383345949954e-05, "loss": 0.5227, "step": 2495 }, { "epoch": 0.5567700200758421, "grad_norm": 0.1605686992406845, "learning_rate": 1.8453126208717235e-05, "loss": 0.4751, "step": 2496 }, { "epoch": 0.5569930849877315, "grad_norm": 0.16172072291374207, "learning_rate": 1.845186860329721e-05, "loss": 0.5158, "step": 2497 }, { "epoch": 0.5572161498996208, "grad_norm": 0.16925646364688873, "learning_rate": 1.8450610529759535e-05, "loss": 0.5148, "step": 2498 }, { "epoch": 0.5574392148115102, "grad_norm": 0.16925497353076935, "learning_rate": 1.8449351988173894e-05, "loss": 0.4947, "step": 2499 }, { "epoch": 0.5576622797233995, "grad_norm": 1.354637861251831, "learning_rate": 1.8448092978609993e-05, "loss": 0.5136, "step": 2500 }, { "epoch": 0.5578853446352888, "grad_norm": 0.16736604273319244, "learning_rate": 1.844683350113756e-05, "loss": 0.5389, "step": 2501 }, { "epoch": 0.5581084095471782, "grad_norm": 0.16498203575611115, "learning_rate": 1.8445573555826355e-05, "loss": 0.5018, "step": 2502 }, { "epoch": 0.5583314744590676, "grad_norm": 0.17059291899204254, "learning_rate": 1.8444313142746164e-05, "loss": 0.5282, "step": 2503 }, { "epoch": 0.558554539370957, "grad_norm": 0.21622738242149353, "learning_rate": 1.844305226196679e-05, "loss": 0.4923, "step": 2504 }, { "epoch": 0.5587776042828463, "grad_norm": 0.16687920689582825, "learning_rate": 1.844179091355808e-05, "loss": 0.4998, "step": 2505 }, { "epoch": 0.5590006691947357, "grad_norm": 0.1704476922750473, "learning_rate": 1.8440529097589885e-05, "loss": 0.5267, "step": 2506 }, { "epoch": 0.5592237341066251, "grad_norm": 0.16384254395961761, "learning_rate": 1.8439266814132092e-05, "loss": 0.5053, "step": 2507 }, { "epoch": 0.5594467990185144, "grad_norm": 0.16077467799186707, "learning_rate": 1.843800406325462e-05, "loss": 0.5151, "step": 2508 }, { "epoch": 0.5596698639304037, "grad_norm": 0.16944189369678497, "learning_rate": 1.843674084502741e-05, "loss": 0.5054, "step": 2509 }, { "epoch": 0.5598929288422931, "grad_norm": 0.16107740998268127, "learning_rate": 1.8435477159520418e-05, "loss": 0.4861, "step": 2510 }, { "epoch": 0.5601159937541824, "grad_norm": 0.17457795143127441, "learning_rate": 1.843421300680364e-05, "loss": 0.486, "step": 2511 }, { "epoch": 0.5603390586660718, "grad_norm": 0.16926448047161102, "learning_rate": 1.8432948386947092e-05, "loss": 0.5238, "step": 2512 }, { "epoch": 0.5605621235779612, "grad_norm": 0.16914579272270203, "learning_rate": 1.8431683300020817e-05, "loss": 0.5021, "step": 2513 }, { "epoch": 0.5607851884898506, "grad_norm": 0.16727939248085022, "learning_rate": 1.8430417746094886e-05, "loss": 0.53, "step": 2514 }, { "epoch": 0.5610082534017399, "grad_norm": 0.1788937747478485, "learning_rate": 1.842915172523939e-05, "loss": 0.5226, "step": 2515 }, { "epoch": 0.5612313183136293, "grad_norm": 0.17947039008140564, "learning_rate": 1.8427885237524446e-05, "loss": 0.4914, "step": 2516 }, { "epoch": 0.5614543832255187, "grad_norm": 0.17127928137779236, "learning_rate": 1.842661828302021e-05, "loss": 0.4802, "step": 2517 }, { "epoch": 0.5616774481374079, "grad_norm": 0.20038791000843048, "learning_rate": 1.8425350861796845e-05, "loss": 0.4911, "step": 2518 }, { "epoch": 0.5619005130492973, "grad_norm": 0.16238313913345337, "learning_rate": 1.842408297392455e-05, "loss": 0.492, "step": 2519 }, { "epoch": 0.5621235779611867, "grad_norm": 0.17273001372814178, "learning_rate": 1.8422814619473556e-05, "loss": 0.555, "step": 2520 }, { "epoch": 0.5623466428730761, "grad_norm": 0.19861721992492676, "learning_rate": 1.84215457985141e-05, "loss": 0.5245, "step": 2521 }, { "epoch": 0.5625697077849654, "grad_norm": 0.15922409296035767, "learning_rate": 1.8420276511116467e-05, "loss": 0.4968, "step": 2522 }, { "epoch": 0.5627927726968548, "grad_norm": 0.15657752752304077, "learning_rate": 1.8419006757350956e-05, "loss": 0.4923, "step": 2523 }, { "epoch": 0.5630158376087442, "grad_norm": 0.1599670946598053, "learning_rate": 1.8417736537287893e-05, "loss": 0.5381, "step": 2524 }, { "epoch": 0.5632389025206335, "grad_norm": 0.16177695989608765, "learning_rate": 1.841646585099763e-05, "loss": 0.5025, "step": 2525 }, { "epoch": 0.5634619674325229, "grad_norm": 0.17016616463661194, "learning_rate": 1.8415194698550548e-05, "loss": 0.5024, "step": 2526 }, { "epoch": 0.5636850323444123, "grad_norm": 0.15452872216701508, "learning_rate": 1.8413923080017047e-05, "loss": 0.4838, "step": 2527 }, { "epoch": 0.5639080972563015, "grad_norm": 0.17372727394104004, "learning_rate": 1.8412650995467564e-05, "loss": 0.5272, "step": 2528 }, { "epoch": 0.5641311621681909, "grad_norm": 0.20483483374118805, "learning_rate": 1.8411378444972548e-05, "loss": 0.4984, "step": 2529 }, { "epoch": 0.5643542270800803, "grad_norm": 0.16677002608776093, "learning_rate": 1.8410105428602485e-05, "loss": 0.5217, "step": 2530 }, { "epoch": 0.5645772919919697, "grad_norm": 0.16841678321361542, "learning_rate": 1.840883194642788e-05, "loss": 0.5331, "step": 2531 }, { "epoch": 0.564800356903859, "grad_norm": 0.16345760226249695, "learning_rate": 1.8407557998519273e-05, "loss": 0.5372, "step": 2532 }, { "epoch": 0.5650234218157484, "grad_norm": 0.16982464492321014, "learning_rate": 1.840628358494721e-05, "loss": 0.4795, "step": 2533 }, { "epoch": 0.5652464867276378, "grad_norm": 0.16752153635025024, "learning_rate": 1.840500870578229e-05, "loss": 0.5088, "step": 2534 }, { "epoch": 0.5654695516395271, "grad_norm": 0.16022369265556335, "learning_rate": 1.840373336109512e-05, "loss": 0.4676, "step": 2535 }, { "epoch": 0.5656926165514164, "grad_norm": 0.15749748051166534, "learning_rate": 1.8402457550956336e-05, "loss": 0.5325, "step": 2536 }, { "epoch": 0.5659156814633058, "grad_norm": 0.16572564840316772, "learning_rate": 1.8401181275436596e-05, "loss": 0.4972, "step": 2537 }, { "epoch": 0.5661387463751952, "grad_norm": 0.15949216485023499, "learning_rate": 1.839990453460659e-05, "loss": 0.5088, "step": 2538 }, { "epoch": 0.5663618112870845, "grad_norm": 0.15577349066734314, "learning_rate": 1.8398627328537037e-05, "loss": 0.4871, "step": 2539 }, { "epoch": 0.5665848761989739, "grad_norm": 0.22663244605064392, "learning_rate": 1.839734965729867e-05, "loss": 0.51, "step": 2540 }, { "epoch": 0.5668079411108633, "grad_norm": 0.16689926385879517, "learning_rate": 1.8396071520962256e-05, "loss": 0.4998, "step": 2541 }, { "epoch": 0.5670310060227526, "grad_norm": 0.16619545221328735, "learning_rate": 1.8394792919598592e-05, "loss": 0.5208, "step": 2542 }, { "epoch": 0.567254070934642, "grad_norm": 0.15518589317798615, "learning_rate": 1.8393513853278492e-05, "loss": 0.4967, "step": 2543 }, { "epoch": 0.5674771358465314, "grad_norm": 0.19833387434482574, "learning_rate": 1.8392234322072792e-05, "loss": 0.5029, "step": 2544 }, { "epoch": 0.5677002007584206, "grad_norm": 0.15093770623207092, "learning_rate": 1.839095432605237e-05, "loss": 0.5017, "step": 2545 }, { "epoch": 0.56792326567031, "grad_norm": 0.19351163506507874, "learning_rate": 1.8389673865288114e-05, "loss": 0.494, "step": 2546 }, { "epoch": 0.5681463305821994, "grad_norm": 0.16114504635334015, "learning_rate": 1.8388392939850946e-05, "loss": 0.5118, "step": 2547 }, { "epoch": 0.5683693954940888, "grad_norm": 0.16146999597549438, "learning_rate": 1.8387111549811812e-05, "loss": 0.4732, "step": 2548 }, { "epoch": 0.5685924604059781, "grad_norm": 0.1651589721441269, "learning_rate": 1.8385829695241687e-05, "loss": 0.5086, "step": 2549 }, { "epoch": 0.5688155253178675, "grad_norm": 0.15596236288547516, "learning_rate": 1.838454737621156e-05, "loss": 0.5017, "step": 2550 }, { "epoch": 0.5690385902297569, "grad_norm": 0.1615191251039505, "learning_rate": 1.838326459279246e-05, "loss": 0.5212, "step": 2551 }, { "epoch": 0.5692616551416462, "grad_norm": 0.16675913333892822, "learning_rate": 1.8381981345055435e-05, "loss": 0.5229, "step": 2552 }, { "epoch": 0.5694847200535356, "grad_norm": 0.16236376762390137, "learning_rate": 1.8380697633071558e-05, "loss": 0.4955, "step": 2553 }, { "epoch": 0.569707784965425, "grad_norm": 0.17616188526153564, "learning_rate": 1.837941345691193e-05, "loss": 0.5067, "step": 2554 }, { "epoch": 0.5699308498773143, "grad_norm": 0.16186662018299103, "learning_rate": 1.8378128816647676e-05, "loss": 0.5054, "step": 2555 }, { "epoch": 0.5701539147892036, "grad_norm": 0.1674317866563797, "learning_rate": 1.8376843712349946e-05, "loss": 0.5009, "step": 2556 }, { "epoch": 0.570376979701093, "grad_norm": 0.17086781561374664, "learning_rate": 1.837555814408992e-05, "loss": 0.4998, "step": 2557 }, { "epoch": 0.5706000446129824, "grad_norm": 0.1624806523323059, "learning_rate": 1.8374272111938797e-05, "loss": 0.519, "step": 2558 }, { "epoch": 0.5708231095248717, "grad_norm": 0.1701088398694992, "learning_rate": 1.837298561596781e-05, "loss": 0.5173, "step": 2559 }, { "epoch": 0.5710461744367611, "grad_norm": 0.15940968692302704, "learning_rate": 1.8371698656248212e-05, "loss": 0.515, "step": 2560 }, { "epoch": 0.5712692393486505, "grad_norm": 0.17150092124938965, "learning_rate": 1.837041123285128e-05, "loss": 0.4826, "step": 2561 }, { "epoch": 0.5714923042605398, "grad_norm": 0.16845233738422394, "learning_rate": 1.836912334584833e-05, "loss": 0.5018, "step": 2562 }, { "epoch": 0.5717153691724292, "grad_norm": 0.18033871054649353, "learning_rate": 1.8367834995310676e-05, "loss": 0.5177, "step": 2563 }, { "epoch": 0.5719384340843185, "grad_norm": 0.17002245783805847, "learning_rate": 1.8366546181309686e-05, "loss": 0.5287, "step": 2564 }, { "epoch": 0.5721614989962079, "grad_norm": 0.16821357607841492, "learning_rate": 1.836525690391674e-05, "loss": 0.522, "step": 2565 }, { "epoch": 0.5723845639080972, "grad_norm": 0.1578892022371292, "learning_rate": 1.836396716320325e-05, "loss": 0.4929, "step": 2566 }, { "epoch": 0.5726076288199866, "grad_norm": 0.15691736340522766, "learning_rate": 1.836267695924065e-05, "loss": 0.4775, "step": 2567 }, { "epoch": 0.572830693731876, "grad_norm": 0.16604529321193695, "learning_rate": 1.8361386292100394e-05, "loss": 0.4854, "step": 2568 }, { "epoch": 0.5730537586437653, "grad_norm": 0.16472390294075012, "learning_rate": 1.8360095161853966e-05, "loss": 0.486, "step": 2569 }, { "epoch": 0.5732768235556547, "grad_norm": 0.1708393096923828, "learning_rate": 1.8358803568572885e-05, "loss": 0.5086, "step": 2570 }, { "epoch": 0.5734998884675441, "grad_norm": 0.1598411649465561, "learning_rate": 1.8357511512328683e-05, "loss": 0.5071, "step": 2571 }, { "epoch": 0.5737229533794335, "grad_norm": 0.19416102766990662, "learning_rate": 1.8356218993192922e-05, "loss": 0.5092, "step": 2572 }, { "epoch": 0.5739460182913227, "grad_norm": 0.1696127951145172, "learning_rate": 1.835492601123719e-05, "loss": 0.5157, "step": 2573 }, { "epoch": 0.5741690832032121, "grad_norm": 0.16586486995220184, "learning_rate": 1.8353632566533102e-05, "loss": 0.4948, "step": 2574 }, { "epoch": 0.5743921481151015, "grad_norm": 0.17883430421352386, "learning_rate": 1.8352338659152296e-05, "loss": 0.4989, "step": 2575 }, { "epoch": 0.5746152130269908, "grad_norm": 0.15909375250339508, "learning_rate": 1.8351044289166435e-05, "loss": 0.4892, "step": 2576 }, { "epoch": 0.5748382779388802, "grad_norm": 0.17203538119792938, "learning_rate": 1.834974945664721e-05, "loss": 0.5039, "step": 2577 }, { "epoch": 0.5750613428507696, "grad_norm": 0.17230457067489624, "learning_rate": 1.834845416166634e-05, "loss": 0.5102, "step": 2578 }, { "epoch": 0.575284407762659, "grad_norm": 0.16863304376602173, "learning_rate": 1.8347158404295566e-05, "loss": 0.5365, "step": 2579 }, { "epoch": 0.5755074726745483, "grad_norm": 0.15985195338726044, "learning_rate": 1.8345862184606653e-05, "loss": 0.4991, "step": 2580 }, { "epoch": 0.5757305375864377, "grad_norm": 0.16000472009181976, "learning_rate": 1.8344565502671396e-05, "loss": 0.4898, "step": 2581 }, { "epoch": 0.575953602498327, "grad_norm": 0.16451792418956757, "learning_rate": 1.8343268358561607e-05, "loss": 0.5382, "step": 2582 }, { "epoch": 0.5761766674102163, "grad_norm": 0.16820542514324188, "learning_rate": 1.834197075234914e-05, "loss": 0.4993, "step": 2583 }, { "epoch": 0.5763997323221057, "grad_norm": 0.16512157022953033, "learning_rate": 1.834067268410586e-05, "loss": 0.5139, "step": 2584 }, { "epoch": 0.5766227972339951, "grad_norm": 0.16488297283649445, "learning_rate": 1.833937415390366e-05, "loss": 0.5371, "step": 2585 }, { "epoch": 0.5768458621458844, "grad_norm": 0.1621064841747284, "learning_rate": 1.8338075161814462e-05, "loss": 0.5218, "step": 2586 }, { "epoch": 0.5770689270577738, "grad_norm": 0.1790938526391983, "learning_rate": 1.8336775707910214e-05, "loss": 0.5357, "step": 2587 }, { "epoch": 0.5772919919696632, "grad_norm": 0.16097491979599, "learning_rate": 1.8335475792262888e-05, "loss": 0.4874, "step": 2588 }, { "epoch": 0.5775150568815526, "grad_norm": 0.17577522993087769, "learning_rate": 1.8334175414944476e-05, "loss": 0.5097, "step": 2589 }, { "epoch": 0.5777381217934419, "grad_norm": 0.17199388146400452, "learning_rate": 1.833287457602701e-05, "loss": 0.4969, "step": 2590 }, { "epoch": 0.5779611867053313, "grad_norm": 0.16499276459217072, "learning_rate": 1.833157327558253e-05, "loss": 0.5207, "step": 2591 }, { "epoch": 0.5781842516172206, "grad_norm": 0.16708426177501678, "learning_rate": 1.8330271513683118e-05, "loss": 0.5077, "step": 2592 }, { "epoch": 0.5784073165291099, "grad_norm": 0.1643514633178711, "learning_rate": 1.8328969290400867e-05, "loss": 0.4884, "step": 2593 }, { "epoch": 0.5786303814409993, "grad_norm": 0.1548931747674942, "learning_rate": 1.832766660580791e-05, "loss": 0.5047, "step": 2594 }, { "epoch": 0.5788534463528887, "grad_norm": 0.16089920699596405, "learning_rate": 1.832636345997639e-05, "loss": 0.4916, "step": 2595 }, { "epoch": 0.5790765112647781, "grad_norm": 0.15845981240272522, "learning_rate": 1.8325059852978485e-05, "loss": 0.4832, "step": 2596 }, { "epoch": 0.5792995761766674, "grad_norm": 0.1613757461309433, "learning_rate": 1.83237557848864e-05, "loss": 0.5192, "step": 2597 }, { "epoch": 0.5795226410885568, "grad_norm": 0.15937237441539764, "learning_rate": 1.8322451255772365e-05, "loss": 0.5028, "step": 2598 }, { "epoch": 0.5797457060004462, "grad_norm": 0.16132612526416779, "learning_rate": 1.8321146265708627e-05, "loss": 0.4948, "step": 2599 }, { "epoch": 0.5799687709123355, "grad_norm": 0.18276172876358032, "learning_rate": 1.8319840814767463e-05, "loss": 0.4845, "step": 2600 }, { "epoch": 0.5801918358242248, "grad_norm": 0.1661817729473114, "learning_rate": 1.8318534903021182e-05, "loss": 0.5344, "step": 2601 }, { "epoch": 0.5804149007361142, "grad_norm": 0.15862146019935608, "learning_rate": 1.8317228530542117e-05, "loss": 0.5043, "step": 2602 }, { "epoch": 0.5806379656480035, "grad_norm": 0.15994656085968018, "learning_rate": 1.8315921697402618e-05, "loss": 0.4765, "step": 2603 }, { "epoch": 0.5808610305598929, "grad_norm": 0.16988401114940643, "learning_rate": 1.8314614403675063e-05, "loss": 0.5194, "step": 2604 }, { "epoch": 0.5810840954717823, "grad_norm": 0.16828812658786774, "learning_rate": 1.831330664943186e-05, "loss": 0.5357, "step": 2605 }, { "epoch": 0.5813071603836717, "grad_norm": 0.19693851470947266, "learning_rate": 1.8311998434745445e-05, "loss": 0.4976, "step": 2606 }, { "epoch": 0.581530225295561, "grad_norm": 0.1689445823431015, "learning_rate": 1.831068975968827e-05, "loss": 0.4992, "step": 2607 }, { "epoch": 0.5817532902074504, "grad_norm": 0.15751276910305023, "learning_rate": 1.830938062433282e-05, "loss": 0.4711, "step": 2608 }, { "epoch": 0.5819763551193398, "grad_norm": 0.16275669634342194, "learning_rate": 1.8308071028751608e-05, "loss": 0.5184, "step": 2609 }, { "epoch": 0.582199420031229, "grad_norm": 0.16536571085453033, "learning_rate": 1.8306760973017158e-05, "loss": 0.5172, "step": 2610 }, { "epoch": 0.5824224849431184, "grad_norm": 0.18836647272109985, "learning_rate": 1.830545045720203e-05, "loss": 0.5019, "step": 2611 }, { "epoch": 0.5826455498550078, "grad_norm": 0.16014382243156433, "learning_rate": 1.830413948137882e-05, "loss": 0.5182, "step": 2612 }, { "epoch": 0.5828686147668972, "grad_norm": 0.1588478833436966, "learning_rate": 1.8302828045620128e-05, "loss": 0.5072, "step": 2613 }, { "epoch": 0.5830916796787865, "grad_norm": 0.17403388023376465, "learning_rate": 1.830151614999859e-05, "loss": 0.5093, "step": 2614 }, { "epoch": 0.5833147445906759, "grad_norm": 0.17456629872322083, "learning_rate": 1.830020379458687e-05, "loss": 0.4922, "step": 2615 }, { "epoch": 0.5835378095025653, "grad_norm": 0.15623332560062408, "learning_rate": 1.829889097945765e-05, "loss": 0.5133, "step": 2616 }, { "epoch": 0.5837608744144546, "grad_norm": 0.18376080691814423, "learning_rate": 1.8297577704683653e-05, "loss": 0.5154, "step": 2617 }, { "epoch": 0.583983939326344, "grad_norm": 0.17060859501361847, "learning_rate": 1.8296263970337602e-05, "loss": 0.5058, "step": 2618 }, { "epoch": 0.5842070042382334, "grad_norm": 0.1654299646615982, "learning_rate": 1.829494977649227e-05, "loss": 0.5072, "step": 2619 }, { "epoch": 0.5844300691501226, "grad_norm": 0.1547902524471283, "learning_rate": 1.829363512322044e-05, "loss": 0.4967, "step": 2620 }, { "epoch": 0.584653134062012, "grad_norm": 0.1600826382637024, "learning_rate": 1.829232001059493e-05, "loss": 0.5396, "step": 2621 }, { "epoch": 0.5848761989739014, "grad_norm": 0.16009269654750824, "learning_rate": 1.8291004438688578e-05, "loss": 0.525, "step": 2622 }, { "epoch": 0.5850992638857908, "grad_norm": 0.2055627703666687, "learning_rate": 1.8289688407574246e-05, "loss": 0.4993, "step": 2623 }, { "epoch": 0.5853223287976801, "grad_norm": 0.1680591106414795, "learning_rate": 1.8288371917324827e-05, "loss": 0.5184, "step": 2624 }, { "epoch": 0.5855453937095695, "grad_norm": 0.1637965738773346, "learning_rate": 1.828705496801323e-05, "loss": 0.5205, "step": 2625 }, { "epoch": 0.5857684586214589, "grad_norm": 0.172703817486763, "learning_rate": 1.828573755971241e-05, "loss": 0.5201, "step": 2626 }, { "epoch": 0.5859915235333482, "grad_norm": 0.18580362200737, "learning_rate": 1.8284419692495316e-05, "loss": 0.5246, "step": 2627 }, { "epoch": 0.5862145884452375, "grad_norm": 0.16274447739124298, "learning_rate": 1.8283101366434954e-05, "loss": 0.5172, "step": 2628 }, { "epoch": 0.5864376533571269, "grad_norm": 0.16170893609523773, "learning_rate": 1.8281782581604334e-05, "loss": 0.506, "step": 2629 }, { "epoch": 0.5866607182690163, "grad_norm": 0.15339437127113342, "learning_rate": 1.82804633380765e-05, "loss": 0.4706, "step": 2630 }, { "epoch": 0.5868837831809056, "grad_norm": 0.17161330580711365, "learning_rate": 1.827914363592452e-05, "loss": 0.5217, "step": 2631 }, { "epoch": 0.587106848092795, "grad_norm": 0.16379369795322418, "learning_rate": 1.8277823475221485e-05, "loss": 0.4954, "step": 2632 }, { "epoch": 0.5873299130046844, "grad_norm": 0.16465184092521667, "learning_rate": 1.827650285604052e-05, "loss": 0.4995, "step": 2633 }, { "epoch": 0.5875529779165737, "grad_norm": 0.1681264042854309, "learning_rate": 1.8275181778454767e-05, "loss": 0.5391, "step": 2634 }, { "epoch": 0.5877760428284631, "grad_norm": 0.17729119956493378, "learning_rate": 1.827386024253739e-05, "loss": 0.5313, "step": 2635 }, { "epoch": 0.5879991077403525, "grad_norm": 0.1703905314207077, "learning_rate": 1.8272538248361592e-05, "loss": 0.5407, "step": 2636 }, { "epoch": 0.5882221726522417, "grad_norm": 0.1546456664800644, "learning_rate": 1.8271215796000588e-05, "loss": 0.4965, "step": 2637 }, { "epoch": 0.5884452375641311, "grad_norm": 0.15894514322280884, "learning_rate": 1.8269892885527624e-05, "loss": 0.498, "step": 2638 }, { "epoch": 0.5886683024760205, "grad_norm": 0.15367206931114197, "learning_rate": 1.826856951701597e-05, "loss": 0.482, "step": 2639 }, { "epoch": 0.5888913673879099, "grad_norm": 0.16719990968704224, "learning_rate": 1.826724569053893e-05, "loss": 0.5287, "step": 2640 }, { "epoch": 0.5891144322997992, "grad_norm": 0.1742510050535202, "learning_rate": 1.8265921406169816e-05, "loss": 0.4773, "step": 2641 }, { "epoch": 0.5893374972116886, "grad_norm": 0.16305537521839142, "learning_rate": 1.8264596663981985e-05, "loss": 0.47, "step": 2642 }, { "epoch": 0.589560562123578, "grad_norm": 0.1746242344379425, "learning_rate": 1.82632714640488e-05, "loss": 0.4978, "step": 2643 }, { "epoch": 0.5897836270354673, "grad_norm": 0.16970300674438477, "learning_rate": 1.8261945806443666e-05, "loss": 0.5225, "step": 2644 }, { "epoch": 0.5900066919473567, "grad_norm": 0.16862879693508148, "learning_rate": 1.826061969124e-05, "loss": 0.5299, "step": 2645 }, { "epoch": 0.5902297568592461, "grad_norm": 0.1616392582654953, "learning_rate": 1.825929311851126e-05, "loss": 0.5237, "step": 2646 }, { "epoch": 0.5904528217711354, "grad_norm": 0.15673169493675232, "learning_rate": 1.8257966088330907e-05, "loss": 0.5005, "step": 2647 }, { "epoch": 0.5906758866830247, "grad_norm": 0.16055789589881897, "learning_rate": 1.825663860077245e-05, "loss": 0.5179, "step": 2648 }, { "epoch": 0.5908989515949141, "grad_norm": 0.1535293161869049, "learning_rate": 1.8255310655909414e-05, "loss": 0.4926, "step": 2649 }, { "epoch": 0.5911220165068035, "grad_norm": 0.16450564563274384, "learning_rate": 1.8253982253815343e-05, "loss": 0.5016, "step": 2650 }, { "epoch": 0.5913450814186928, "grad_norm": 0.1581798493862152, "learning_rate": 1.8252653394563814e-05, "loss": 0.5031, "step": 2651 }, { "epoch": 0.5915681463305822, "grad_norm": 0.16955383121967316, "learning_rate": 1.825132407822843e-05, "loss": 0.5408, "step": 2652 }, { "epoch": 0.5917912112424716, "grad_norm": 0.16490530967712402, "learning_rate": 1.8249994304882818e-05, "loss": 0.5352, "step": 2653 }, { "epoch": 0.592014276154361, "grad_norm": 0.1716136932373047, "learning_rate": 1.8248664074600626e-05, "loss": 0.4936, "step": 2654 }, { "epoch": 0.5922373410662503, "grad_norm": 0.16612227261066437, "learning_rate": 1.8247333387455534e-05, "loss": 0.4886, "step": 2655 }, { "epoch": 0.5924604059781396, "grad_norm": 0.16571937501430511, "learning_rate": 1.8246002243521234e-05, "loss": 0.5171, "step": 2656 }, { "epoch": 0.592683470890029, "grad_norm": 0.15452441573143005, "learning_rate": 1.8244670642871464e-05, "loss": 0.4825, "step": 2657 }, { "epoch": 0.5929065358019183, "grad_norm": 0.16785788536071777, "learning_rate": 1.8243338585579974e-05, "loss": 0.4993, "step": 2658 }, { "epoch": 0.5931296007138077, "grad_norm": 0.16040126979351044, "learning_rate": 1.824200607172054e-05, "loss": 0.498, "step": 2659 }, { "epoch": 0.5933526656256971, "grad_norm": 0.16780081391334534, "learning_rate": 1.8240673101366963e-05, "loss": 0.509, "step": 2660 }, { "epoch": 0.5935757305375864, "grad_norm": 0.16159792244434357, "learning_rate": 1.823933967459308e-05, "loss": 0.5054, "step": 2661 }, { "epoch": 0.5937987954494758, "grad_norm": 0.1585383266210556, "learning_rate": 1.823800579147273e-05, "loss": 0.4861, "step": 2662 }, { "epoch": 0.5940218603613652, "grad_norm": 0.1630701869726181, "learning_rate": 1.8236671452079805e-05, "loss": 0.5226, "step": 2663 }, { "epoch": 0.5942449252732546, "grad_norm": 0.16565611958503723, "learning_rate": 1.8235336656488203e-05, "loss": 0.4905, "step": 2664 }, { "epoch": 0.5944679901851438, "grad_norm": 0.15455248951911926, "learning_rate": 1.8234001404771856e-05, "loss": 0.4844, "step": 2665 }, { "epoch": 0.5946910550970332, "grad_norm": 0.1679016500711441, "learning_rate": 1.8232665697004713e-05, "loss": 0.5312, "step": 2666 }, { "epoch": 0.5949141200089226, "grad_norm": 0.16260258853435516, "learning_rate": 1.823132953326076e-05, "loss": 0.5106, "step": 2667 }, { "epoch": 0.5951371849208119, "grad_norm": 0.15242820978164673, "learning_rate": 1.8229992913614004e-05, "loss": 0.4641, "step": 2668 }, { "epoch": 0.5953602498327013, "grad_norm": 0.17028586566448212, "learning_rate": 1.822865583813847e-05, "loss": 0.5183, "step": 2669 }, { "epoch": 0.5955833147445907, "grad_norm": 0.14964601397514343, "learning_rate": 1.8227318306908216e-05, "loss": 0.4843, "step": 2670 }, { "epoch": 0.5958063796564801, "grad_norm": 0.1675204187631607, "learning_rate": 1.822598031999732e-05, "loss": 0.5098, "step": 2671 }, { "epoch": 0.5960294445683694, "grad_norm": 0.15445132553577423, "learning_rate": 1.822464187747989e-05, "loss": 0.4759, "step": 2672 }, { "epoch": 0.5962525094802588, "grad_norm": 0.1613750010728836, "learning_rate": 1.822330297943006e-05, "loss": 0.5154, "step": 2673 }, { "epoch": 0.5964755743921482, "grad_norm": 0.16264407336711884, "learning_rate": 1.8221963625921984e-05, "loss": 0.4758, "step": 2674 }, { "epoch": 0.5966986393040374, "grad_norm": 0.16100256145000458, "learning_rate": 1.8220623817029843e-05, "loss": 0.4946, "step": 2675 }, { "epoch": 0.5969217042159268, "grad_norm": 0.1642381250858307, "learning_rate": 1.8219283552827847e-05, "loss": 0.5029, "step": 2676 }, { "epoch": 0.5971447691278162, "grad_norm": 0.16491542756557465, "learning_rate": 1.8217942833390227e-05, "loss": 0.5077, "step": 2677 }, { "epoch": 0.5973678340397055, "grad_norm": 0.1611475944519043, "learning_rate": 1.821660165879124e-05, "loss": 0.5346, "step": 2678 }, { "epoch": 0.5975908989515949, "grad_norm": 0.16811859607696533, "learning_rate": 1.8215260029105166e-05, "loss": 0.5173, "step": 2679 }, { "epoch": 0.5978139638634843, "grad_norm": 0.17261561751365662, "learning_rate": 1.8213917944406315e-05, "loss": 0.5142, "step": 2680 }, { "epoch": 0.5980370287753737, "grad_norm": 0.1671370267868042, "learning_rate": 1.8212575404769023e-05, "loss": 0.5289, "step": 2681 }, { "epoch": 0.598260093687263, "grad_norm": 0.16574646532535553, "learning_rate": 1.8211232410267645e-05, "loss": 0.5179, "step": 2682 }, { "epoch": 0.5984831585991524, "grad_norm": 0.1583874672651291, "learning_rate": 1.8209888960976565e-05, "loss": 0.503, "step": 2683 }, { "epoch": 0.5987062235110417, "grad_norm": 0.15771937370300293, "learning_rate": 1.8208545056970193e-05, "loss": 0.4811, "step": 2684 }, { "epoch": 0.598929288422931, "grad_norm": 0.16496342420578003, "learning_rate": 1.820720069832296e-05, "loss": 0.5178, "step": 2685 }, { "epoch": 0.5991523533348204, "grad_norm": 0.1662297397851944, "learning_rate": 1.820585588510933e-05, "loss": 0.5286, "step": 2686 }, { "epoch": 0.5993754182467098, "grad_norm": 0.15326498448848724, "learning_rate": 1.8204510617403785e-05, "loss": 0.4983, "step": 2687 }, { "epoch": 0.5995984831585992, "grad_norm": 0.1545604169368744, "learning_rate": 1.820316489528083e-05, "loss": 0.4746, "step": 2688 }, { "epoch": 0.5998215480704885, "grad_norm": 0.16388210654258728, "learning_rate": 1.8201818718815004e-05, "loss": 0.4766, "step": 2689 }, { "epoch": 0.6000446129823779, "grad_norm": 0.16617290675640106, "learning_rate": 1.820047208808087e-05, "loss": 0.5301, "step": 2690 }, { "epoch": 0.6002676778942673, "grad_norm": 0.16141341626644135, "learning_rate": 1.8199125003153e-05, "loss": 0.5095, "step": 2691 }, { "epoch": 0.6004907428061566, "grad_norm": 0.1575475037097931, "learning_rate": 1.8197777464106022e-05, "loss": 0.5164, "step": 2692 }, { "epoch": 0.600713807718046, "grad_norm": 0.1886652559041977, "learning_rate": 1.8196429471014558e-05, "loss": 0.4922, "step": 2693 }, { "epoch": 0.6009368726299353, "grad_norm": 0.16036200523376465, "learning_rate": 1.8195081023953268e-05, "loss": 0.5116, "step": 2694 }, { "epoch": 0.6011599375418246, "grad_norm": 0.16269893944263458, "learning_rate": 1.8193732122996847e-05, "loss": 0.5135, "step": 2695 }, { "epoch": 0.601383002453714, "grad_norm": 0.1868923157453537, "learning_rate": 1.819238276822e-05, "loss": 0.5052, "step": 2696 }, { "epoch": 0.6016060673656034, "grad_norm": 0.16269947588443756, "learning_rate": 1.8191032959697464e-05, "loss": 0.4829, "step": 2697 }, { "epoch": 0.6018291322774928, "grad_norm": 0.1679493635892868, "learning_rate": 1.8189682697504e-05, "loss": 0.4967, "step": 2698 }, { "epoch": 0.6020521971893821, "grad_norm": 0.16471365094184875, "learning_rate": 1.8188331981714386e-05, "loss": 0.5189, "step": 2699 }, { "epoch": 0.6022752621012715, "grad_norm": 0.16562478244304657, "learning_rate": 1.8186980812403448e-05, "loss": 0.5217, "step": 2700 }, { "epoch": 0.6024983270131609, "grad_norm": 0.16145823895931244, "learning_rate": 1.818562918964601e-05, "loss": 0.5008, "step": 2701 }, { "epoch": 0.6027213919250501, "grad_norm": 0.17072793841362, "learning_rate": 1.8184277113516938e-05, "loss": 0.5302, "step": 2702 }, { "epoch": 0.6029444568369395, "grad_norm": 0.16824592649936676, "learning_rate": 1.8182924584091122e-05, "loss": 0.5358, "step": 2703 }, { "epoch": 0.6031675217488289, "grad_norm": 0.1661488264799118, "learning_rate": 1.8181571601443465e-05, "loss": 0.5391, "step": 2704 }, { "epoch": 0.6033905866607183, "grad_norm": 0.15754079818725586, "learning_rate": 1.8180218165648913e-05, "loss": 0.5013, "step": 2705 }, { "epoch": 0.6036136515726076, "grad_norm": 0.1813340038061142, "learning_rate": 1.817886427678242e-05, "loss": 0.5301, "step": 2706 }, { "epoch": 0.603836716484497, "grad_norm": 0.16588057577610016, "learning_rate": 1.817750993491898e-05, "loss": 0.4924, "step": 2707 }, { "epoch": 0.6040597813963864, "grad_norm": 0.17572402954101562, "learning_rate": 1.8176155140133596e-05, "loss": 0.5075, "step": 2708 }, { "epoch": 0.6042828463082757, "grad_norm": 0.157828688621521, "learning_rate": 1.8174799892501315e-05, "loss": 0.5133, "step": 2709 }, { "epoch": 0.6045059112201651, "grad_norm": 0.1617002636194229, "learning_rate": 1.817344419209719e-05, "loss": 0.5092, "step": 2710 }, { "epoch": 0.6047289761320545, "grad_norm": 0.1876181662082672, "learning_rate": 1.817208803899632e-05, "loss": 0.5326, "step": 2711 }, { "epoch": 0.6049520410439437, "grad_norm": 0.15636108815670013, "learning_rate": 1.8170731433273802e-05, "loss": 0.4922, "step": 2712 }, { "epoch": 0.6051751059558331, "grad_norm": 0.15785710513591766, "learning_rate": 1.8169374375004784e-05, "loss": 0.4956, "step": 2713 }, { "epoch": 0.6053981708677225, "grad_norm": 0.17369434237480164, "learning_rate": 1.8168016864264426e-05, "loss": 0.493, "step": 2714 }, { "epoch": 0.6056212357796119, "grad_norm": 0.16383057832717896, "learning_rate": 1.8166658901127915e-05, "loss": 0.4638, "step": 2715 }, { "epoch": 0.6058443006915012, "grad_norm": 0.17268022894859314, "learning_rate": 1.8165300485670464e-05, "loss": 0.5056, "step": 2716 }, { "epoch": 0.6060673656033906, "grad_norm": 0.1691250205039978, "learning_rate": 1.8163941617967313e-05, "loss": 0.5161, "step": 2717 }, { "epoch": 0.60629043051528, "grad_norm": 0.1628870815038681, "learning_rate": 1.8162582298093715e-05, "loss": 0.5015, "step": 2718 }, { "epoch": 0.6065134954271693, "grad_norm": 0.17182950675487518, "learning_rate": 1.816122252612497e-05, "loss": 0.5237, "step": 2719 }, { "epoch": 0.6067365603390587, "grad_norm": 0.17092610895633698, "learning_rate": 1.8159862302136386e-05, "loss": 0.4941, "step": 2720 }, { "epoch": 0.606959625250948, "grad_norm": 0.16498810052871704, "learning_rate": 1.8158501626203298e-05, "loss": 0.5197, "step": 2721 }, { "epoch": 0.6071826901628374, "grad_norm": 0.16107720136642456, "learning_rate": 1.815714049840107e-05, "loss": 0.5186, "step": 2722 }, { "epoch": 0.6074057550747267, "grad_norm": 0.1599961370229721, "learning_rate": 1.8155778918805095e-05, "loss": 0.5035, "step": 2723 }, { "epoch": 0.6076288199866161, "grad_norm": 0.16911983489990234, "learning_rate": 1.815441688749078e-05, "loss": 0.5009, "step": 2724 }, { "epoch": 0.6078518848985055, "grad_norm": 0.16482600569725037, "learning_rate": 1.8153054404533562e-05, "loss": 0.4991, "step": 2725 }, { "epoch": 0.6080749498103948, "grad_norm": 0.16007374227046967, "learning_rate": 1.8151691470008906e-05, "loss": 0.4837, "step": 2726 }, { "epoch": 0.6082980147222842, "grad_norm": 0.16472531855106354, "learning_rate": 1.81503280839923e-05, "loss": 0.4997, "step": 2727 }, { "epoch": 0.6085210796341736, "grad_norm": 0.1605815589427948, "learning_rate": 1.814896424655926e-05, "loss": 0.5011, "step": 2728 }, { "epoch": 0.608744144546063, "grad_norm": 0.16523852944374084, "learning_rate": 1.814759995778532e-05, "loss": 0.4947, "step": 2729 }, { "epoch": 0.6089672094579522, "grad_norm": 0.1722632497549057, "learning_rate": 1.8146235217746043e-05, "loss": 0.495, "step": 2730 }, { "epoch": 0.6091902743698416, "grad_norm": 0.16873040795326233, "learning_rate": 1.8144870026517018e-05, "loss": 0.5273, "step": 2731 }, { "epoch": 0.609413339281731, "grad_norm": 0.16078028082847595, "learning_rate": 1.8143504384173858e-05, "loss": 0.5051, "step": 2732 }, { "epoch": 0.6096364041936203, "grad_norm": 0.18267033994197845, "learning_rate": 1.8142138290792202e-05, "loss": 0.4946, "step": 2733 }, { "epoch": 0.6098594691055097, "grad_norm": 0.17000767588615417, "learning_rate": 1.814077174644771e-05, "loss": 0.5079, "step": 2734 }, { "epoch": 0.6100825340173991, "grad_norm": 0.1608540564775467, "learning_rate": 1.813940475121607e-05, "loss": 0.4912, "step": 2735 }, { "epoch": 0.6103055989292884, "grad_norm": 0.160196915268898, "learning_rate": 1.8138037305172997e-05, "loss": 0.5315, "step": 2736 }, { "epoch": 0.6105286638411778, "grad_norm": 0.16966816782951355, "learning_rate": 1.813666940839423e-05, "loss": 0.532, "step": 2737 }, { "epoch": 0.6107517287530672, "grad_norm": 0.19911563396453857, "learning_rate": 1.8135301060955525e-05, "loss": 0.4847, "step": 2738 }, { "epoch": 0.6109747936649566, "grad_norm": 0.16148284077644348, "learning_rate": 1.8133932262932678e-05, "loss": 0.4756, "step": 2739 }, { "epoch": 0.6111978585768458, "grad_norm": 0.16302482783794403, "learning_rate": 1.8132563014401497e-05, "loss": 0.5021, "step": 2740 }, { "epoch": 0.6114209234887352, "grad_norm": 0.1716173142194748, "learning_rate": 1.813119331543782e-05, "loss": 0.5303, "step": 2741 }, { "epoch": 0.6116439884006246, "grad_norm": 0.16563881933689117, "learning_rate": 1.812982316611751e-05, "loss": 0.4991, "step": 2742 }, { "epoch": 0.6118670533125139, "grad_norm": 0.1720925122499466, "learning_rate": 1.812845256651645e-05, "loss": 0.5026, "step": 2743 }, { "epoch": 0.6120901182244033, "grad_norm": 0.16051127016544342, "learning_rate": 1.8127081516710565e-05, "loss": 0.4837, "step": 2744 }, { "epoch": 0.6123131831362927, "grad_norm": 0.16201861202716827, "learning_rate": 1.8125710016775778e-05, "loss": 0.5161, "step": 2745 }, { "epoch": 0.6125362480481821, "grad_norm": 0.17014898359775543, "learning_rate": 1.812433806678806e-05, "loss": 0.5346, "step": 2746 }, { "epoch": 0.6127593129600714, "grad_norm": 0.1703520119190216, "learning_rate": 1.8122965666823398e-05, "loss": 0.5182, "step": 2747 }, { "epoch": 0.6129823778719607, "grad_norm": 0.19158673286437988, "learning_rate": 1.8121592816957797e-05, "loss": 0.5043, "step": 2748 }, { "epoch": 0.6132054427838501, "grad_norm": 0.14746567606925964, "learning_rate": 1.8120219517267302e-05, "loss": 0.4841, "step": 2749 }, { "epoch": 0.6134285076957394, "grad_norm": 0.16160626709461212, "learning_rate": 1.811884576782797e-05, "loss": 0.5158, "step": 2750 }, { "epoch": 0.6136515726076288, "grad_norm": 0.17975735664367676, "learning_rate": 1.8117471568715893e-05, "loss": 0.5235, "step": 2751 }, { "epoch": 0.6138746375195182, "grad_norm": 0.17119070887565613, "learning_rate": 1.8116096920007177e-05, "loss": 0.4951, "step": 2752 }, { "epoch": 0.6140977024314075, "grad_norm": 0.16311194002628326, "learning_rate": 1.8114721821777964e-05, "loss": 0.499, "step": 2753 }, { "epoch": 0.6143207673432969, "grad_norm": 0.16315460205078125, "learning_rate": 1.811334627410441e-05, "loss": 0.51, "step": 2754 }, { "epoch": 0.6145438322551863, "grad_norm": 0.44980496168136597, "learning_rate": 1.81119702770627e-05, "loss": 0.5005, "step": 2755 }, { "epoch": 0.6147668971670757, "grad_norm": 0.17961294949054718, "learning_rate": 1.8110593830729057e-05, "loss": 0.5204, "step": 2756 }, { "epoch": 0.614989962078965, "grad_norm": 0.16227351129055023, "learning_rate": 1.8109216935179712e-05, "loss": 0.5081, "step": 2757 }, { "epoch": 0.6152130269908543, "grad_norm": 0.1761593520641327, "learning_rate": 1.810783959049092e-05, "loss": 0.5163, "step": 2758 }, { "epoch": 0.6154360919027437, "grad_norm": 0.17095595598220825, "learning_rate": 1.810646179673897e-05, "loss": 0.4948, "step": 2759 }, { "epoch": 0.615659156814633, "grad_norm": 0.16665005683898926, "learning_rate": 1.8105083554000175e-05, "loss": 0.4937, "step": 2760 }, { "epoch": 0.6158822217265224, "grad_norm": 0.16761796176433563, "learning_rate": 1.810370486235087e-05, "loss": 0.5081, "step": 2761 }, { "epoch": 0.6161052866384118, "grad_norm": 0.17292195558547974, "learning_rate": 1.8102325721867417e-05, "loss": 0.5027, "step": 2762 }, { "epoch": 0.6163283515503012, "grad_norm": 0.1606137752532959, "learning_rate": 1.8100946132626197e-05, "loss": 0.4867, "step": 2763 }, { "epoch": 0.6165514164621905, "grad_norm": 0.1619354635477066, "learning_rate": 1.8099566094703626e-05, "loss": 0.474, "step": 2764 }, { "epoch": 0.6167744813740799, "grad_norm": 0.16491201519966125, "learning_rate": 1.8098185608176132e-05, "loss": 0.4953, "step": 2765 }, { "epoch": 0.6169975462859693, "grad_norm": 0.16283756494522095, "learning_rate": 1.8096804673120183e-05, "loss": 0.4934, "step": 2766 }, { "epoch": 0.6172206111978585, "grad_norm": 0.15283644199371338, "learning_rate": 1.809542328961226e-05, "loss": 0.4691, "step": 2767 }, { "epoch": 0.6174436761097479, "grad_norm": 0.16907426714897156, "learning_rate": 1.809404145772887e-05, "loss": 0.4901, "step": 2768 }, { "epoch": 0.6176667410216373, "grad_norm": 0.16500405967235565, "learning_rate": 1.8092659177546554e-05, "loss": 0.5042, "step": 2769 }, { "epoch": 0.6178898059335266, "grad_norm": 0.16616129875183105, "learning_rate": 1.8091276449141868e-05, "loss": 0.528, "step": 2770 }, { "epoch": 0.618112870845416, "grad_norm": 0.15396897494792938, "learning_rate": 1.8089893272591393e-05, "loss": 0.4797, "step": 2771 }, { "epoch": 0.6183359357573054, "grad_norm": 0.1576898992061615, "learning_rate": 1.8088509647971744e-05, "loss": 0.5007, "step": 2772 }, { "epoch": 0.6185590006691948, "grad_norm": 0.1703861653804779, "learning_rate": 1.808712557535955e-05, "loss": 0.5114, "step": 2773 }, { "epoch": 0.6187820655810841, "grad_norm": 0.15141689777374268, "learning_rate": 1.8085741054831472e-05, "loss": 0.457, "step": 2774 }, { "epoch": 0.6190051304929735, "grad_norm": 0.19050109386444092, "learning_rate": 1.8084356086464197e-05, "loss": 0.474, "step": 2775 }, { "epoch": 0.6192281954048628, "grad_norm": 0.17825506627559662, "learning_rate": 1.8082970670334425e-05, "loss": 0.5164, "step": 2776 }, { "epoch": 0.6194512603167521, "grad_norm": 0.16141526401042938, "learning_rate": 1.8081584806518897e-05, "loss": 0.5025, "step": 2777 }, { "epoch": 0.6196743252286415, "grad_norm": 0.23704519867897034, "learning_rate": 1.8080198495094364e-05, "loss": 0.5315, "step": 2778 }, { "epoch": 0.6198973901405309, "grad_norm": 0.15994718670845032, "learning_rate": 1.8078811736137612e-05, "loss": 0.4853, "step": 2779 }, { "epoch": 0.6201204550524203, "grad_norm": 0.1649598479270935, "learning_rate": 1.807742452972545e-05, "loss": 0.5262, "step": 2780 }, { "epoch": 0.6203435199643096, "grad_norm": 0.14894191920757294, "learning_rate": 1.8076036875934707e-05, "loss": 0.4816, "step": 2781 }, { "epoch": 0.620566584876199, "grad_norm": 0.17157211899757385, "learning_rate": 1.807464877484224e-05, "loss": 0.4847, "step": 2782 }, { "epoch": 0.6207896497880884, "grad_norm": 0.1663542538881302, "learning_rate": 1.8073260226524937e-05, "loss": 0.4931, "step": 2783 }, { "epoch": 0.6210127146999777, "grad_norm": 0.1610024869441986, "learning_rate": 1.8071871231059695e-05, "loss": 0.4924, "step": 2784 }, { "epoch": 0.621235779611867, "grad_norm": 0.16384142637252808, "learning_rate": 1.807048178852345e-05, "loss": 0.5143, "step": 2785 }, { "epoch": 0.6214588445237564, "grad_norm": 0.16362221539020538, "learning_rate": 1.8069091898993162e-05, "loss": 0.5172, "step": 2786 }, { "epoch": 0.6216819094356457, "grad_norm": 0.19173026084899902, "learning_rate": 1.8067701562545808e-05, "loss": 0.512, "step": 2787 }, { "epoch": 0.6219049743475351, "grad_norm": 0.15911833941936493, "learning_rate": 1.8066310779258393e-05, "loss": 0.4874, "step": 2788 }, { "epoch": 0.6221280392594245, "grad_norm": 0.15391753613948822, "learning_rate": 1.8064919549207946e-05, "loss": 0.4966, "step": 2789 }, { "epoch": 0.6223511041713139, "grad_norm": 0.21308940649032593, "learning_rate": 1.8063527872471523e-05, "loss": 0.5172, "step": 2790 }, { "epoch": 0.6225741690832032, "grad_norm": 0.15762938559055328, "learning_rate": 1.8062135749126208e-05, "loss": 0.4904, "step": 2791 }, { "epoch": 0.6227972339950926, "grad_norm": 0.18538956344127655, "learning_rate": 1.80607431792491e-05, "loss": 0.4907, "step": 2792 }, { "epoch": 0.623020298906982, "grad_norm": 0.1694374829530716, "learning_rate": 1.8059350162917333e-05, "loss": 0.4892, "step": 2793 }, { "epoch": 0.6232433638188712, "grad_norm": 0.1632799506187439, "learning_rate": 1.8057956700208055e-05, "loss": 0.531, "step": 2794 }, { "epoch": 0.6234664287307606, "grad_norm": 0.19030988216400146, "learning_rate": 1.805656279119845e-05, "loss": 0.4801, "step": 2795 }, { "epoch": 0.62368949364265, "grad_norm": 0.17578838765621185, "learning_rate": 1.8055168435965722e-05, "loss": 0.5129, "step": 2796 }, { "epoch": 0.6239125585545394, "grad_norm": 0.17337745428085327, "learning_rate": 1.8053773634587095e-05, "loss": 0.5252, "step": 2797 }, { "epoch": 0.6241356234664287, "grad_norm": 0.16400404274463654, "learning_rate": 1.8052378387139827e-05, "loss": 0.4977, "step": 2798 }, { "epoch": 0.6243586883783181, "grad_norm": 0.16884362697601318, "learning_rate": 1.8050982693701188e-05, "loss": 0.5209, "step": 2799 }, { "epoch": 0.6245817532902075, "grad_norm": 0.16655333340168, "learning_rate": 1.8049586554348487e-05, "loss": 0.5236, "step": 2800 }, { "epoch": 0.6248048182020968, "grad_norm": 0.1815658062696457, "learning_rate": 1.804818996915905e-05, "loss": 0.5155, "step": 2801 }, { "epoch": 0.6250278831139862, "grad_norm": 0.1637655794620514, "learning_rate": 1.8046792938210226e-05, "loss": 0.5108, "step": 2802 }, { "epoch": 0.6252509480258756, "grad_norm": 0.16214872896671295, "learning_rate": 1.804539546157939e-05, "loss": 0.5015, "step": 2803 }, { "epoch": 0.625474012937765, "grad_norm": 0.1624768078327179, "learning_rate": 1.804399753934395e-05, "loss": 0.4852, "step": 2804 }, { "epoch": 0.6256970778496542, "grad_norm": 0.1509588062763214, "learning_rate": 1.8042599171581322e-05, "loss": 0.4895, "step": 2805 }, { "epoch": 0.6259201427615436, "grad_norm": 0.15983150899410248, "learning_rate": 1.804120035836897e-05, "loss": 0.5066, "step": 2806 }, { "epoch": 0.626143207673433, "grad_norm": 0.1639653593301773, "learning_rate": 1.8039801099784356e-05, "loss": 0.5081, "step": 2807 }, { "epoch": 0.6263662725853223, "grad_norm": 0.16054677963256836, "learning_rate": 1.8038401395904984e-05, "loss": 0.5019, "step": 2808 }, { "epoch": 0.6265893374972117, "grad_norm": 0.16557928919792175, "learning_rate": 1.8037001246808382e-05, "loss": 0.5085, "step": 2809 }, { "epoch": 0.6268124024091011, "grad_norm": 0.17172253131866455, "learning_rate": 1.8035600652572093e-05, "loss": 0.5069, "step": 2810 }, { "epoch": 0.6270354673209904, "grad_norm": 0.16289275884628296, "learning_rate": 1.80341996132737e-05, "loss": 0.5054, "step": 2811 }, { "epoch": 0.6272585322328798, "grad_norm": 0.159224271774292, "learning_rate": 1.8032798128990788e-05, "loss": 0.4884, "step": 2812 }, { "epoch": 0.6274815971447691, "grad_norm": 0.1516553908586502, "learning_rate": 1.803139619980099e-05, "loss": 0.4823, "step": 2813 }, { "epoch": 0.6277046620566585, "grad_norm": 0.164667546749115, "learning_rate": 1.802999382578195e-05, "loss": 0.5184, "step": 2814 }, { "epoch": 0.6279277269685478, "grad_norm": 0.17175255715847015, "learning_rate": 1.8028591007011343e-05, "loss": 0.5166, "step": 2815 }, { "epoch": 0.6281507918804372, "grad_norm": 0.18747776746749878, "learning_rate": 1.8027187743566867e-05, "loss": 0.499, "step": 2816 }, { "epoch": 0.6283738567923266, "grad_norm": 0.16748382151126862, "learning_rate": 1.8025784035526235e-05, "loss": 0.5099, "step": 2817 }, { "epoch": 0.6285969217042159, "grad_norm": 0.19813144207000732, "learning_rate": 1.80243798829672e-05, "loss": 0.5045, "step": 2818 }, { "epoch": 0.6288199866161053, "grad_norm": 0.16501356661319733, "learning_rate": 1.8022975285967534e-05, "loss": 0.514, "step": 2819 }, { "epoch": 0.6290430515279947, "grad_norm": 0.17181000113487244, "learning_rate": 1.8021570244605028e-05, "loss": 0.4963, "step": 2820 }, { "epoch": 0.6292661164398841, "grad_norm": 0.16481012105941772, "learning_rate": 1.8020164758957505e-05, "loss": 0.4831, "step": 2821 }, { "epoch": 0.6294891813517733, "grad_norm": 0.1659911572933197, "learning_rate": 1.8018758829102808e-05, "loss": 0.5026, "step": 2822 }, { "epoch": 0.6297122462636627, "grad_norm": 0.18394367396831512, "learning_rate": 1.8017352455118812e-05, "loss": 0.5072, "step": 2823 }, { "epoch": 0.6299353111755521, "grad_norm": 0.1666680872440338, "learning_rate": 1.80159456370834e-05, "loss": 0.4986, "step": 2824 }, { "epoch": 0.6301583760874414, "grad_norm": 0.184769406914711, "learning_rate": 1.80145383750745e-05, "loss": 0.4863, "step": 2825 }, { "epoch": 0.6303814409993308, "grad_norm": 0.15709228813648224, "learning_rate": 1.801313066917005e-05, "loss": 0.4945, "step": 2826 }, { "epoch": 0.6306045059112202, "grad_norm": 0.16040046513080597, "learning_rate": 1.801172251944802e-05, "loss": 0.4704, "step": 2827 }, { "epoch": 0.6308275708231095, "grad_norm": 0.174714133143425, "learning_rate": 1.8010313925986398e-05, "loss": 0.5224, "step": 2828 }, { "epoch": 0.6310506357349989, "grad_norm": 0.17879654467105865, "learning_rate": 1.8008904888863206e-05, "loss": 0.5168, "step": 2829 }, { "epoch": 0.6312737006468883, "grad_norm": 0.16159847378730774, "learning_rate": 1.8007495408156483e-05, "loss": 0.4905, "step": 2830 }, { "epoch": 0.6314967655587777, "grad_norm": 0.16279365122318268, "learning_rate": 1.8006085483944295e-05, "loss": 0.5004, "step": 2831 }, { "epoch": 0.6317198304706669, "grad_norm": 0.16124996542930603, "learning_rate": 1.800467511630473e-05, "loss": 0.498, "step": 2832 }, { "epoch": 0.6319428953825563, "grad_norm": 0.16358453035354614, "learning_rate": 1.800326430531591e-05, "loss": 0.5172, "step": 2833 }, { "epoch": 0.6321659602944457, "grad_norm": 0.16300733387470245, "learning_rate": 1.8001853051055967e-05, "loss": 0.5024, "step": 2834 }, { "epoch": 0.632389025206335, "grad_norm": 0.1758897304534912, "learning_rate": 1.8000441353603072e-05, "loss": 0.4946, "step": 2835 }, { "epoch": 0.6326120901182244, "grad_norm": 0.1550527960062027, "learning_rate": 1.7999029213035408e-05, "loss": 0.5027, "step": 2836 }, { "epoch": 0.6328351550301138, "grad_norm": 0.15276572108268738, "learning_rate": 1.799761662943119e-05, "loss": 0.4968, "step": 2837 }, { "epoch": 0.6330582199420032, "grad_norm": 0.16600914299488068, "learning_rate": 1.7996203602868657e-05, "loss": 0.5278, "step": 2838 }, { "epoch": 0.6332812848538925, "grad_norm": 0.16788353025913239, "learning_rate": 1.799479013342607e-05, "loss": 0.5071, "step": 2839 }, { "epoch": 0.6335043497657818, "grad_norm": 0.16952337324619293, "learning_rate": 1.7993376221181716e-05, "loss": 0.4852, "step": 2840 }, { "epoch": 0.6337274146776712, "grad_norm": 0.28554677963256836, "learning_rate": 1.7991961866213907e-05, "loss": 0.474, "step": 2841 }, { "epoch": 0.6339504795895605, "grad_norm": 0.17595677077770233, "learning_rate": 1.7990547068600977e-05, "loss": 0.5045, "step": 2842 }, { "epoch": 0.6341735445014499, "grad_norm": 0.1631106734275818, "learning_rate": 1.798913182842129e-05, "loss": 0.4943, "step": 2843 }, { "epoch": 0.6343966094133393, "grad_norm": 0.1630406528711319, "learning_rate": 1.7987716145753226e-05, "loss": 0.5095, "step": 2844 }, { "epoch": 0.6346196743252286, "grad_norm": 0.17511357367038727, "learning_rate": 1.7986300020675198e-05, "loss": 0.5203, "step": 2845 }, { "epoch": 0.634842739237118, "grad_norm": 0.1971133053302765, "learning_rate": 1.798488345326564e-05, "loss": 0.5061, "step": 2846 }, { "epoch": 0.6350658041490074, "grad_norm": 0.15673868358135223, "learning_rate": 1.7983466443603008e-05, "loss": 0.4957, "step": 2847 }, { "epoch": 0.6352888690608968, "grad_norm": 0.1608821600675583, "learning_rate": 1.798204899176579e-05, "loss": 0.4848, "step": 2848 }, { "epoch": 0.635511933972786, "grad_norm": 0.15686407685279846, "learning_rate": 1.7980631097832485e-05, "loss": 0.4986, "step": 2849 }, { "epoch": 0.6357349988846754, "grad_norm": 0.16254571080207825, "learning_rate": 1.797921276188163e-05, "loss": 0.5255, "step": 2850 }, { "epoch": 0.6359580637965648, "grad_norm": 0.17136751115322113, "learning_rate": 1.7977793983991785e-05, "loss": 0.5029, "step": 2851 }, { "epoch": 0.6361811287084541, "grad_norm": 0.2001720815896988, "learning_rate": 1.7976374764241523e-05, "loss": 0.5208, "step": 2852 }, { "epoch": 0.6364041936203435, "grad_norm": 0.15888839960098267, "learning_rate": 1.7974955102709457e-05, "loss": 0.5125, "step": 2853 }, { "epoch": 0.6366272585322329, "grad_norm": 0.1562851518392563, "learning_rate": 1.797353499947421e-05, "loss": 0.4647, "step": 2854 }, { "epoch": 0.6368503234441223, "grad_norm": 0.157870814204216, "learning_rate": 1.7972114454614436e-05, "loss": 0.5072, "step": 2855 }, { "epoch": 0.6370733883560116, "grad_norm": 0.16168496012687683, "learning_rate": 1.7970693468208823e-05, "loss": 0.4946, "step": 2856 }, { "epoch": 0.637296453267901, "grad_norm": 0.23463092744350433, "learning_rate": 1.796927204033607e-05, "loss": 0.5073, "step": 2857 }, { "epoch": 0.6375195181797904, "grad_norm": 0.18455378711223602, "learning_rate": 1.7967850171074896e-05, "loss": 0.5278, "step": 2858 }, { "epoch": 0.6377425830916796, "grad_norm": 0.16380822658538818, "learning_rate": 1.796642786050406e-05, "loss": 0.5048, "step": 2859 }, { "epoch": 0.637965648003569, "grad_norm": 0.15988902747631073, "learning_rate": 1.7965005108702342e-05, "loss": 0.4787, "step": 2860 }, { "epoch": 0.6381887129154584, "grad_norm": 0.16252587735652924, "learning_rate": 1.796358191574854e-05, "loss": 0.5075, "step": 2861 }, { "epoch": 0.6384117778273477, "grad_norm": 0.1922154724597931, "learning_rate": 1.7962158281721475e-05, "loss": 0.4616, "step": 2862 }, { "epoch": 0.6386348427392371, "grad_norm": 0.17007192969322205, "learning_rate": 1.7960734206700002e-05, "loss": 0.4741, "step": 2863 }, { "epoch": 0.6388579076511265, "grad_norm": 0.16017115116119385, "learning_rate": 1.7959309690762992e-05, "loss": 0.4757, "step": 2864 }, { "epoch": 0.6390809725630159, "grad_norm": 0.16466043889522552, "learning_rate": 1.795788473398935e-05, "loss": 0.4856, "step": 2865 }, { "epoch": 0.6393040374749052, "grad_norm": 0.16355857253074646, "learning_rate": 1.795645933645799e-05, "loss": 0.5055, "step": 2866 }, { "epoch": 0.6395271023867946, "grad_norm": 0.1593533307313919, "learning_rate": 1.7955033498247863e-05, "loss": 0.5103, "step": 2867 }, { "epoch": 0.639750167298684, "grad_norm": 0.16536985337734222, "learning_rate": 1.7953607219437942e-05, "loss": 0.5029, "step": 2868 }, { "epoch": 0.6399732322105732, "grad_norm": 0.1590854823589325, "learning_rate": 1.7952180500107225e-05, "loss": 0.4687, "step": 2869 }, { "epoch": 0.6401962971224626, "grad_norm": 0.16659674048423767, "learning_rate": 1.7950753340334734e-05, "loss": 0.4999, "step": 2870 }, { "epoch": 0.640419362034352, "grad_norm": 0.17274489998817444, "learning_rate": 1.7949325740199507e-05, "loss": 0.4995, "step": 2871 }, { "epoch": 0.6406424269462414, "grad_norm": 0.16115543246269226, "learning_rate": 1.7947897699780616e-05, "loss": 0.484, "step": 2872 }, { "epoch": 0.6408654918581307, "grad_norm": 0.16735753417015076, "learning_rate": 1.7946469219157158e-05, "loss": 0.4917, "step": 2873 }, { "epoch": 0.6410885567700201, "grad_norm": 0.1667783111333847, "learning_rate": 1.7945040298408248e-05, "loss": 0.5106, "step": 2874 }, { "epoch": 0.6413116216819095, "grad_norm": 0.16274797916412354, "learning_rate": 1.794361093761303e-05, "loss": 0.5006, "step": 2875 }, { "epoch": 0.6415346865937988, "grad_norm": 0.15851566195487976, "learning_rate": 1.7942181136850672e-05, "loss": 0.5007, "step": 2876 }, { "epoch": 0.6417577515056881, "grad_norm": 0.16172711551189423, "learning_rate": 1.7940750896200363e-05, "loss": 0.5043, "step": 2877 }, { "epoch": 0.6419808164175775, "grad_norm": 0.18239612877368927, "learning_rate": 1.7939320215741322e-05, "loss": 0.4888, "step": 2878 }, { "epoch": 0.6422038813294669, "grad_norm": 0.17973902821540833, "learning_rate": 1.7937889095552787e-05, "loss": 0.4813, "step": 2879 }, { "epoch": 0.6424269462413562, "grad_norm": 0.1610574722290039, "learning_rate": 1.7936457535714023e-05, "loss": 0.5138, "step": 2880 }, { "epoch": 0.6426500111532456, "grad_norm": 0.16427603363990784, "learning_rate": 1.7935025536304317e-05, "loss": 0.483, "step": 2881 }, { "epoch": 0.642873076065135, "grad_norm": 0.18190504610538483, "learning_rate": 1.7933593097402983e-05, "loss": 0.4968, "step": 2882 }, { "epoch": 0.6430961409770243, "grad_norm": 0.16263769567012787, "learning_rate": 1.793216021908936e-05, "loss": 0.5245, "step": 2883 }, { "epoch": 0.6433192058889137, "grad_norm": 0.16960637271404266, "learning_rate": 1.793072690144281e-05, "loss": 0.5353, "step": 2884 }, { "epoch": 0.6435422708008031, "grad_norm": 0.1611735224723816, "learning_rate": 1.7929293144542715e-05, "loss": 0.4701, "step": 2885 }, { "epoch": 0.6437653357126923, "grad_norm": 0.16312386095523834, "learning_rate": 1.792785894846849e-05, "loss": 0.5167, "step": 2886 }, { "epoch": 0.6439884006245817, "grad_norm": 0.17161010205745697, "learning_rate": 1.7926424313299568e-05, "loss": 0.5092, "step": 2887 }, { "epoch": 0.6442114655364711, "grad_norm": 0.1609477698802948, "learning_rate": 1.7924989239115407e-05, "loss": 0.5295, "step": 2888 }, { "epoch": 0.6444345304483605, "grad_norm": 0.1607387214899063, "learning_rate": 1.7923553725995494e-05, "loss": 0.5205, "step": 2889 }, { "epoch": 0.6446575953602498, "grad_norm": 0.16367650032043457, "learning_rate": 1.7922117774019333e-05, "loss": 0.4875, "step": 2890 }, { "epoch": 0.6448806602721392, "grad_norm": 0.17014965415000916, "learning_rate": 1.7920681383266458e-05, "loss": 0.5218, "step": 2891 }, { "epoch": 0.6451037251840286, "grad_norm": 0.1545102298259735, "learning_rate": 1.7919244553816426e-05, "loss": 0.4954, "step": 2892 }, { "epoch": 0.6453267900959179, "grad_norm": 0.2512567937374115, "learning_rate": 1.7917807285748817e-05, "loss": 0.5173, "step": 2893 }, { "epoch": 0.6455498550078073, "grad_norm": 0.17359793186187744, "learning_rate": 1.7916369579143235e-05, "loss": 0.5253, "step": 2894 }, { "epoch": 0.6457729199196967, "grad_norm": 0.17602702975273132, "learning_rate": 1.7914931434079305e-05, "loss": 0.492, "step": 2895 }, { "epoch": 0.645995984831586, "grad_norm": 0.1598963886499405, "learning_rate": 1.791349285063669e-05, "loss": 0.5071, "step": 2896 }, { "epoch": 0.6462190497434753, "grad_norm": 0.16161389648914337, "learning_rate": 1.7912053828895064e-05, "loss": 0.4896, "step": 2897 }, { "epoch": 0.6464421146553647, "grad_norm": 0.16500675678253174, "learning_rate": 1.7910614368934127e-05, "loss": 0.5142, "step": 2898 }, { "epoch": 0.6466651795672541, "grad_norm": 0.15490639209747314, "learning_rate": 1.7909174470833604e-05, "loss": 0.4708, "step": 2899 }, { "epoch": 0.6468882444791434, "grad_norm": 0.1680765300989151, "learning_rate": 1.7907734134673252e-05, "loss": 0.4978, "step": 2900 }, { "epoch": 0.6471113093910328, "grad_norm": 0.16419318318367004, "learning_rate": 1.790629336053284e-05, "loss": 0.5106, "step": 2901 }, { "epoch": 0.6473343743029222, "grad_norm": 0.1587122082710266, "learning_rate": 1.790485214849217e-05, "loss": 0.456, "step": 2902 }, { "epoch": 0.6475574392148115, "grad_norm": 0.17150086164474487, "learning_rate": 1.7903410498631063e-05, "loss": 0.5066, "step": 2903 }, { "epoch": 0.6477805041267009, "grad_norm": 0.16548244655132294, "learning_rate": 1.790196841102937e-05, "loss": 0.5297, "step": 2904 }, { "epoch": 0.6480035690385902, "grad_norm": 0.1526126116514206, "learning_rate": 1.790052588576696e-05, "loss": 0.4693, "step": 2905 }, { "epoch": 0.6482266339504796, "grad_norm": 0.16352921724319458, "learning_rate": 1.7899082922923732e-05, "loss": 0.5291, "step": 2906 }, { "epoch": 0.6484496988623689, "grad_norm": 0.16485454142093658, "learning_rate": 1.78976395225796e-05, "loss": 0.4752, "step": 2907 }, { "epoch": 0.6486727637742583, "grad_norm": 0.1610778123140335, "learning_rate": 1.7896195684814516e-05, "loss": 0.479, "step": 2908 }, { "epoch": 0.6488958286861477, "grad_norm": 0.16191574931144714, "learning_rate": 1.7894751409708447e-05, "loss": 0.5059, "step": 2909 }, { "epoch": 0.649118893598037, "grad_norm": 0.1598249077796936, "learning_rate": 1.7893306697341385e-05, "loss": 0.487, "step": 2910 }, { "epoch": 0.6493419585099264, "grad_norm": 0.1673557460308075, "learning_rate": 1.7891861547793345e-05, "loss": 0.4926, "step": 2911 }, { "epoch": 0.6495650234218158, "grad_norm": 0.16955086588859558, "learning_rate": 1.789041596114437e-05, "loss": 0.4912, "step": 2912 }, { "epoch": 0.6497880883337052, "grad_norm": 0.15351925790309906, "learning_rate": 1.788896993747453e-05, "loss": 0.4758, "step": 2913 }, { "epoch": 0.6500111532455944, "grad_norm": 0.16994906961917877, "learning_rate": 1.7887523476863907e-05, "loss": 0.5038, "step": 2914 }, { "epoch": 0.6502342181574838, "grad_norm": 0.16765102744102478, "learning_rate": 1.7886076579392622e-05, "loss": 0.4787, "step": 2915 }, { "epoch": 0.6504572830693732, "grad_norm": 0.16875889897346497, "learning_rate": 1.7884629245140812e-05, "loss": 0.5267, "step": 2916 }, { "epoch": 0.6506803479812625, "grad_norm": 0.16783274710178375, "learning_rate": 1.7883181474188637e-05, "loss": 0.4913, "step": 2917 }, { "epoch": 0.6509034128931519, "grad_norm": 0.16633883118629456, "learning_rate": 1.7881733266616284e-05, "loss": 0.5019, "step": 2918 }, { "epoch": 0.6511264778050413, "grad_norm": 0.1654990315437317, "learning_rate": 1.7880284622503966e-05, "loss": 0.5033, "step": 2919 }, { "epoch": 0.6513495427169306, "grad_norm": 0.16043652594089508, "learning_rate": 1.7878835541931915e-05, "loss": 0.4957, "step": 2920 }, { "epoch": 0.65157260762882, "grad_norm": 0.16924645006656647, "learning_rate": 1.7877386024980392e-05, "loss": 0.5223, "step": 2921 }, { "epoch": 0.6517956725407094, "grad_norm": 0.1635693907737732, "learning_rate": 1.7875936071729682e-05, "loss": 0.4908, "step": 2922 }, { "epoch": 0.6520187374525988, "grad_norm": 0.1630011945962906, "learning_rate": 1.7874485682260087e-05, "loss": 0.5138, "step": 2923 }, { "epoch": 0.652241802364488, "grad_norm": 0.16332125663757324, "learning_rate": 1.7873034856651944e-05, "loss": 0.5119, "step": 2924 }, { "epoch": 0.6524648672763774, "grad_norm": 0.1802121102809906, "learning_rate": 1.787158359498561e-05, "loss": 0.4766, "step": 2925 }, { "epoch": 0.6526879321882668, "grad_norm": 0.16062189638614655, "learning_rate": 1.7870131897341458e-05, "loss": 0.4747, "step": 2926 }, { "epoch": 0.6529109971001561, "grad_norm": 0.16367454826831818, "learning_rate": 1.7868679763799898e-05, "loss": 0.4698, "step": 2927 }, { "epoch": 0.6531340620120455, "grad_norm": 0.16663846373558044, "learning_rate": 1.786722719444136e-05, "loss": 0.4903, "step": 2928 }, { "epoch": 0.6533571269239349, "grad_norm": 0.16041219234466553, "learning_rate": 1.786577418934629e-05, "loss": 0.496, "step": 2929 }, { "epoch": 0.6535801918358243, "grad_norm": 0.1660057008266449, "learning_rate": 1.7864320748595168e-05, "loss": 0.497, "step": 2930 }, { "epoch": 0.6538032567477136, "grad_norm": 0.17890667915344238, "learning_rate": 1.7862866872268493e-05, "loss": 0.5183, "step": 2931 }, { "epoch": 0.654026321659603, "grad_norm": 0.15913154184818268, "learning_rate": 1.7861412560446794e-05, "loss": 0.4827, "step": 2932 }, { "epoch": 0.6542493865714923, "grad_norm": 0.16374540328979492, "learning_rate": 1.7859957813210614e-05, "loss": 0.5119, "step": 2933 }, { "epoch": 0.6544724514833816, "grad_norm": 0.1748073399066925, "learning_rate": 1.7858502630640533e-05, "loss": 0.493, "step": 2934 }, { "epoch": 0.654695516395271, "grad_norm": 0.1679995357990265, "learning_rate": 1.7857047012817144e-05, "loss": 0.4878, "step": 2935 }, { "epoch": 0.6549185813071604, "grad_norm": 0.16782502830028534, "learning_rate": 1.7855590959821068e-05, "loss": 0.4942, "step": 2936 }, { "epoch": 0.6551416462190497, "grad_norm": 0.1666112244129181, "learning_rate": 1.785413447173295e-05, "loss": 0.5109, "step": 2937 }, { "epoch": 0.6553647111309391, "grad_norm": 0.16338278353214264, "learning_rate": 1.785267754863346e-05, "loss": 0.509, "step": 2938 }, { "epoch": 0.6555877760428285, "grad_norm": 0.16831496357917786, "learning_rate": 1.7851220190603295e-05, "loss": 0.4933, "step": 2939 }, { "epoch": 0.6558108409547179, "grad_norm": 0.16522802412509918, "learning_rate": 1.7849762397723168e-05, "loss": 0.5204, "step": 2940 }, { "epoch": 0.6560339058666071, "grad_norm": 0.1579723209142685, "learning_rate": 1.7848304170073822e-05, "loss": 0.4747, "step": 2941 }, { "epoch": 0.6562569707784965, "grad_norm": 0.16369852423667908, "learning_rate": 1.784684550773602e-05, "loss": 0.4763, "step": 2942 }, { "epoch": 0.6564800356903859, "grad_norm": 0.16621820628643036, "learning_rate": 1.7845386410790558e-05, "loss": 0.5113, "step": 2943 }, { "epoch": 0.6567031006022752, "grad_norm": 0.161569282412529, "learning_rate": 1.784392687931825e-05, "loss": 0.4899, "step": 2944 }, { "epoch": 0.6569261655141646, "grad_norm": 0.1616351306438446, "learning_rate": 1.7842466913399928e-05, "loss": 0.4879, "step": 2945 }, { "epoch": 0.657149230426054, "grad_norm": 0.16426199674606323, "learning_rate": 1.7841006513116456e-05, "loss": 0.5265, "step": 2946 }, { "epoch": 0.6573722953379434, "grad_norm": 0.1592702567577362, "learning_rate": 1.7839545678548727e-05, "loss": 0.5036, "step": 2947 }, { "epoch": 0.6575953602498327, "grad_norm": 0.15806902945041656, "learning_rate": 1.7838084409777637e-05, "loss": 0.4954, "step": 2948 }, { "epoch": 0.6578184251617221, "grad_norm": 0.17478401958942413, "learning_rate": 1.7836622706884138e-05, "loss": 0.5046, "step": 2949 }, { "epoch": 0.6580414900736115, "grad_norm": 0.16117317974567413, "learning_rate": 1.7835160569949174e-05, "loss": 0.4925, "step": 2950 }, { "epoch": 0.6582645549855007, "grad_norm": 0.1660463809967041, "learning_rate": 1.783369799905373e-05, "loss": 0.5003, "step": 2951 }, { "epoch": 0.6584876198973901, "grad_norm": 0.1700252741575241, "learning_rate": 1.7832234994278822e-05, "loss": 0.5169, "step": 2952 }, { "epoch": 0.6587106848092795, "grad_norm": 0.17790904641151428, "learning_rate": 1.7830771555705468e-05, "loss": 0.5088, "step": 2953 }, { "epoch": 0.6589337497211689, "grad_norm": 0.16493864357471466, "learning_rate": 1.782930768341473e-05, "loss": 0.5082, "step": 2954 }, { "epoch": 0.6591568146330582, "grad_norm": 0.16437609493732452, "learning_rate": 1.7827843377487683e-05, "loss": 0.4984, "step": 2955 }, { "epoch": 0.6593798795449476, "grad_norm": 0.16439390182495117, "learning_rate": 1.7826378638005432e-05, "loss": 0.5005, "step": 2956 }, { "epoch": 0.659602944456837, "grad_norm": 0.15785926580429077, "learning_rate": 1.78249134650491e-05, "loss": 0.5005, "step": 2957 }, { "epoch": 0.6598260093687263, "grad_norm": 0.20435695350170135, "learning_rate": 1.782344785869984e-05, "loss": 0.4711, "step": 2958 }, { "epoch": 0.6600490742806157, "grad_norm": 0.15359720587730408, "learning_rate": 1.7821981819038828e-05, "loss": 0.4738, "step": 2959 }, { "epoch": 0.660272139192505, "grad_norm": 0.17044633626937866, "learning_rate": 1.7820515346147262e-05, "loss": 0.4951, "step": 2960 }, { "epoch": 0.6604952041043943, "grad_norm": 0.17282849550247192, "learning_rate": 1.781904844010636e-05, "loss": 0.496, "step": 2961 }, { "epoch": 0.6607182690162837, "grad_norm": 0.1486678570508957, "learning_rate": 1.7817581100997374e-05, "loss": 0.4671, "step": 2962 }, { "epoch": 0.6609413339281731, "grad_norm": 0.16091682016849518, "learning_rate": 1.781611332890157e-05, "loss": 0.4824, "step": 2963 }, { "epoch": 0.6611643988400625, "grad_norm": 0.15881048142910004, "learning_rate": 1.7814645123900246e-05, "loss": 0.4864, "step": 2964 }, { "epoch": 0.6613874637519518, "grad_norm": 0.17700685560703278, "learning_rate": 1.781317648607472e-05, "loss": 0.4962, "step": 2965 }, { "epoch": 0.6616105286638412, "grad_norm": 0.15473395586013794, "learning_rate": 1.781170741550633e-05, "loss": 0.4737, "step": 2966 }, { "epoch": 0.6618335935757306, "grad_norm": 0.1670142114162445, "learning_rate": 1.781023791227645e-05, "loss": 0.5105, "step": 2967 }, { "epoch": 0.6620566584876199, "grad_norm": 0.18345218896865845, "learning_rate": 1.780876797646646e-05, "loss": 0.5086, "step": 2968 }, { "epoch": 0.6622797233995092, "grad_norm": 0.16229334473609924, "learning_rate": 1.7807297608157784e-05, "loss": 0.5007, "step": 2969 }, { "epoch": 0.6625027883113986, "grad_norm": 0.16335418820381165, "learning_rate": 1.7805826807431856e-05, "loss": 0.4976, "step": 2970 }, { "epoch": 0.662725853223288, "grad_norm": 0.16784057021141052, "learning_rate": 1.780435557437014e-05, "loss": 0.5106, "step": 2971 }, { "epoch": 0.6629489181351773, "grad_norm": 0.16552112996578217, "learning_rate": 1.7802883909054118e-05, "loss": 0.5087, "step": 2972 }, { "epoch": 0.6631719830470667, "grad_norm": 0.206945538520813, "learning_rate": 1.7801411811565308e-05, "loss": 0.4753, "step": 2973 }, { "epoch": 0.6633950479589561, "grad_norm": 0.1575002670288086, "learning_rate": 1.7799939281985236e-05, "loss": 0.4875, "step": 2974 }, { "epoch": 0.6636181128708454, "grad_norm": 0.1623086929321289, "learning_rate": 1.7798466320395463e-05, "loss": 0.4933, "step": 2975 }, { "epoch": 0.6638411777827348, "grad_norm": 0.1668468862771988, "learning_rate": 1.779699292687757e-05, "loss": 0.462, "step": 2976 }, { "epoch": 0.6640642426946242, "grad_norm": 0.43826133012771606, "learning_rate": 1.7795519101513166e-05, "loss": 0.5073, "step": 2977 }, { "epoch": 0.6642873076065134, "grad_norm": 0.18710243701934814, "learning_rate": 1.779404484438388e-05, "loss": 0.5103, "step": 2978 }, { "epoch": 0.6645103725184028, "grad_norm": 0.17884770035743713, "learning_rate": 1.7792570155571358e-05, "loss": 0.5219, "step": 2979 }, { "epoch": 0.6647334374302922, "grad_norm": 0.1652306616306305, "learning_rate": 1.7791095035157288e-05, "loss": 0.513, "step": 2980 }, { "epoch": 0.6649565023421816, "grad_norm": 0.175328329205513, "learning_rate": 1.7789619483223367e-05, "loss": 0.5064, "step": 2981 }, { "epoch": 0.6651795672540709, "grad_norm": 0.15750913321971893, "learning_rate": 1.7788143499851318e-05, "loss": 0.4608, "step": 2982 }, { "epoch": 0.6654026321659603, "grad_norm": 0.15582314133644104, "learning_rate": 1.7786667085122895e-05, "loss": 0.4793, "step": 2983 }, { "epoch": 0.6656256970778497, "grad_norm": 0.18893486261367798, "learning_rate": 1.7785190239119864e-05, "loss": 0.5277, "step": 2984 }, { "epoch": 0.665848761989739, "grad_norm": 0.16801688075065613, "learning_rate": 1.7783712961924032e-05, "loss": 0.5281, "step": 2985 }, { "epoch": 0.6660718269016284, "grad_norm": 0.16510803997516632, "learning_rate": 1.778223525361721e-05, "loss": 0.4675, "step": 2986 }, { "epoch": 0.6662948918135178, "grad_norm": 0.16860422492027283, "learning_rate": 1.778075711428125e-05, "loss": 0.4712, "step": 2987 }, { "epoch": 0.6665179567254071, "grad_norm": 0.17091190814971924, "learning_rate": 1.777927854399802e-05, "loss": 0.5208, "step": 2988 }, { "epoch": 0.6667410216372964, "grad_norm": 0.19528479874134064, "learning_rate": 1.7777799542849408e-05, "loss": 0.482, "step": 2989 }, { "epoch": 0.6669640865491858, "grad_norm": 0.16672208905220032, "learning_rate": 1.7776320110917334e-05, "loss": 0.5002, "step": 2990 }, { "epoch": 0.6671871514610752, "grad_norm": 0.1595972180366516, "learning_rate": 1.777484024828374e-05, "loss": 0.4868, "step": 2991 }, { "epoch": 0.6674102163729645, "grad_norm": 0.16304026544094086, "learning_rate": 1.7773359955030583e-05, "loss": 0.4839, "step": 2992 }, { "epoch": 0.6676332812848539, "grad_norm": 0.17252400517463684, "learning_rate": 1.7771879231239857e-05, "loss": 0.4737, "step": 2993 }, { "epoch": 0.6678563461967433, "grad_norm": 0.17409226298332214, "learning_rate": 1.777039807699357e-05, "loss": 0.5152, "step": 2994 }, { "epoch": 0.6680794111086326, "grad_norm": 0.15897680819034576, "learning_rate": 1.7768916492373763e-05, "loss": 0.5081, "step": 2995 }, { "epoch": 0.668302476020522, "grad_norm": 0.15727804601192474, "learning_rate": 1.7767434477462493e-05, "loss": 0.4902, "step": 2996 }, { "epoch": 0.6685255409324113, "grad_norm": 0.16594363749027252, "learning_rate": 1.776595203234184e-05, "loss": 0.4982, "step": 2997 }, { "epoch": 0.6687486058443007, "grad_norm": 0.16226732730865479, "learning_rate": 1.7764469157093916e-05, "loss": 0.5304, "step": 2998 }, { "epoch": 0.66897167075619, "grad_norm": 0.15772786736488342, "learning_rate": 1.7762985851800846e-05, "loss": 0.4707, "step": 2999 }, { "epoch": 0.6691947356680794, "grad_norm": 0.19541241228580475, "learning_rate": 1.776150211654479e-05, "loss": 0.4826, "step": 3000 }, { "epoch": 0.6694178005799688, "grad_norm": 0.17381907999515533, "learning_rate": 1.7760017951407924e-05, "loss": 0.5304, "step": 3001 }, { "epoch": 0.6696408654918581, "grad_norm": 0.16170580685138702, "learning_rate": 1.7758533356472454e-05, "loss": 0.4722, "step": 3002 }, { "epoch": 0.6698639304037475, "grad_norm": 0.16046100854873657, "learning_rate": 1.7757048331820604e-05, "loss": 0.4852, "step": 3003 }, { "epoch": 0.6700869953156369, "grad_norm": 0.17672637104988098, "learning_rate": 1.775556287753462e-05, "loss": 0.5, "step": 3004 }, { "epoch": 0.6703100602275263, "grad_norm": 0.15894585847854614, "learning_rate": 1.7754076993696784e-05, "loss": 0.4819, "step": 3005 }, { "epoch": 0.6705331251394155, "grad_norm": 0.16397017240524292, "learning_rate": 1.7752590680389382e-05, "loss": 0.5348, "step": 3006 }, { "epoch": 0.6707561900513049, "grad_norm": 0.17470906674861908, "learning_rate": 1.7751103937694748e-05, "loss": 0.5056, "step": 3007 }, { "epoch": 0.6709792549631943, "grad_norm": 0.16147972643375397, "learning_rate": 1.774961676569522e-05, "loss": 0.4845, "step": 3008 }, { "epoch": 0.6712023198750836, "grad_norm": 0.15075084567070007, "learning_rate": 1.774812916447317e-05, "loss": 0.4873, "step": 3009 }, { "epoch": 0.671425384786973, "grad_norm": 0.1605006605386734, "learning_rate": 1.774664113411099e-05, "loss": 0.4885, "step": 3010 }, { "epoch": 0.6716484496988624, "grad_norm": 0.17120316624641418, "learning_rate": 1.7745152674691093e-05, "loss": 0.4952, "step": 3011 }, { "epoch": 0.6718715146107517, "grad_norm": 0.17164325714111328, "learning_rate": 1.774366378629592e-05, "loss": 0.5225, "step": 3012 }, { "epoch": 0.6720945795226411, "grad_norm": 0.16912591457366943, "learning_rate": 1.774217446900794e-05, "loss": 0.5165, "step": 3013 }, { "epoch": 0.6723176444345305, "grad_norm": 0.1592075079679489, "learning_rate": 1.7740684722909638e-05, "loss": 0.481, "step": 3014 }, { "epoch": 0.6725407093464199, "grad_norm": 0.15714260935783386, "learning_rate": 1.7739194548083526e-05, "loss": 0.5122, "step": 3015 }, { "epoch": 0.6727637742583091, "grad_norm": 0.15991204977035522, "learning_rate": 1.7737703944612135e-05, "loss": 0.5006, "step": 3016 }, { "epoch": 0.6729868391701985, "grad_norm": 0.15693789720535278, "learning_rate": 1.7736212912578028e-05, "loss": 0.4867, "step": 3017 }, { "epoch": 0.6732099040820879, "grad_norm": 0.15840266644954681, "learning_rate": 1.773472145206379e-05, "loss": 0.4937, "step": 3018 }, { "epoch": 0.6734329689939772, "grad_norm": 0.16329781711101532, "learning_rate": 1.7733229563152024e-05, "loss": 0.489, "step": 3019 }, { "epoch": 0.6736560339058666, "grad_norm": 0.17248262465000153, "learning_rate": 1.7731737245925357e-05, "loss": 0.5112, "step": 3020 }, { "epoch": 0.673879098817756, "grad_norm": 0.1815134584903717, "learning_rate": 1.7730244500466454e-05, "loss": 0.5027, "step": 3021 }, { "epoch": 0.6741021637296454, "grad_norm": 0.17082563042640686, "learning_rate": 1.772875132685798e-05, "loss": 0.5018, "step": 3022 }, { "epoch": 0.6743252286415347, "grad_norm": 0.17142242193222046, "learning_rate": 1.772725772518264e-05, "loss": 0.5106, "step": 3023 }, { "epoch": 0.674548293553424, "grad_norm": 0.17516618967056274, "learning_rate": 1.7725763695523166e-05, "loss": 0.4743, "step": 3024 }, { "epoch": 0.6747713584653134, "grad_norm": 0.16038702428340912, "learning_rate": 1.77242692379623e-05, "loss": 0.5071, "step": 3025 }, { "epoch": 0.6749944233772027, "grad_norm": 0.16445225477218628, "learning_rate": 1.7722774352582816e-05, "loss": 0.4992, "step": 3026 }, { "epoch": 0.6752174882890921, "grad_norm": 0.16889688372612, "learning_rate": 1.772127903946751e-05, "loss": 0.4909, "step": 3027 }, { "epoch": 0.6754405532009815, "grad_norm": 0.17117194831371307, "learning_rate": 1.77197832986992e-05, "loss": 0.4927, "step": 3028 }, { "epoch": 0.6756636181128709, "grad_norm": 0.16765987873077393, "learning_rate": 1.7718287130360733e-05, "loss": 0.5065, "step": 3029 }, { "epoch": 0.6758866830247602, "grad_norm": 0.1618220955133438, "learning_rate": 1.7716790534534977e-05, "loss": 0.4931, "step": 3030 }, { "epoch": 0.6761097479366496, "grad_norm": 0.16639363765716553, "learning_rate": 1.7715293511304815e-05, "loss": 0.5044, "step": 3031 }, { "epoch": 0.676332812848539, "grad_norm": 0.167351633310318, "learning_rate": 1.7713796060753173e-05, "loss": 0.5188, "step": 3032 }, { "epoch": 0.6765558777604282, "grad_norm": 0.1656593531370163, "learning_rate": 1.771229818296298e-05, "loss": 0.5271, "step": 3033 }, { "epoch": 0.6767789426723176, "grad_norm": 0.168874591588974, "learning_rate": 1.7710799878017203e-05, "loss": 0.4947, "step": 3034 }, { "epoch": 0.677002007584207, "grad_norm": 0.16307704150676727, "learning_rate": 1.7709301145998827e-05, "loss": 0.5071, "step": 3035 }, { "epoch": 0.6772250724960963, "grad_norm": 0.1524697244167328, "learning_rate": 1.7707801986990857e-05, "loss": 0.497, "step": 3036 }, { "epoch": 0.6774481374079857, "grad_norm": 0.16599062085151672, "learning_rate": 1.7706302401076327e-05, "loss": 0.4726, "step": 3037 }, { "epoch": 0.6776712023198751, "grad_norm": 0.16413775086402893, "learning_rate": 1.77048023883383e-05, "loss": 0.527, "step": 3038 }, { "epoch": 0.6778942672317645, "grad_norm": 0.1584494262933731, "learning_rate": 1.770330194885985e-05, "loss": 0.5095, "step": 3039 }, { "epoch": 0.6781173321436538, "grad_norm": 0.15526416897773743, "learning_rate": 1.7701801082724084e-05, "loss": 0.4914, "step": 3040 }, { "epoch": 0.6783403970555432, "grad_norm": 0.16402754187583923, "learning_rate": 1.7700299790014126e-05, "loss": 0.5184, "step": 3041 }, { "epoch": 0.6785634619674326, "grad_norm": 0.16508348286151886, "learning_rate": 1.769879807081313e-05, "loss": 0.4783, "step": 3042 }, { "epoch": 0.6787865268793218, "grad_norm": 0.15416789054870605, "learning_rate": 1.769729592520427e-05, "loss": 0.4818, "step": 3043 }, { "epoch": 0.6790095917912112, "grad_norm": 0.27602240443229675, "learning_rate": 1.769579335327074e-05, "loss": 0.49, "step": 3044 }, { "epoch": 0.6792326567031006, "grad_norm": 0.15309108793735504, "learning_rate": 1.7694290355095768e-05, "loss": 0.4778, "step": 3045 }, { "epoch": 0.67945572161499, "grad_norm": 0.15830327570438385, "learning_rate": 1.76927869307626e-05, "loss": 0.4909, "step": 3046 }, { "epoch": 0.6796787865268793, "grad_norm": 0.16737417876720428, "learning_rate": 1.76912830803545e-05, "loss": 0.4642, "step": 3047 }, { "epoch": 0.6799018514387687, "grad_norm": 0.16713353991508484, "learning_rate": 1.7689778803954764e-05, "loss": 0.5076, "step": 3048 }, { "epoch": 0.6801249163506581, "grad_norm": 0.15644113719463348, "learning_rate": 1.7688274101646702e-05, "loss": 0.4746, "step": 3049 }, { "epoch": 0.6803479812625474, "grad_norm": 0.1714516133069992, "learning_rate": 1.7686768973513663e-05, "loss": 0.4924, "step": 3050 }, { "epoch": 0.6805710461744368, "grad_norm": 0.1625642329454422, "learning_rate": 1.7685263419639008e-05, "loss": 0.5103, "step": 3051 }, { "epoch": 0.6807941110863261, "grad_norm": 0.15343210101127625, "learning_rate": 1.768375744010612e-05, "loss": 0.4871, "step": 3052 }, { "epoch": 0.6810171759982154, "grad_norm": 0.1622699499130249, "learning_rate": 1.7682251034998413e-05, "loss": 0.5093, "step": 3053 }, { "epoch": 0.6812402409101048, "grad_norm": 0.15701012313365936, "learning_rate": 1.768074420439932e-05, "loss": 0.5052, "step": 3054 }, { "epoch": 0.6814633058219942, "grad_norm": 0.1647741198539734, "learning_rate": 1.76792369483923e-05, "loss": 0.528, "step": 3055 }, { "epoch": 0.6816863707338836, "grad_norm": 0.16413739323616028, "learning_rate": 1.7677729267060836e-05, "loss": 0.5029, "step": 3056 }, { "epoch": 0.6819094356457729, "grad_norm": 0.1663079708814621, "learning_rate": 1.7676221160488426e-05, "loss": 0.4961, "step": 3057 }, { "epoch": 0.6821325005576623, "grad_norm": 0.1598016768693924, "learning_rate": 1.7674712628758603e-05, "loss": 0.5201, "step": 3058 }, { "epoch": 0.6823555654695517, "grad_norm": 0.18910476565361023, "learning_rate": 1.767320367195492e-05, "loss": 0.5203, "step": 3059 }, { "epoch": 0.682578630381441, "grad_norm": 0.1525489091873169, "learning_rate": 1.767169429016095e-05, "loss": 0.4609, "step": 3060 }, { "epoch": 0.6828016952933303, "grad_norm": 0.22233475744724274, "learning_rate": 1.7670184483460296e-05, "loss": 0.4936, "step": 3061 }, { "epoch": 0.6830247602052197, "grad_norm": 0.16714778542518616, "learning_rate": 1.766867425193658e-05, "loss": 0.5248, "step": 3062 }, { "epoch": 0.6832478251171091, "grad_norm": 0.1673208624124527, "learning_rate": 1.766716359567344e-05, "loss": 0.5056, "step": 3063 }, { "epoch": 0.6834708900289984, "grad_norm": 0.1797683835029602, "learning_rate": 1.7665652514754554e-05, "loss": 0.5039, "step": 3064 }, { "epoch": 0.6836939549408878, "grad_norm": 0.1645476371049881, "learning_rate": 1.7664141009263614e-05, "loss": 0.4907, "step": 3065 }, { "epoch": 0.6839170198527772, "grad_norm": 0.15605735778808594, "learning_rate": 1.7662629079284336e-05, "loss": 0.4821, "step": 3066 }, { "epoch": 0.6841400847646665, "grad_norm": 0.16608816385269165, "learning_rate": 1.7661116724900456e-05, "loss": 0.5028, "step": 3067 }, { "epoch": 0.6843631496765559, "grad_norm": 0.16926267743110657, "learning_rate": 1.7659603946195746e-05, "loss": 0.4933, "step": 3068 }, { "epoch": 0.6845862145884453, "grad_norm": 0.15846210718154907, "learning_rate": 1.7658090743253985e-05, "loss": 0.4991, "step": 3069 }, { "epoch": 0.6848092795003345, "grad_norm": 0.16848038136959076, "learning_rate": 1.7656577116158988e-05, "loss": 0.5002, "step": 3070 }, { "epoch": 0.6850323444122239, "grad_norm": 0.16113083064556122, "learning_rate": 1.765506306499459e-05, "loss": 0.4999, "step": 3071 }, { "epoch": 0.6852554093241133, "grad_norm": 0.1538555920124054, "learning_rate": 1.7653548589844648e-05, "loss": 0.4812, "step": 3072 }, { "epoch": 0.6854784742360027, "grad_norm": 0.16473166644573212, "learning_rate": 1.765203369079304e-05, "loss": 0.4885, "step": 3073 }, { "epoch": 0.685701539147892, "grad_norm": 0.15544745326042175, "learning_rate": 1.765051836792367e-05, "loss": 0.4901, "step": 3074 }, { "epoch": 0.6859246040597814, "grad_norm": 0.1728777289390564, "learning_rate": 1.764900262132048e-05, "loss": 0.4578, "step": 3075 }, { "epoch": 0.6861476689716708, "grad_norm": 0.16069582104682922, "learning_rate": 1.76474864510674e-05, "loss": 0.4904, "step": 3076 }, { "epoch": 0.6863707338835601, "grad_norm": 0.15537656843662262, "learning_rate": 1.764596985724842e-05, "loss": 0.4899, "step": 3077 }, { "epoch": 0.6865937987954495, "grad_norm": 0.16955626010894775, "learning_rate": 1.7644452839947536e-05, "loss": 0.5011, "step": 3078 }, { "epoch": 0.6868168637073389, "grad_norm": 0.16193504631519318, "learning_rate": 1.7642935399248765e-05, "loss": 0.5128, "step": 3079 }, { "epoch": 0.6870399286192282, "grad_norm": 0.1631263643503189, "learning_rate": 1.7641417535236155e-05, "loss": 0.5026, "step": 3080 }, { "epoch": 0.6872629935311175, "grad_norm": 0.18346811830997467, "learning_rate": 1.7639899247993775e-05, "loss": 0.4857, "step": 3081 }, { "epoch": 0.6874860584430069, "grad_norm": 0.16757658123970032, "learning_rate": 1.7638380537605722e-05, "loss": 0.4935, "step": 3082 }, { "epoch": 0.6877091233548963, "grad_norm": 0.18027780950069427, "learning_rate": 1.7636861404156106e-05, "loss": 0.5448, "step": 3083 }, { "epoch": 0.6879321882667856, "grad_norm": 0.17672498524188995, "learning_rate": 1.763534184772907e-05, "loss": 0.5054, "step": 3084 }, { "epoch": 0.688155253178675, "grad_norm": 0.15495164692401886, "learning_rate": 1.763382186840877e-05, "loss": 0.4778, "step": 3085 }, { "epoch": 0.6883783180905644, "grad_norm": 0.16750416159629822, "learning_rate": 1.76323014662794e-05, "loss": 0.5147, "step": 3086 }, { "epoch": 0.6886013830024537, "grad_norm": 0.16243582963943481, "learning_rate": 1.763078064142516e-05, "loss": 0.5029, "step": 3087 }, { "epoch": 0.688824447914343, "grad_norm": 0.1765555739402771, "learning_rate": 1.7629259393930292e-05, "loss": 0.5402, "step": 3088 }, { "epoch": 0.6890475128262324, "grad_norm": 0.17327693104743958, "learning_rate": 1.7627737723879048e-05, "loss": 0.5223, "step": 3089 }, { "epoch": 0.6892705777381218, "grad_norm": 0.16204944252967834, "learning_rate": 1.762621563135571e-05, "loss": 0.4926, "step": 3090 }, { "epoch": 0.6894936426500111, "grad_norm": 0.16001787781715393, "learning_rate": 1.762469311644458e-05, "loss": 0.5038, "step": 3091 }, { "epoch": 0.6897167075619005, "grad_norm": 0.15672995150089264, "learning_rate": 1.7623170179229982e-05, "loss": 0.4752, "step": 3092 }, { "epoch": 0.6899397724737899, "grad_norm": 0.16134774684906006, "learning_rate": 1.7621646819796264e-05, "loss": 0.4911, "step": 3093 }, { "epoch": 0.6901628373856792, "grad_norm": 0.18240399658679962, "learning_rate": 1.762012303822781e-05, "loss": 0.4956, "step": 3094 }, { "epoch": 0.6903859022975686, "grad_norm": 0.16389527916908264, "learning_rate": 1.761859883460901e-05, "loss": 0.4713, "step": 3095 }, { "epoch": 0.690608967209458, "grad_norm": 0.16408023238182068, "learning_rate": 1.761707420902428e-05, "loss": 0.4925, "step": 3096 }, { "epoch": 0.6908320321213474, "grad_norm": 0.16707156598567963, "learning_rate": 1.761554916155807e-05, "loss": 0.5094, "step": 3097 }, { "epoch": 0.6910550970332366, "grad_norm": 0.1580830216407776, "learning_rate": 1.7614023692294838e-05, "loss": 0.4896, "step": 3098 }, { "epoch": 0.691278161945126, "grad_norm": 0.1949886530637741, "learning_rate": 1.7612497801319084e-05, "loss": 0.4919, "step": 3099 }, { "epoch": 0.6915012268570154, "grad_norm": 0.1662502884864807, "learning_rate": 1.7610971488715315e-05, "loss": 0.5104, "step": 3100 }, { "epoch": 0.6917242917689047, "grad_norm": 0.15772640705108643, "learning_rate": 1.760944475456807e-05, "loss": 0.4699, "step": 3101 }, { "epoch": 0.6919473566807941, "grad_norm": 0.17815501987934113, "learning_rate": 1.760791759896191e-05, "loss": 0.5133, "step": 3102 }, { "epoch": 0.6921704215926835, "grad_norm": 0.16980375349521637, "learning_rate": 1.760639002198142e-05, "loss": 0.4874, "step": 3103 }, { "epoch": 0.6923934865045729, "grad_norm": 0.16098888218402863, "learning_rate": 1.7604862023711204e-05, "loss": 0.508, "step": 3104 }, { "epoch": 0.6926165514164622, "grad_norm": 0.16175471246242523, "learning_rate": 1.760333360423589e-05, "loss": 0.4963, "step": 3105 }, { "epoch": 0.6928396163283516, "grad_norm": 0.1580919772386551, "learning_rate": 1.7601804763640137e-05, "loss": 0.4674, "step": 3106 }, { "epoch": 0.693062681240241, "grad_norm": 0.15538008511066437, "learning_rate": 1.7600275502008618e-05, "loss": 0.4907, "step": 3107 }, { "epoch": 0.6932857461521302, "grad_norm": 0.15990300476551056, "learning_rate": 1.7598745819426034e-05, "loss": 0.5035, "step": 3108 }, { "epoch": 0.6935088110640196, "grad_norm": 0.1725892871618271, "learning_rate": 1.759721571597711e-05, "loss": 0.5196, "step": 3109 }, { "epoch": 0.693731875975909, "grad_norm": 0.16250944137573242, "learning_rate": 1.7595685191746586e-05, "loss": 0.4907, "step": 3110 }, { "epoch": 0.6939549408877983, "grad_norm": 0.16884207725524902, "learning_rate": 1.759415424681924e-05, "loss": 0.535, "step": 3111 }, { "epoch": 0.6941780057996877, "grad_norm": 0.1701684594154358, "learning_rate": 1.7592622881279867e-05, "loss": 0.4878, "step": 3112 }, { "epoch": 0.6944010707115771, "grad_norm": 0.17077569663524628, "learning_rate": 1.7591091095213277e-05, "loss": 0.4851, "step": 3113 }, { "epoch": 0.6946241356234665, "grad_norm": 0.33712977170944214, "learning_rate": 1.758955888870431e-05, "loss": 0.476, "step": 3114 }, { "epoch": 0.6948472005353558, "grad_norm": 0.16534623503684998, "learning_rate": 1.7588026261837833e-05, "loss": 0.4962, "step": 3115 }, { "epoch": 0.6950702654472452, "grad_norm": 0.17375873029232025, "learning_rate": 1.758649321469873e-05, "loss": 0.4883, "step": 3116 }, { "epoch": 0.6952933303591345, "grad_norm": 0.16607031226158142, "learning_rate": 1.758495974737191e-05, "loss": 0.4727, "step": 3117 }, { "epoch": 0.6955163952710238, "grad_norm": 0.16579467058181763, "learning_rate": 1.7583425859942312e-05, "loss": 0.5156, "step": 3118 }, { "epoch": 0.6957394601829132, "grad_norm": 0.16067442297935486, "learning_rate": 1.7581891552494886e-05, "loss": 0.4797, "step": 3119 }, { "epoch": 0.6959625250948026, "grad_norm": 0.16372907161712646, "learning_rate": 1.7580356825114616e-05, "loss": 0.4918, "step": 3120 }, { "epoch": 0.696185590006692, "grad_norm": 0.1595798134803772, "learning_rate": 1.75788216778865e-05, "loss": 0.4963, "step": 3121 }, { "epoch": 0.6964086549185813, "grad_norm": 0.16394098103046417, "learning_rate": 1.757728611089557e-05, "loss": 0.5189, "step": 3122 }, { "epoch": 0.6966317198304707, "grad_norm": 0.16349278390407562, "learning_rate": 1.757575012422687e-05, "loss": 0.5392, "step": 3123 }, { "epoch": 0.6968547847423601, "grad_norm": 0.16474361717700958, "learning_rate": 1.7574213717965473e-05, "loss": 0.498, "step": 3124 }, { "epoch": 0.6970778496542493, "grad_norm": 0.16759975254535675, "learning_rate": 1.757267689219648e-05, "loss": 0.4909, "step": 3125 }, { "epoch": 0.6973009145661387, "grad_norm": 0.18048445880413055, "learning_rate": 1.7571139647005004e-05, "loss": 0.5313, "step": 3126 }, { "epoch": 0.6975239794780281, "grad_norm": 0.21055570244789124, "learning_rate": 1.7569601982476194e-05, "loss": 0.4773, "step": 3127 }, { "epoch": 0.6977470443899174, "grad_norm": 0.16457606852054596, "learning_rate": 1.7568063898695205e-05, "loss": 0.4817, "step": 3128 }, { "epoch": 0.6979701093018068, "grad_norm": 0.15786704421043396, "learning_rate": 1.7566525395747237e-05, "loss": 0.4793, "step": 3129 }, { "epoch": 0.6981931742136962, "grad_norm": 0.17408685386180878, "learning_rate": 1.7564986473717498e-05, "loss": 0.5238, "step": 3130 }, { "epoch": 0.6984162391255856, "grad_norm": 0.1875070482492447, "learning_rate": 1.7563447132691222e-05, "loss": 0.5133, "step": 3131 }, { "epoch": 0.6986393040374749, "grad_norm": 0.15555784106254578, "learning_rate": 1.7561907372753665e-05, "loss": 0.4851, "step": 3132 }, { "epoch": 0.6988623689493643, "grad_norm": 0.15845829248428345, "learning_rate": 1.756036719399011e-05, "loss": 0.4989, "step": 3133 }, { "epoch": 0.6990854338612537, "grad_norm": 0.20709070563316345, "learning_rate": 1.7558826596485866e-05, "loss": 0.4939, "step": 3134 }, { "epoch": 0.6993084987731429, "grad_norm": 0.17586678266525269, "learning_rate": 1.755728558032626e-05, "loss": 0.5028, "step": 3135 }, { "epoch": 0.6995315636850323, "grad_norm": 0.16700804233551025, "learning_rate": 1.7555744145596638e-05, "loss": 0.4955, "step": 3136 }, { "epoch": 0.6997546285969217, "grad_norm": 0.17724819481372833, "learning_rate": 1.755420229238238e-05, "loss": 0.4571, "step": 3137 }, { "epoch": 0.6999776935088111, "grad_norm": 0.17517662048339844, "learning_rate": 1.755266002076888e-05, "loss": 0.5144, "step": 3138 }, { "epoch": 0.7002007584207004, "grad_norm": 0.16020509600639343, "learning_rate": 1.755111733084156e-05, "loss": 0.4523, "step": 3139 }, { "epoch": 0.7004238233325898, "grad_norm": 0.15964041650295258, "learning_rate": 1.7549574222685864e-05, "loss": 0.4695, "step": 3140 }, { "epoch": 0.7006468882444792, "grad_norm": 0.17395976185798645, "learning_rate": 1.754803069638726e-05, "loss": 0.515, "step": 3141 }, { "epoch": 0.7008699531563685, "grad_norm": 0.15984676778316498, "learning_rate": 1.7546486752031237e-05, "loss": 0.4909, "step": 3142 }, { "epoch": 0.7010930180682579, "grad_norm": 0.16784314811229706, "learning_rate": 1.7544942389703305e-05, "loss": 0.4973, "step": 3143 }, { "epoch": 0.7013160829801472, "grad_norm": 0.16454213857650757, "learning_rate": 1.754339760948901e-05, "loss": 0.503, "step": 3144 }, { "epoch": 0.7015391478920365, "grad_norm": 0.1578417867422104, "learning_rate": 1.7541852411473902e-05, "loss": 0.4632, "step": 3145 }, { "epoch": 0.7017622128039259, "grad_norm": 0.16670000553131104, "learning_rate": 1.7540306795743566e-05, "loss": 0.4804, "step": 3146 }, { "epoch": 0.7019852777158153, "grad_norm": 0.15188740193843842, "learning_rate": 1.753876076238361e-05, "loss": 0.4884, "step": 3147 }, { "epoch": 0.7022083426277047, "grad_norm": 0.16512979567050934, "learning_rate": 1.7537214311479663e-05, "loss": 0.5002, "step": 3148 }, { "epoch": 0.702431407539594, "grad_norm": 0.15965288877487183, "learning_rate": 1.7535667443117377e-05, "loss": 0.4885, "step": 3149 }, { "epoch": 0.7026544724514834, "grad_norm": 0.1552652269601822, "learning_rate": 1.7534120157382425e-05, "loss": 0.4955, "step": 3150 }, { "epoch": 0.7028775373633728, "grad_norm": 0.16739974915981293, "learning_rate": 1.7532572454360506e-05, "loss": 0.4902, "step": 3151 }, { "epoch": 0.703100602275262, "grad_norm": 0.18262702226638794, "learning_rate": 1.7531024334137348e-05, "loss": 0.5283, "step": 3152 }, { "epoch": 0.7033236671871514, "grad_norm": 0.15477107465267181, "learning_rate": 1.7529475796798686e-05, "loss": 0.4969, "step": 3153 }, { "epoch": 0.7035467320990408, "grad_norm": 0.16841940581798553, "learning_rate": 1.7527926842430295e-05, "loss": 0.5351, "step": 3154 }, { "epoch": 0.7037697970109302, "grad_norm": 0.17480523884296417, "learning_rate": 1.7526377471117963e-05, "loss": 0.5279, "step": 3155 }, { "epoch": 0.7039928619228195, "grad_norm": 0.1635589897632599, "learning_rate": 1.75248276829475e-05, "loss": 0.5076, "step": 3156 }, { "epoch": 0.7042159268347089, "grad_norm": 0.18227113783359528, "learning_rate": 1.7523277478004747e-05, "loss": 0.4927, "step": 3157 }, { "epoch": 0.7044389917465983, "grad_norm": 0.17305073142051697, "learning_rate": 1.7521726856375568e-05, "loss": 0.4967, "step": 3158 }, { "epoch": 0.7046620566584876, "grad_norm": 0.16051623225212097, "learning_rate": 1.7520175818145838e-05, "loss": 0.5031, "step": 3159 }, { "epoch": 0.704885121570377, "grad_norm": 0.16289980709552765, "learning_rate": 1.751862436340147e-05, "loss": 0.5101, "step": 3160 }, { "epoch": 0.7051081864822664, "grad_norm": 0.16630062460899353, "learning_rate": 1.751707249222839e-05, "loss": 0.4928, "step": 3161 }, { "epoch": 0.7053312513941556, "grad_norm": 0.154288649559021, "learning_rate": 1.7515520204712552e-05, "loss": 0.4921, "step": 3162 }, { "epoch": 0.705554316306045, "grad_norm": 0.1542695015668869, "learning_rate": 1.751396750093993e-05, "loss": 0.4929, "step": 3163 }, { "epoch": 0.7057773812179344, "grad_norm": 0.1700357347726822, "learning_rate": 1.7512414380996524e-05, "loss": 0.5014, "step": 3164 }, { "epoch": 0.7060004461298238, "grad_norm": 0.1531635820865631, "learning_rate": 1.7510860844968355e-05, "loss": 0.4764, "step": 3165 }, { "epoch": 0.7062235110417131, "grad_norm": 0.1919977366924286, "learning_rate": 1.7509306892941464e-05, "loss": 0.5453, "step": 3166 }, { "epoch": 0.7064465759536025, "grad_norm": 0.16512922942638397, "learning_rate": 1.7507752525001924e-05, "loss": 0.5015, "step": 3167 }, { "epoch": 0.7066696408654919, "grad_norm": 0.1677495837211609, "learning_rate": 1.7506197741235822e-05, "loss": 0.5164, "step": 3168 }, { "epoch": 0.7068927057773812, "grad_norm": 0.15319664776325226, "learning_rate": 1.7504642541729273e-05, "loss": 0.4815, "step": 3169 }, { "epoch": 0.7071157706892706, "grad_norm": 0.16099566221237183, "learning_rate": 1.7503086926568416e-05, "loss": 0.4966, "step": 3170 }, { "epoch": 0.70733883560116, "grad_norm": 0.15989680588245392, "learning_rate": 1.750153089583941e-05, "loss": 0.4949, "step": 3171 }, { "epoch": 0.7075619005130493, "grad_norm": 0.16961508989334106, "learning_rate": 1.7499974449628433e-05, "loss": 0.4916, "step": 3172 }, { "epoch": 0.7077849654249386, "grad_norm": 0.1632029265165329, "learning_rate": 1.74984175880217e-05, "loss": 0.4921, "step": 3173 }, { "epoch": 0.708008030336828, "grad_norm": 0.1647975593805313, "learning_rate": 1.7496860311105426e-05, "loss": 0.4635, "step": 3174 }, { "epoch": 0.7082310952487174, "grad_norm": 0.15906676650047302, "learning_rate": 1.7495302618965874e-05, "loss": 0.4686, "step": 3175 }, { "epoch": 0.7084541601606067, "grad_norm": 0.16585782170295715, "learning_rate": 1.7493744511689316e-05, "loss": 0.5159, "step": 3176 }, { "epoch": 0.7086772250724961, "grad_norm": 0.16121666133403778, "learning_rate": 1.7492185989362052e-05, "loss": 0.5, "step": 3177 }, { "epoch": 0.7089002899843855, "grad_norm": 0.1639820635318756, "learning_rate": 1.7490627052070394e-05, "loss": 0.4959, "step": 3178 }, { "epoch": 0.7091233548962749, "grad_norm": 0.17450456321239471, "learning_rate": 1.74890676999007e-05, "loss": 0.4932, "step": 3179 }, { "epoch": 0.7093464198081642, "grad_norm": 0.1635378748178482, "learning_rate": 1.7487507932939324e-05, "loss": 0.5107, "step": 3180 }, { "epoch": 0.7095694847200535, "grad_norm": 0.16000831127166748, "learning_rate": 1.7485947751272657e-05, "loss": 0.462, "step": 3181 }, { "epoch": 0.7097925496319429, "grad_norm": 0.16015185415744781, "learning_rate": 1.748438715498712e-05, "loss": 0.4934, "step": 3182 }, { "epoch": 0.7100156145438322, "grad_norm": 0.1680067628622055, "learning_rate": 1.7482826144169144e-05, "loss": 0.5074, "step": 3183 }, { "epoch": 0.7102386794557216, "grad_norm": 0.16120266914367676, "learning_rate": 1.7481264718905187e-05, "loss": 0.4853, "step": 3184 }, { "epoch": 0.710461744367611, "grad_norm": 0.1535530686378479, "learning_rate": 1.747970287928173e-05, "loss": 0.4722, "step": 3185 }, { "epoch": 0.7106848092795003, "grad_norm": 0.1595955789089203, "learning_rate": 1.747814062538528e-05, "loss": 0.5031, "step": 3186 }, { "epoch": 0.7109078741913897, "grad_norm": 0.16170883178710938, "learning_rate": 1.7476577957302358e-05, "loss": 0.4947, "step": 3187 }, { "epoch": 0.7111309391032791, "grad_norm": 0.16378958523273468, "learning_rate": 1.747501487511952e-05, "loss": 0.4722, "step": 3188 }, { "epoch": 0.7113540040151685, "grad_norm": 0.17210274934768677, "learning_rate": 1.7473451378923344e-05, "loss": 0.5183, "step": 3189 }, { "epoch": 0.7115770689270577, "grad_norm": 0.15549586713314056, "learning_rate": 1.7471887468800416e-05, "loss": 0.48, "step": 3190 }, { "epoch": 0.7118001338389471, "grad_norm": 0.17286068201065063, "learning_rate": 1.747032314483736e-05, "loss": 0.4768, "step": 3191 }, { "epoch": 0.7120231987508365, "grad_norm": 0.1790837049484253, "learning_rate": 1.746875840712082e-05, "loss": 0.5181, "step": 3192 }, { "epoch": 0.7122462636627258, "grad_norm": 0.1648358702659607, "learning_rate": 1.746719325573746e-05, "loss": 0.508, "step": 3193 }, { "epoch": 0.7124693285746152, "grad_norm": 0.16890966892242432, "learning_rate": 1.7465627690773964e-05, "loss": 0.4928, "step": 3194 }, { "epoch": 0.7126923934865046, "grad_norm": 0.1561020165681839, "learning_rate": 1.7464061712317047e-05, "loss": 0.4783, "step": 3195 }, { "epoch": 0.712915458398394, "grad_norm": 0.17456288635730743, "learning_rate": 1.7462495320453442e-05, "loss": 0.5203, "step": 3196 }, { "epoch": 0.7131385233102833, "grad_norm": 0.16500715911388397, "learning_rate": 1.7460928515269902e-05, "loss": 0.5259, "step": 3197 }, { "epoch": 0.7133615882221727, "grad_norm": 0.1663879156112671, "learning_rate": 1.7459361296853217e-05, "loss": 0.5204, "step": 3198 }, { "epoch": 0.713584653134062, "grad_norm": 0.1611357033252716, "learning_rate": 1.745779366529018e-05, "loss": 0.498, "step": 3199 }, { "epoch": 0.7138077180459513, "grad_norm": 0.1556338518857956, "learning_rate": 1.7456225620667613e-05, "loss": 0.4753, "step": 3200 }, { "epoch": 0.7140307829578407, "grad_norm": 0.16986200213432312, "learning_rate": 1.7454657163072372e-05, "loss": 0.5075, "step": 3201 }, { "epoch": 0.7142538478697301, "grad_norm": 0.16040728986263275, "learning_rate": 1.7453088292591327e-05, "loss": 0.5016, "step": 3202 }, { "epoch": 0.7144769127816194, "grad_norm": 0.16356582939624786, "learning_rate": 1.7451519009311368e-05, "loss": 0.5118, "step": 3203 }, { "epoch": 0.7146999776935088, "grad_norm": 0.16113781929016113, "learning_rate": 1.744994931331942e-05, "loss": 0.4869, "step": 3204 }, { "epoch": 0.7149230426053982, "grad_norm": 0.17623494565486908, "learning_rate": 1.744837920470241e-05, "loss": 0.5247, "step": 3205 }, { "epoch": 0.7151461075172876, "grad_norm": 0.16005289554595947, "learning_rate": 1.744680868354731e-05, "loss": 0.4722, "step": 3206 }, { "epoch": 0.7153691724291769, "grad_norm": 0.18001532554626465, "learning_rate": 1.7445237749941106e-05, "loss": 0.4879, "step": 3207 }, { "epoch": 0.7155922373410663, "grad_norm": 0.1617603451013565, "learning_rate": 1.74436664039708e-05, "loss": 0.4962, "step": 3208 }, { "epoch": 0.7158153022529556, "grad_norm": 0.16146957874298096, "learning_rate": 1.7442094645723425e-05, "loss": 0.4849, "step": 3209 }, { "epoch": 0.7160383671648449, "grad_norm": 0.16159506142139435, "learning_rate": 1.744052247528604e-05, "loss": 0.4814, "step": 3210 }, { "epoch": 0.7162614320767343, "grad_norm": 0.16017811000347137, "learning_rate": 1.7438949892745717e-05, "loss": 0.4779, "step": 3211 }, { "epoch": 0.7164844969886237, "grad_norm": 0.16643457114696503, "learning_rate": 1.7437376898189554e-05, "loss": 0.5058, "step": 3212 }, { "epoch": 0.7167075619005131, "grad_norm": 0.174330934882164, "learning_rate": 1.7435803491704674e-05, "loss": 0.5037, "step": 3213 }, { "epoch": 0.7169306268124024, "grad_norm": 0.16516099870204926, "learning_rate": 1.7434229673378226e-05, "loss": 0.4929, "step": 3214 }, { "epoch": 0.7171536917242918, "grad_norm": 0.16369006037712097, "learning_rate": 1.7432655443297377e-05, "loss": 0.4788, "step": 3215 }, { "epoch": 0.7173767566361812, "grad_norm": 0.19648444652557373, "learning_rate": 1.7431080801549313e-05, "loss": 0.4808, "step": 3216 }, { "epoch": 0.7175998215480704, "grad_norm": 0.16290031373500824, "learning_rate": 1.742950574822125e-05, "loss": 0.4811, "step": 3217 }, { "epoch": 0.7178228864599598, "grad_norm": 0.15613336861133575, "learning_rate": 1.7427930283400428e-05, "loss": 0.4948, "step": 3218 }, { "epoch": 0.7180459513718492, "grad_norm": 0.17377032339572906, "learning_rate": 1.7426354407174102e-05, "loss": 0.5127, "step": 3219 }, { "epoch": 0.7182690162837385, "grad_norm": 0.16334526240825653, "learning_rate": 1.7424778119629556e-05, "loss": 0.4865, "step": 3220 }, { "epoch": 0.7184920811956279, "grad_norm": 0.15451814234256744, "learning_rate": 1.7423201420854092e-05, "loss": 0.4711, "step": 3221 }, { "epoch": 0.7187151461075173, "grad_norm": 0.17903545498847961, "learning_rate": 1.7421624310935043e-05, "loss": 0.5265, "step": 3222 }, { "epoch": 0.7189382110194067, "grad_norm": 0.15851649641990662, "learning_rate": 1.7420046789959754e-05, "loss": 0.5125, "step": 3223 }, { "epoch": 0.719161275931296, "grad_norm": 0.15990275144577026, "learning_rate": 1.74184688580156e-05, "loss": 0.4983, "step": 3224 }, { "epoch": 0.7193843408431854, "grad_norm": 0.16905120015144348, "learning_rate": 1.7416890515189977e-05, "loss": 0.514, "step": 3225 }, { "epoch": 0.7196074057550748, "grad_norm": 0.16614043712615967, "learning_rate": 1.74153117615703e-05, "loss": 0.5071, "step": 3226 }, { "epoch": 0.719830470666964, "grad_norm": 0.1659470647573471, "learning_rate": 1.741373259724402e-05, "loss": 0.5101, "step": 3227 }, { "epoch": 0.7200535355788534, "grad_norm": 0.17081058025360107, "learning_rate": 1.7412153022298587e-05, "loss": 0.5345, "step": 3228 }, { "epoch": 0.7202766004907428, "grad_norm": 0.17581294476985931, "learning_rate": 1.74105730368215e-05, "loss": 0.5119, "step": 3229 }, { "epoch": 0.7204996654026322, "grad_norm": 0.15835914015769958, "learning_rate": 1.7408992640900263e-05, "loss": 0.5, "step": 3230 }, { "epoch": 0.7207227303145215, "grad_norm": 0.16518062353134155, "learning_rate": 1.740741183462241e-05, "loss": 0.4791, "step": 3231 }, { "epoch": 0.7209457952264109, "grad_norm": 0.19645792245864868, "learning_rate": 1.7405830618075494e-05, "loss": 0.4851, "step": 3232 }, { "epoch": 0.7211688601383003, "grad_norm": 0.1701032668352127, "learning_rate": 1.7404248991347093e-05, "loss": 0.5165, "step": 3233 }, { "epoch": 0.7213919250501896, "grad_norm": 0.16348305344581604, "learning_rate": 1.740266695452481e-05, "loss": 0.5236, "step": 3234 }, { "epoch": 0.721614989962079, "grad_norm": 0.1572495847940445, "learning_rate": 1.7401084507696263e-05, "loss": 0.5141, "step": 3235 }, { "epoch": 0.7218380548739683, "grad_norm": 0.16454187035560608, "learning_rate": 1.7399501650949107e-05, "loss": 0.4714, "step": 3236 }, { "epoch": 0.7220611197858576, "grad_norm": 0.17999160289764404, "learning_rate": 1.7397918384371003e-05, "loss": 0.5035, "step": 3237 }, { "epoch": 0.722284184697747, "grad_norm": 0.16409459710121155, "learning_rate": 1.739633470804964e-05, "loss": 0.4903, "step": 3238 }, { "epoch": 0.7225072496096364, "grad_norm": 0.15939798951148987, "learning_rate": 1.739475062207274e-05, "loss": 0.4968, "step": 3239 }, { "epoch": 0.7227303145215258, "grad_norm": 0.1598650962114334, "learning_rate": 1.7393166126528035e-05, "loss": 0.4829, "step": 3240 }, { "epoch": 0.7229533794334151, "grad_norm": 0.1741870641708374, "learning_rate": 1.7391581221503286e-05, "loss": 0.5268, "step": 3241 }, { "epoch": 0.7231764443453045, "grad_norm": 0.16262130439281464, "learning_rate": 1.7389995907086273e-05, "loss": 0.4814, "step": 3242 }, { "epoch": 0.7233995092571939, "grad_norm": 0.16130922734737396, "learning_rate": 1.73884101833648e-05, "loss": 0.48, "step": 3243 }, { "epoch": 0.7236225741690832, "grad_norm": 0.167790949344635, "learning_rate": 1.7386824050426697e-05, "loss": 0.4774, "step": 3244 }, { "epoch": 0.7238456390809725, "grad_norm": 0.16130533814430237, "learning_rate": 1.7385237508359812e-05, "loss": 0.4765, "step": 3245 }, { "epoch": 0.7240687039928619, "grad_norm": 0.16216439008712769, "learning_rate": 1.7383650557252023e-05, "loss": 0.5197, "step": 3246 }, { "epoch": 0.7242917689047513, "grad_norm": 0.17525294423103333, "learning_rate": 1.7382063197191218e-05, "loss": 0.5063, "step": 3247 }, { "epoch": 0.7245148338166406, "grad_norm": 0.16659271717071533, "learning_rate": 1.738047542826532e-05, "loss": 0.5077, "step": 3248 }, { "epoch": 0.72473789872853, "grad_norm": 0.15952268242835999, "learning_rate": 1.7378887250562268e-05, "loss": 0.4991, "step": 3249 }, { "epoch": 0.7249609636404194, "grad_norm": 0.16580569744110107, "learning_rate": 1.737729866417002e-05, "loss": 0.5026, "step": 3250 }, { "epoch": 0.7251840285523087, "grad_norm": 0.16428150236606598, "learning_rate": 1.7375709669176572e-05, "loss": 0.4993, "step": 3251 }, { "epoch": 0.7254070934641981, "grad_norm": 0.17543014883995056, "learning_rate": 1.7374120265669927e-05, "loss": 0.483, "step": 3252 }, { "epoch": 0.7256301583760875, "grad_norm": 0.16254562139511108, "learning_rate": 1.7372530453738113e-05, "loss": 0.5051, "step": 3253 }, { "epoch": 0.7258532232879769, "grad_norm": 0.16587981581687927, "learning_rate": 1.737094023346919e-05, "loss": 0.4717, "step": 3254 }, { "epoch": 0.7260762881998661, "grad_norm": 0.15644344687461853, "learning_rate": 1.7369349604951233e-05, "loss": 0.4918, "step": 3255 }, { "epoch": 0.7262993531117555, "grad_norm": 0.16663451492786407, "learning_rate": 1.736775856827234e-05, "loss": 0.4973, "step": 3256 }, { "epoch": 0.7265224180236449, "grad_norm": 0.16686449944972992, "learning_rate": 1.736616712352063e-05, "loss": 0.5115, "step": 3257 }, { "epoch": 0.7267454829355342, "grad_norm": 0.15312035381793976, "learning_rate": 1.736457527078425e-05, "loss": 0.4498, "step": 3258 }, { "epoch": 0.7269685478474236, "grad_norm": 0.17026790976524353, "learning_rate": 1.7362983010151368e-05, "loss": 0.4958, "step": 3259 }, { "epoch": 0.727191612759313, "grad_norm": 0.16290387511253357, "learning_rate": 1.7361390341710173e-05, "loss": 0.4785, "step": 3260 }, { "epoch": 0.7274146776712023, "grad_norm": 0.15380245447158813, "learning_rate": 1.7359797265548876e-05, "loss": 0.466, "step": 3261 }, { "epoch": 0.7276377425830917, "grad_norm": 0.1699349284172058, "learning_rate": 1.7358203781755707e-05, "loss": 0.4858, "step": 3262 }, { "epoch": 0.727860807494981, "grad_norm": 0.16431613266468048, "learning_rate": 1.735660989041893e-05, "loss": 0.5011, "step": 3263 }, { "epoch": 0.7280838724068704, "grad_norm": 0.16305620968341827, "learning_rate": 1.735501559162682e-05, "loss": 0.5167, "step": 3264 }, { "epoch": 0.7283069373187597, "grad_norm": 0.15818987786769867, "learning_rate": 1.7353420885467688e-05, "loss": 0.4953, "step": 3265 }, { "epoch": 0.7285300022306491, "grad_norm": 0.164206400513649, "learning_rate": 1.7351825772029847e-05, "loss": 0.5128, "step": 3266 }, { "epoch": 0.7287530671425385, "grad_norm": 0.15800419449806213, "learning_rate": 1.7350230251401653e-05, "loss": 0.4654, "step": 3267 }, { "epoch": 0.7289761320544278, "grad_norm": 0.16068902611732483, "learning_rate": 1.734863432367147e-05, "loss": 0.4997, "step": 3268 }, { "epoch": 0.7291991969663172, "grad_norm": 0.16787229478359222, "learning_rate": 1.7347037988927696e-05, "loss": 0.5244, "step": 3269 }, { "epoch": 0.7294222618782066, "grad_norm": 0.1621689349412918, "learning_rate": 1.7345441247258743e-05, "loss": 0.4855, "step": 3270 }, { "epoch": 0.729645326790096, "grad_norm": 0.16722342371940613, "learning_rate": 1.734384409875305e-05, "loss": 0.4992, "step": 3271 }, { "epoch": 0.7298683917019853, "grad_norm": 0.15855719149112701, "learning_rate": 1.7342246543499074e-05, "loss": 0.5074, "step": 3272 }, { "epoch": 0.7300914566138746, "grad_norm": 0.17619627714157104, "learning_rate": 1.7340648581585296e-05, "loss": 0.4834, "step": 3273 }, { "epoch": 0.730314521525764, "grad_norm": 0.17650535702705383, "learning_rate": 1.7339050213100233e-05, "loss": 0.5393, "step": 3274 }, { "epoch": 0.7305375864376533, "grad_norm": 0.17073291540145874, "learning_rate": 1.73374514381324e-05, "loss": 0.4873, "step": 3275 }, { "epoch": 0.7307606513495427, "grad_norm": 0.1601572334766388, "learning_rate": 1.733585225677035e-05, "loss": 0.485, "step": 3276 }, { "epoch": 0.7309837162614321, "grad_norm": 0.1590786874294281, "learning_rate": 1.7334252669102665e-05, "loss": 0.4668, "step": 3277 }, { "epoch": 0.7312067811733214, "grad_norm": 0.15466643869876862, "learning_rate": 1.7332652675217928e-05, "loss": 0.4476, "step": 3278 }, { "epoch": 0.7314298460852108, "grad_norm": 0.15521705150604248, "learning_rate": 1.733105227520476e-05, "loss": 0.4878, "step": 3279 }, { "epoch": 0.7316529109971002, "grad_norm": 0.21078747510910034, "learning_rate": 1.7329451469151807e-05, "loss": 0.5203, "step": 3280 }, { "epoch": 0.7318759759089896, "grad_norm": 0.1662643849849701, "learning_rate": 1.7327850257147724e-05, "loss": 0.5099, "step": 3281 }, { "epoch": 0.7320990408208788, "grad_norm": 0.16159231960773468, "learning_rate": 1.73262486392812e-05, "loss": 0.4944, "step": 3282 }, { "epoch": 0.7323221057327682, "grad_norm": 0.16965213418006897, "learning_rate": 1.7324646615640947e-05, "loss": 0.4966, "step": 3283 }, { "epoch": 0.7325451706446576, "grad_norm": 0.1666761040687561, "learning_rate": 1.732304418631569e-05, "loss": 0.4846, "step": 3284 }, { "epoch": 0.7327682355565469, "grad_norm": 0.16207394003868103, "learning_rate": 1.7321441351394178e-05, "loss": 0.5124, "step": 3285 }, { "epoch": 0.7329913004684363, "grad_norm": 0.15828841924667358, "learning_rate": 1.7319838110965192e-05, "loss": 0.4887, "step": 3286 }, { "epoch": 0.7332143653803257, "grad_norm": 0.16760720312595367, "learning_rate": 1.731823446511753e-05, "loss": 0.4715, "step": 3287 }, { "epoch": 0.7334374302922151, "grad_norm": 0.1632707566022873, "learning_rate": 1.7316630413940005e-05, "loss": 0.4803, "step": 3288 }, { "epoch": 0.7336604952041044, "grad_norm": 0.1794753223657608, "learning_rate": 1.7315025957521468e-05, "loss": 0.5025, "step": 3289 }, { "epoch": 0.7338835601159938, "grad_norm": 0.1697871834039688, "learning_rate": 1.7313421095950778e-05, "loss": 0.4725, "step": 3290 }, { "epoch": 0.7341066250278832, "grad_norm": 0.16839033365249634, "learning_rate": 1.7311815829316826e-05, "loss": 0.5406, "step": 3291 }, { "epoch": 0.7343296899397724, "grad_norm": 0.16735753417015076, "learning_rate": 1.731021015770852e-05, "loss": 0.5238, "step": 3292 }, { "epoch": 0.7345527548516618, "grad_norm": 0.16878098249435425, "learning_rate": 1.7308604081214793e-05, "loss": 0.4978, "step": 3293 }, { "epoch": 0.7347758197635512, "grad_norm": 0.16095402836799622, "learning_rate": 1.7306997599924597e-05, "loss": 0.4696, "step": 3294 }, { "epoch": 0.7349988846754405, "grad_norm": 0.16328908503055573, "learning_rate": 1.730539071392691e-05, "loss": 0.4965, "step": 3295 }, { "epoch": 0.7352219495873299, "grad_norm": 0.1627175509929657, "learning_rate": 1.7303783423310735e-05, "loss": 0.4822, "step": 3296 }, { "epoch": 0.7354450144992193, "grad_norm": 0.1619403213262558, "learning_rate": 1.730217572816509e-05, "loss": 0.5027, "step": 3297 }, { "epoch": 0.7356680794111087, "grad_norm": 0.16253367066383362, "learning_rate": 1.7300567628579025e-05, "loss": 0.5028, "step": 3298 }, { "epoch": 0.735891144322998, "grad_norm": 0.16807517409324646, "learning_rate": 1.72989591246416e-05, "loss": 0.4756, "step": 3299 }, { "epoch": 0.7361142092348874, "grad_norm": 0.18981721997261047, "learning_rate": 1.7297350216441903e-05, "loss": 0.5209, "step": 3300 }, { "epoch": 0.7363372741467767, "grad_norm": 0.1598469614982605, "learning_rate": 1.7295740904069053e-05, "loss": 0.4872, "step": 3301 }, { "epoch": 0.736560339058666, "grad_norm": 0.20748665928840637, "learning_rate": 1.7294131187612176e-05, "loss": 0.4603, "step": 3302 }, { "epoch": 0.7367834039705554, "grad_norm": 0.16444922983646393, "learning_rate": 1.7292521067160434e-05, "loss": 0.4772, "step": 3303 }, { "epoch": 0.7370064688824448, "grad_norm": 0.16779246926307678, "learning_rate": 1.7290910542803004e-05, "loss": 0.4948, "step": 3304 }, { "epoch": 0.7372295337943342, "grad_norm": 0.16438250243663788, "learning_rate": 1.7289299614629083e-05, "loss": 0.5103, "step": 3305 }, { "epoch": 0.7374525987062235, "grad_norm": 0.17737194895744324, "learning_rate": 1.7287688282727903e-05, "loss": 0.4994, "step": 3306 }, { "epoch": 0.7376756636181129, "grad_norm": 0.15918989479541779, "learning_rate": 1.7286076547188703e-05, "loss": 0.4978, "step": 3307 }, { "epoch": 0.7378987285300023, "grad_norm": 0.3185868561267853, "learning_rate": 1.728446440810075e-05, "loss": 0.489, "step": 3308 }, { "epoch": 0.7381217934418915, "grad_norm": 0.1764698624610901, "learning_rate": 1.728285186555334e-05, "loss": 0.506, "step": 3309 }, { "epoch": 0.7383448583537809, "grad_norm": 0.16116134822368622, "learning_rate": 1.7281238919635784e-05, "loss": 0.519, "step": 3310 }, { "epoch": 0.7385679232656703, "grad_norm": 0.1670210361480713, "learning_rate": 1.7279625570437413e-05, "loss": 0.5228, "step": 3311 }, { "epoch": 0.7387909881775596, "grad_norm": 0.16115038096904755, "learning_rate": 1.7278011818047588e-05, "loss": 0.4882, "step": 3312 }, { "epoch": 0.739014053089449, "grad_norm": 0.16242891550064087, "learning_rate": 1.7276397662555685e-05, "loss": 0.5139, "step": 3313 }, { "epoch": 0.7392371180013384, "grad_norm": 0.16696135699748993, "learning_rate": 1.7274783104051112e-05, "loss": 0.521, "step": 3314 }, { "epoch": 0.7394601829132278, "grad_norm": 0.1544819325208664, "learning_rate": 1.727316814262329e-05, "loss": 0.4765, "step": 3315 }, { "epoch": 0.7396832478251171, "grad_norm": 0.16446231305599213, "learning_rate": 1.727155277836167e-05, "loss": 0.4683, "step": 3316 }, { "epoch": 0.7399063127370065, "grad_norm": 0.1666785031557083, "learning_rate": 1.7269937011355713e-05, "loss": 0.503, "step": 3317 }, { "epoch": 0.7401293776488959, "grad_norm": 0.16166886687278748, "learning_rate": 1.7268320841694915e-05, "loss": 0.5014, "step": 3318 }, { "epoch": 0.7403524425607851, "grad_norm": 0.27264273166656494, "learning_rate": 1.7266704269468786e-05, "loss": 0.4834, "step": 3319 }, { "epoch": 0.7405755074726745, "grad_norm": 0.16175110638141632, "learning_rate": 1.7265087294766872e-05, "loss": 0.4926, "step": 3320 }, { "epoch": 0.7407985723845639, "grad_norm": 0.1768079400062561, "learning_rate": 1.726346991767872e-05, "loss": 0.4933, "step": 3321 }, { "epoch": 0.7410216372964533, "grad_norm": 0.15689828991889954, "learning_rate": 1.7261852138293918e-05, "loss": 0.4928, "step": 3322 }, { "epoch": 0.7412447022083426, "grad_norm": 0.16128939390182495, "learning_rate": 1.7260233956702062e-05, "loss": 0.4739, "step": 3323 }, { "epoch": 0.741467767120232, "grad_norm": 0.16142840683460236, "learning_rate": 1.7258615372992783e-05, "loss": 0.5123, "step": 3324 }, { "epoch": 0.7416908320321214, "grad_norm": 0.15940620005130768, "learning_rate": 1.7256996387255725e-05, "loss": 0.5013, "step": 3325 }, { "epoch": 0.7419138969440107, "grad_norm": 0.16134943068027496, "learning_rate": 1.7255376999580557e-05, "loss": 0.4651, "step": 3326 }, { "epoch": 0.7421369618559001, "grad_norm": 0.15544810891151428, "learning_rate": 1.7253757210056978e-05, "loss": 0.4657, "step": 3327 }, { "epoch": 0.7423600267677894, "grad_norm": 0.15278422832489014, "learning_rate": 1.7252137018774694e-05, "loss": 0.4604, "step": 3328 }, { "epoch": 0.7425830916796788, "grad_norm": 0.15711228549480438, "learning_rate": 1.7250516425823443e-05, "loss": 0.4903, "step": 3329 }, { "epoch": 0.7428061565915681, "grad_norm": 0.15838393568992615, "learning_rate": 1.7248895431292988e-05, "loss": 0.484, "step": 3330 }, { "epoch": 0.7430292215034575, "grad_norm": 0.15732166171073914, "learning_rate": 1.72472740352731e-05, "loss": 0.4637, "step": 3331 }, { "epoch": 0.7432522864153469, "grad_norm": 0.16680875420570374, "learning_rate": 1.7245652237853593e-05, "loss": 0.4999, "step": 3332 }, { "epoch": 0.7434753513272362, "grad_norm": 0.16169287264347076, "learning_rate": 1.7244030039124287e-05, "loss": 0.4829, "step": 3333 }, { "epoch": 0.7436984162391256, "grad_norm": 0.15638568997383118, "learning_rate": 1.7242407439175035e-05, "loss": 0.4731, "step": 3334 }, { "epoch": 0.743921481151015, "grad_norm": 0.1748877912759781, "learning_rate": 1.72407844380957e-05, "loss": 0.4979, "step": 3335 }, { "epoch": 0.7441445460629043, "grad_norm": 0.15920549631118774, "learning_rate": 1.7239161035976175e-05, "loss": 0.5024, "step": 3336 }, { "epoch": 0.7443676109747936, "grad_norm": 0.16807498037815094, "learning_rate": 1.7237537232906376e-05, "loss": 0.5414, "step": 3337 }, { "epoch": 0.744590675886683, "grad_norm": 0.16192768514156342, "learning_rate": 1.723591302897624e-05, "loss": 0.4883, "step": 3338 }, { "epoch": 0.7448137407985724, "grad_norm": 0.15893089771270752, "learning_rate": 1.7234288424275726e-05, "loss": 0.5002, "step": 3339 }, { "epoch": 0.7450368057104617, "grad_norm": 0.1622776985168457, "learning_rate": 1.7232663418894812e-05, "loss": 0.4859, "step": 3340 }, { "epoch": 0.7452598706223511, "grad_norm": 0.1694253832101822, "learning_rate": 1.72310380129235e-05, "loss": 0.4903, "step": 3341 }, { "epoch": 0.7454829355342405, "grad_norm": 0.1649332493543625, "learning_rate": 1.722941220645182e-05, "loss": 0.5145, "step": 3342 }, { "epoch": 0.7457060004461298, "grad_norm": 0.19938278198242188, "learning_rate": 1.722778599956982e-05, "loss": 0.4949, "step": 3343 }, { "epoch": 0.7459290653580192, "grad_norm": 0.15563727915287018, "learning_rate": 1.7226159392367564e-05, "loss": 0.4911, "step": 3344 }, { "epoch": 0.7461521302699086, "grad_norm": 0.15972435474395752, "learning_rate": 1.7224532384935148e-05, "loss": 0.4748, "step": 3345 }, { "epoch": 0.746375195181798, "grad_norm": 0.16170111298561096, "learning_rate": 1.722290497736268e-05, "loss": 0.4701, "step": 3346 }, { "epoch": 0.7465982600936872, "grad_norm": 0.1537921279668808, "learning_rate": 1.7221277169740305e-05, "loss": 0.478, "step": 3347 }, { "epoch": 0.7468213250055766, "grad_norm": 0.15656088292598724, "learning_rate": 1.7219648962158174e-05, "loss": 0.4851, "step": 3348 }, { "epoch": 0.747044389917466, "grad_norm": 0.16441093385219574, "learning_rate": 1.7218020354706473e-05, "loss": 0.4846, "step": 3349 }, { "epoch": 0.7472674548293553, "grad_norm": 0.17135939002037048, "learning_rate": 1.72163913474754e-05, "loss": 0.5044, "step": 3350 }, { "epoch": 0.7474905197412447, "grad_norm": 0.17070244252681732, "learning_rate": 1.721476194055518e-05, "loss": 0.537, "step": 3351 }, { "epoch": 0.7477135846531341, "grad_norm": 0.1651678830385208, "learning_rate": 1.7213132134036063e-05, "loss": 0.5161, "step": 3352 }, { "epoch": 0.7479366495650234, "grad_norm": 0.16997648775577545, "learning_rate": 1.7211501928008317e-05, "loss": 0.5026, "step": 3353 }, { "epoch": 0.7481597144769128, "grad_norm": 0.17357714474201202, "learning_rate": 1.7209871322562232e-05, "loss": 0.4899, "step": 3354 }, { "epoch": 0.7483827793888022, "grad_norm": 0.16555210947990417, "learning_rate": 1.7208240317788115e-05, "loss": 0.5176, "step": 3355 }, { "epoch": 0.7486058443006915, "grad_norm": 0.15799009799957275, "learning_rate": 1.7206608913776315e-05, "loss": 0.4923, "step": 3356 }, { "epoch": 0.7488289092125808, "grad_norm": 0.15853382647037506, "learning_rate": 1.720497711061718e-05, "loss": 0.4881, "step": 3357 }, { "epoch": 0.7490519741244702, "grad_norm": 0.15760891139507294, "learning_rate": 1.720334490840109e-05, "loss": 0.5008, "step": 3358 }, { "epoch": 0.7492750390363596, "grad_norm": 0.16732452809810638, "learning_rate": 1.720171230721845e-05, "loss": 0.4923, "step": 3359 }, { "epoch": 0.7494981039482489, "grad_norm": 0.17124591767787933, "learning_rate": 1.7200079307159677e-05, "loss": 0.5272, "step": 3360 }, { "epoch": 0.7497211688601383, "grad_norm": 0.16751034557819366, "learning_rate": 1.7198445908315226e-05, "loss": 0.4801, "step": 3361 }, { "epoch": 0.7499442337720277, "grad_norm": 0.16137580573558807, "learning_rate": 1.719681211077556e-05, "loss": 0.5371, "step": 3362 }, { "epoch": 0.7501672986839171, "grad_norm": 0.16350311040878296, "learning_rate": 1.7195177914631172e-05, "loss": 0.5152, "step": 3363 }, { "epoch": 0.7503903635958064, "grad_norm": 0.1560794562101364, "learning_rate": 1.719354331997257e-05, "loss": 0.4788, "step": 3364 }, { "epoch": 0.7506134285076957, "grad_norm": 0.16271278262138367, "learning_rate": 1.7191908326890288e-05, "loss": 0.4975, "step": 3365 }, { "epoch": 0.7508364934195851, "grad_norm": 0.21892762184143066, "learning_rate": 1.7190272935474883e-05, "loss": 0.511, "step": 3366 }, { "epoch": 0.7510595583314744, "grad_norm": 0.1879514753818512, "learning_rate": 1.7188637145816937e-05, "loss": 0.5277, "step": 3367 }, { "epoch": 0.7512826232433638, "grad_norm": 0.17167915403842926, "learning_rate": 1.718700095800705e-05, "loss": 0.5128, "step": 3368 }, { "epoch": 0.7515056881552532, "grad_norm": 0.16950075328350067, "learning_rate": 1.718536437213584e-05, "loss": 0.5175, "step": 3369 }, { "epoch": 0.7517287530671425, "grad_norm": 0.1571718156337738, "learning_rate": 1.718372738829395e-05, "loss": 0.489, "step": 3370 }, { "epoch": 0.7519518179790319, "grad_norm": 0.161586731672287, "learning_rate": 1.718209000657205e-05, "loss": 0.5112, "step": 3371 }, { "epoch": 0.7521748828909213, "grad_norm": 0.16906411945819855, "learning_rate": 1.718045222706083e-05, "loss": 0.5171, "step": 3372 }, { "epoch": 0.7523979478028107, "grad_norm": 0.17176932096481323, "learning_rate": 1.7178814049851e-05, "loss": 0.5352, "step": 3373 }, { "epoch": 0.7526210127146999, "grad_norm": 0.15188109874725342, "learning_rate": 1.717717547503329e-05, "loss": 0.4857, "step": 3374 }, { "epoch": 0.7528440776265893, "grad_norm": 0.1867418736219406, "learning_rate": 1.7175536502698456e-05, "loss": 0.5199, "step": 3375 }, { "epoch": 0.7530671425384787, "grad_norm": 0.1905105859041214, "learning_rate": 1.7173897132937274e-05, "loss": 0.5077, "step": 3376 }, { "epoch": 0.753290207450368, "grad_norm": 0.16345377266407013, "learning_rate": 1.7172257365840544e-05, "loss": 0.5362, "step": 3377 }, { "epoch": 0.7535132723622574, "grad_norm": 0.17045968770980835, "learning_rate": 1.7170617201499083e-05, "loss": 0.5057, "step": 3378 }, { "epoch": 0.7537363372741468, "grad_norm": 0.16535595059394836, "learning_rate": 1.716897664000374e-05, "loss": 0.5141, "step": 3379 }, { "epoch": 0.7539594021860362, "grad_norm": 0.16590653359889984, "learning_rate": 1.716733568144538e-05, "loss": 0.5146, "step": 3380 }, { "epoch": 0.7541824670979255, "grad_norm": 0.156137153506279, "learning_rate": 1.716569432591488e-05, "loss": 0.4984, "step": 3381 }, { "epoch": 0.7544055320098149, "grad_norm": 0.17076677083969116, "learning_rate": 1.7164052573503155e-05, "loss": 0.4968, "step": 3382 }, { "epoch": 0.7546285969217043, "grad_norm": 0.15686576068401337, "learning_rate": 1.7162410424301132e-05, "loss": 0.4868, "step": 3383 }, { "epoch": 0.7548516618335935, "grad_norm": 0.19442743062973022, "learning_rate": 1.716076787839977e-05, "loss": 0.5058, "step": 3384 }, { "epoch": 0.7550747267454829, "grad_norm": 0.16424842178821564, "learning_rate": 1.715912493589004e-05, "loss": 0.4857, "step": 3385 }, { "epoch": 0.7552977916573723, "grad_norm": 0.6959996819496155, "learning_rate": 1.7157481596862936e-05, "loss": 0.5091, "step": 3386 }, { "epoch": 0.7555208565692616, "grad_norm": 0.17369645833969116, "learning_rate": 1.7155837861409482e-05, "loss": 0.5116, "step": 3387 }, { "epoch": 0.755743921481151, "grad_norm": 0.2050577700138092, "learning_rate": 1.7154193729620713e-05, "loss": 0.4982, "step": 3388 }, { "epoch": 0.7559669863930404, "grad_norm": 0.15558819472789764, "learning_rate": 1.7152549201587695e-05, "loss": 0.4873, "step": 3389 }, { "epoch": 0.7561900513049298, "grad_norm": 0.16767403483390808, "learning_rate": 1.715090427740151e-05, "loss": 0.4806, "step": 3390 }, { "epoch": 0.7564131162168191, "grad_norm": 0.1653711050748825, "learning_rate": 1.714925895715326e-05, "loss": 0.4858, "step": 3391 }, { "epoch": 0.7566361811287085, "grad_norm": 0.1628393530845642, "learning_rate": 1.7147613240934087e-05, "loss": 0.5036, "step": 3392 }, { "epoch": 0.7568592460405978, "grad_norm": 0.1690330058336258, "learning_rate": 1.714596712883513e-05, "loss": 0.4788, "step": 3393 }, { "epoch": 0.7570823109524871, "grad_norm": 0.18115870654582977, "learning_rate": 1.714432062094756e-05, "loss": 0.5009, "step": 3394 }, { "epoch": 0.7573053758643765, "grad_norm": 0.17084982991218567, "learning_rate": 1.7142673717362578e-05, "loss": 0.5281, "step": 3395 }, { "epoch": 0.7575284407762659, "grad_norm": 0.16751186549663544, "learning_rate": 1.7141026418171396e-05, "loss": 0.5377, "step": 3396 }, { "epoch": 0.7577515056881553, "grad_norm": 0.16510246694087982, "learning_rate": 1.713937872346525e-05, "loss": 0.4995, "step": 3397 }, { "epoch": 0.7579745706000446, "grad_norm": 0.1689106822013855, "learning_rate": 1.7137730633335404e-05, "loss": 0.4891, "step": 3398 }, { "epoch": 0.758197635511934, "grad_norm": 0.16398201882839203, "learning_rate": 1.7136082147873136e-05, "loss": 0.5013, "step": 3399 }, { "epoch": 0.7584207004238234, "grad_norm": 0.16415871679782867, "learning_rate": 1.713443326716975e-05, "loss": 0.5343, "step": 3400 }, { "epoch": 0.7586437653357126, "grad_norm": 0.164589986205101, "learning_rate": 1.7132783991316577e-05, "loss": 0.4897, "step": 3401 }, { "epoch": 0.758866830247602, "grad_norm": 0.17511430382728577, "learning_rate": 1.7131134320404953e-05, "loss": 0.5088, "step": 3402 }, { "epoch": 0.7590898951594914, "grad_norm": 0.1833367496728897, "learning_rate": 1.7129484254526257e-05, "loss": 0.4811, "step": 3403 }, { "epoch": 0.7593129600713808, "grad_norm": 0.16160368919372559, "learning_rate": 1.7127833793771874e-05, "loss": 0.508, "step": 3404 }, { "epoch": 0.7595360249832701, "grad_norm": 0.1855868101119995, "learning_rate": 1.7126182938233228e-05, "loss": 0.5091, "step": 3405 }, { "epoch": 0.7597590898951595, "grad_norm": 0.16717424988746643, "learning_rate": 1.7124531688001735e-05, "loss": 0.482, "step": 3406 }, { "epoch": 0.7599821548070489, "grad_norm": 0.1635364592075348, "learning_rate": 1.7122880043168872e-05, "loss": 0.4706, "step": 3407 }, { "epoch": 0.7602052197189382, "grad_norm": 0.15802796185016632, "learning_rate": 1.71212280038261e-05, "loss": 0.4835, "step": 3408 }, { "epoch": 0.7604282846308276, "grad_norm": 0.16960306465625763, "learning_rate": 1.7119575570064926e-05, "loss": 0.474, "step": 3409 }, { "epoch": 0.760651349542717, "grad_norm": 0.16367004811763763, "learning_rate": 1.7117922741976878e-05, "loss": 0.5132, "step": 3410 }, { "epoch": 0.7608744144546062, "grad_norm": 0.1700059324502945, "learning_rate": 1.7116269519653493e-05, "loss": 0.5124, "step": 3411 }, { "epoch": 0.7610974793664956, "grad_norm": 0.19023141264915466, "learning_rate": 1.711461590318634e-05, "loss": 0.487, "step": 3412 }, { "epoch": 0.761320544278385, "grad_norm": 0.16813622415065765, "learning_rate": 1.7112961892667003e-05, "loss": 0.4749, "step": 3413 }, { "epoch": 0.7615436091902744, "grad_norm": 0.16135910153388977, "learning_rate": 1.7111307488187096e-05, "loss": 0.5041, "step": 3414 }, { "epoch": 0.7617666741021637, "grad_norm": 0.16579851508140564, "learning_rate": 1.710965268983825e-05, "loss": 0.5062, "step": 3415 }, { "epoch": 0.7619897390140531, "grad_norm": 0.15735271573066711, "learning_rate": 1.7107997497712113e-05, "loss": 0.5071, "step": 3416 }, { "epoch": 0.7622128039259425, "grad_norm": 0.1686030477285385, "learning_rate": 1.7106341911900365e-05, "loss": 0.5271, "step": 3417 }, { "epoch": 0.7624358688378318, "grad_norm": 0.17071577906608582, "learning_rate": 1.7104685932494704e-05, "loss": 0.5179, "step": 3418 }, { "epoch": 0.7626589337497212, "grad_norm": 0.15663312375545502, "learning_rate": 1.7103029559586843e-05, "loss": 0.4928, "step": 3419 }, { "epoch": 0.7628819986616105, "grad_norm": 0.17715679109096527, "learning_rate": 1.7101372793268526e-05, "loss": 0.5371, "step": 3420 }, { "epoch": 0.7631050635734999, "grad_norm": 0.1651579737663269, "learning_rate": 1.709971563363151e-05, "loss": 0.5072, "step": 3421 }, { "epoch": 0.7633281284853892, "grad_norm": 0.15921546518802643, "learning_rate": 1.7098058080767587e-05, "loss": 0.5008, "step": 3422 }, { "epoch": 0.7635511933972786, "grad_norm": 0.17586684226989746, "learning_rate": 1.709640013476856e-05, "loss": 0.4962, "step": 3423 }, { "epoch": 0.763774258309168, "grad_norm": 0.17664553225040436, "learning_rate": 1.7094741795726254e-05, "loss": 0.5107, "step": 3424 }, { "epoch": 0.7639973232210573, "grad_norm": 0.1621769219636917, "learning_rate": 1.7093083063732518e-05, "loss": 0.5017, "step": 3425 }, { "epoch": 0.7642203881329467, "grad_norm": 0.1619473695755005, "learning_rate": 1.7091423938879227e-05, "loss": 0.4858, "step": 3426 }, { "epoch": 0.7644434530448361, "grad_norm": 0.1741994023323059, "learning_rate": 1.7089764421258272e-05, "loss": 0.5226, "step": 3427 }, { "epoch": 0.7646665179567254, "grad_norm": 0.16964323818683624, "learning_rate": 1.7088104510961564e-05, "loss": 0.4987, "step": 3428 }, { "epoch": 0.7648895828686147, "grad_norm": 0.1797408014535904, "learning_rate": 1.7086444208081047e-05, "loss": 0.4833, "step": 3429 }, { "epoch": 0.7651126477805041, "grad_norm": 0.19941550493240356, "learning_rate": 1.708478351270867e-05, "loss": 0.5138, "step": 3430 }, { "epoch": 0.7653357126923935, "grad_norm": 0.17348411679267883, "learning_rate": 1.708312242493642e-05, "loss": 0.4866, "step": 3431 }, { "epoch": 0.7655587776042828, "grad_norm": 0.1664038747549057, "learning_rate": 1.7081460944856294e-05, "loss": 0.5124, "step": 3432 }, { "epoch": 0.7657818425161722, "grad_norm": 0.15917468070983887, "learning_rate": 1.7079799072560318e-05, "loss": 0.5066, "step": 3433 }, { "epoch": 0.7660049074280616, "grad_norm": 0.16369706392288208, "learning_rate": 1.7078136808140532e-05, "loss": 0.4903, "step": 3434 }, { "epoch": 0.7662279723399509, "grad_norm": 0.16175441443920135, "learning_rate": 1.707647415168901e-05, "loss": 0.525, "step": 3435 }, { "epoch": 0.7664510372518403, "grad_norm": 0.19323702156543732, "learning_rate": 1.707481110329783e-05, "loss": 0.4865, "step": 3436 }, { "epoch": 0.7666741021637297, "grad_norm": 0.15811483561992645, "learning_rate": 1.707314766305912e-05, "loss": 0.4709, "step": 3437 }, { "epoch": 0.7668971670756191, "grad_norm": 0.1613859087228775, "learning_rate": 1.707148383106499e-05, "loss": 0.4969, "step": 3438 }, { "epoch": 0.7671202319875083, "grad_norm": 0.16382627189159393, "learning_rate": 1.706981960740761e-05, "loss": 0.4893, "step": 3439 }, { "epoch": 0.7673432968993977, "grad_norm": 0.17395590245723724, "learning_rate": 1.706815499217915e-05, "loss": 0.5073, "step": 3440 }, { "epoch": 0.7675663618112871, "grad_norm": 0.16161273419857025, "learning_rate": 1.7066489985471802e-05, "loss": 0.4766, "step": 3441 }, { "epoch": 0.7677894267231764, "grad_norm": 0.16916361451148987, "learning_rate": 1.706482458737779e-05, "loss": 0.5102, "step": 3442 }, { "epoch": 0.7680124916350658, "grad_norm": 0.16515444219112396, "learning_rate": 1.7063158797989355e-05, "loss": 0.5006, "step": 3443 }, { "epoch": 0.7682355565469552, "grad_norm": 0.16712456941604614, "learning_rate": 1.7061492617398755e-05, "loss": 0.4757, "step": 3444 }, { "epoch": 0.7684586214588445, "grad_norm": 0.1649584323167801, "learning_rate": 1.7059826045698275e-05, "loss": 0.4979, "step": 3445 }, { "epoch": 0.7686816863707339, "grad_norm": 0.16432853043079376, "learning_rate": 1.7058159082980223e-05, "loss": 0.5155, "step": 3446 }, { "epoch": 0.7689047512826233, "grad_norm": 0.15966151654720306, "learning_rate": 1.7056491729336917e-05, "loss": 0.4908, "step": 3447 }, { "epoch": 0.7691278161945126, "grad_norm": 0.1597587913274765, "learning_rate": 1.7054823984860716e-05, "loss": 0.4912, "step": 3448 }, { "epoch": 0.7693508811064019, "grad_norm": 0.16364127397537231, "learning_rate": 1.705315584964399e-05, "loss": 0.5045, "step": 3449 }, { "epoch": 0.7695739460182913, "grad_norm": 0.15808305144309998, "learning_rate": 1.7051487323779122e-05, "loss": 0.5025, "step": 3450 }, { "epoch": 0.7697970109301807, "grad_norm": 0.173845574259758, "learning_rate": 1.704981840735853e-05, "loss": 0.5166, "step": 3451 }, { "epoch": 0.77002007584207, "grad_norm": 0.1586831957101822, "learning_rate": 1.7048149100474653e-05, "loss": 0.4758, "step": 3452 }, { "epoch": 0.7702431407539594, "grad_norm": 0.16501812636852264, "learning_rate": 1.704647940321994e-05, "loss": 0.5051, "step": 3453 }, { "epoch": 0.7704662056658488, "grad_norm": 0.17648212611675262, "learning_rate": 1.704480931568688e-05, "loss": 0.5014, "step": 3454 }, { "epoch": 0.7706892705777382, "grad_norm": 0.15847481787204742, "learning_rate": 1.704313883796796e-05, "loss": 0.5246, "step": 3455 }, { "epoch": 0.7709123354896275, "grad_norm": 0.17660297453403473, "learning_rate": 1.704146797015571e-05, "loss": 0.5028, "step": 3456 }, { "epoch": 0.7711354004015168, "grad_norm": 0.1632460057735443, "learning_rate": 1.7039796712342672e-05, "loss": 0.4647, "step": 3457 }, { "epoch": 0.7713584653134062, "grad_norm": 0.15266932547092438, "learning_rate": 1.7038125064621408e-05, "loss": 0.4829, "step": 3458 }, { "epoch": 0.7715815302252955, "grad_norm": 0.16995325684547424, "learning_rate": 1.703645302708451e-05, "loss": 0.5057, "step": 3459 }, { "epoch": 0.7718045951371849, "grad_norm": 0.15925444662570953, "learning_rate": 1.703478059982458e-05, "loss": 0.5128, "step": 3460 }, { "epoch": 0.7720276600490743, "grad_norm": 0.17199598252773285, "learning_rate": 1.703310778293425e-05, "loss": 0.5088, "step": 3461 }, { "epoch": 0.7722507249609636, "grad_norm": 0.17313992977142334, "learning_rate": 1.7031434576506173e-05, "loss": 0.4796, "step": 3462 }, { "epoch": 0.772473789872853, "grad_norm": 0.1665591597557068, "learning_rate": 1.7029760980633016e-05, "loss": 0.4978, "step": 3463 }, { "epoch": 0.7726968547847424, "grad_norm": 0.16291576623916626, "learning_rate": 1.7028086995407477e-05, "loss": 0.4892, "step": 3464 }, { "epoch": 0.7729199196966318, "grad_norm": 0.15629801154136658, "learning_rate": 1.7026412620922276e-05, "loss": 0.483, "step": 3465 }, { "epoch": 0.773142984608521, "grad_norm": 0.16080623865127563, "learning_rate": 1.702473785727014e-05, "loss": 0.4968, "step": 3466 }, { "epoch": 0.7733660495204104, "grad_norm": 0.16567450761795044, "learning_rate": 1.702306270454384e-05, "loss": 0.5023, "step": 3467 }, { "epoch": 0.7735891144322998, "grad_norm": 0.16630107164382935, "learning_rate": 1.702138716283615e-05, "loss": 0.4554, "step": 3468 }, { "epoch": 0.7738121793441891, "grad_norm": 0.1635141223669052, "learning_rate": 1.7019711232239872e-05, "loss": 0.4947, "step": 3469 }, { "epoch": 0.7740352442560785, "grad_norm": 0.16662491858005524, "learning_rate": 1.7018034912847826e-05, "loss": 0.4947, "step": 3470 }, { "epoch": 0.7742583091679679, "grad_norm": 0.16234828531742096, "learning_rate": 1.7016358204752865e-05, "loss": 0.5424, "step": 3471 }, { "epoch": 0.7744813740798573, "grad_norm": 0.16206470131874084, "learning_rate": 1.701468110804785e-05, "loss": 0.4761, "step": 3472 }, { "epoch": 0.7747044389917466, "grad_norm": 0.1729600876569748, "learning_rate": 1.7013003622825674e-05, "loss": 0.4867, "step": 3473 }, { "epoch": 0.774927503903636, "grad_norm": 0.15577882528305054, "learning_rate": 1.7011325749179245e-05, "loss": 0.4935, "step": 3474 }, { "epoch": 0.7751505688155254, "grad_norm": 0.16553224623203278, "learning_rate": 1.7009647487201492e-05, "loss": 0.5052, "step": 3475 }, { "epoch": 0.7753736337274146, "grad_norm": 0.16988618671894073, "learning_rate": 1.700796883698536e-05, "loss": 0.4999, "step": 3476 }, { "epoch": 0.775596698639304, "grad_norm": 0.15535961091518402, "learning_rate": 1.7006289798623842e-05, "loss": 0.494, "step": 3477 }, { "epoch": 0.7758197635511934, "grad_norm": 0.16982175409793854, "learning_rate": 1.700461037220992e-05, "loss": 0.5057, "step": 3478 }, { "epoch": 0.7760428284630828, "grad_norm": 0.17564809322357178, "learning_rate": 1.7002930557836615e-05, "loss": 0.5088, "step": 3479 }, { "epoch": 0.7762658933749721, "grad_norm": 0.16570429503917694, "learning_rate": 1.7001250355596967e-05, "loss": 0.5223, "step": 3480 }, { "epoch": 0.7764889582868615, "grad_norm": 0.16374748945236206, "learning_rate": 1.6999569765584035e-05, "loss": 0.4533, "step": 3481 }, { "epoch": 0.7767120231987509, "grad_norm": 0.15578052401542664, "learning_rate": 1.69978887878909e-05, "loss": 0.4797, "step": 3482 }, { "epoch": 0.7769350881106402, "grad_norm": 0.15627166628837585, "learning_rate": 1.6996207422610664e-05, "loss": 0.4523, "step": 3483 }, { "epoch": 0.7771581530225296, "grad_norm": 0.1620151549577713, "learning_rate": 1.6994525669836453e-05, "loss": 0.4999, "step": 3484 }, { "epoch": 0.7773812179344189, "grad_norm": 0.15806102752685547, "learning_rate": 1.6992843529661413e-05, "loss": 0.5013, "step": 3485 }, { "epoch": 0.7776042828463082, "grad_norm": 0.25901806354522705, "learning_rate": 1.6991161002178712e-05, "loss": 0.5003, "step": 3486 }, { "epoch": 0.7778273477581976, "grad_norm": 0.15611301362514496, "learning_rate": 1.698947808748154e-05, "loss": 0.5079, "step": 3487 }, { "epoch": 0.778050412670087, "grad_norm": 0.1656789630651474, "learning_rate": 1.6987794785663107e-05, "loss": 0.4983, "step": 3488 }, { "epoch": 0.7782734775819764, "grad_norm": 0.16347582638263702, "learning_rate": 1.698611109681664e-05, "loss": 0.4887, "step": 3489 }, { "epoch": 0.7784965424938657, "grad_norm": 0.15885215997695923, "learning_rate": 1.69844270210354e-05, "loss": 0.4837, "step": 3490 }, { "epoch": 0.7787196074057551, "grad_norm": 0.17082704603672028, "learning_rate": 1.698274255841265e-05, "loss": 0.5165, "step": 3491 }, { "epoch": 0.7789426723176445, "grad_norm": 0.16731634736061096, "learning_rate": 1.6981057709041703e-05, "loss": 0.485, "step": 3492 }, { "epoch": 0.7791657372295338, "grad_norm": 0.16583065688610077, "learning_rate": 1.697937247301586e-05, "loss": 0.5117, "step": 3493 }, { "epoch": 0.7793888021414231, "grad_norm": 0.16518862545490265, "learning_rate": 1.6977686850428475e-05, "loss": 0.5141, "step": 3494 }, { "epoch": 0.7796118670533125, "grad_norm": 0.15372084081172943, "learning_rate": 1.69760008413729e-05, "loss": 0.4787, "step": 3495 }, { "epoch": 0.7798349319652019, "grad_norm": 0.16989074647426605, "learning_rate": 1.6974314445942514e-05, "loss": 0.4906, "step": 3496 }, { "epoch": 0.7800579968770912, "grad_norm": 0.16484175622463226, "learning_rate": 1.697262766423072e-05, "loss": 0.5, "step": 3497 }, { "epoch": 0.7802810617889806, "grad_norm": 0.16868092119693756, "learning_rate": 1.6970940496330953e-05, "loss": 0.5106, "step": 3498 }, { "epoch": 0.78050412670087, "grad_norm": 0.16630426049232483, "learning_rate": 1.6969252942336648e-05, "loss": 0.505, "step": 3499 }, { "epoch": 0.7807271916127593, "grad_norm": 0.15219101309776306, "learning_rate": 1.696756500234128e-05, "loss": 0.4698, "step": 3500 }, { "epoch": 0.7809502565246487, "grad_norm": 0.16838675737380981, "learning_rate": 1.6965876676438334e-05, "loss": 0.4897, "step": 3501 }, { "epoch": 0.7811733214365381, "grad_norm": 0.15935415029525757, "learning_rate": 1.696418796472132e-05, "loss": 0.508, "step": 3502 }, { "epoch": 0.7813963863484273, "grad_norm": 0.16184543073177338, "learning_rate": 1.696249886728377e-05, "loss": 0.5083, "step": 3503 }, { "epoch": 0.7816194512603167, "grad_norm": 0.15746904909610748, "learning_rate": 1.6960809384219237e-05, "loss": 0.4662, "step": 3504 }, { "epoch": 0.7818425161722061, "grad_norm": 0.1662943959236145, "learning_rate": 1.6959119515621295e-05, "loss": 0.4853, "step": 3505 }, { "epoch": 0.7820655810840955, "grad_norm": 0.16866953670978546, "learning_rate": 1.695742926158354e-05, "loss": 0.4919, "step": 3506 }, { "epoch": 0.7822886459959848, "grad_norm": 0.1712365448474884, "learning_rate": 1.695573862219959e-05, "loss": 0.4882, "step": 3507 }, { "epoch": 0.7825117109078742, "grad_norm": 0.17081387341022491, "learning_rate": 1.6954047597563078e-05, "loss": 0.5068, "step": 3508 }, { "epoch": 0.7827347758197636, "grad_norm": 0.16168633103370667, "learning_rate": 1.695235618776767e-05, "loss": 0.5119, "step": 3509 }, { "epoch": 0.7829578407316529, "grad_norm": 0.2024863213300705, "learning_rate": 1.6950664392907042e-05, "loss": 0.5039, "step": 3510 }, { "epoch": 0.7831809056435423, "grad_norm": 0.1648394763469696, "learning_rate": 1.6948972213074902e-05, "loss": 0.5304, "step": 3511 }, { "epoch": 0.7834039705554317, "grad_norm": 0.15931276977062225, "learning_rate": 1.6947279648364966e-05, "loss": 0.5197, "step": 3512 }, { "epoch": 0.783627035467321, "grad_norm": 0.15827037394046783, "learning_rate": 1.6945586698870985e-05, "loss": 0.513, "step": 3513 }, { "epoch": 0.7838501003792103, "grad_norm": 0.15963312983512878, "learning_rate": 1.694389336468672e-05, "loss": 0.5096, "step": 3514 }, { "epoch": 0.7840731652910997, "grad_norm": 0.1624821275472641, "learning_rate": 1.694219964590597e-05, "loss": 0.5086, "step": 3515 }, { "epoch": 0.7842962302029891, "grad_norm": 0.15603917837142944, "learning_rate": 1.694050554262253e-05, "loss": 0.499, "step": 3516 }, { "epoch": 0.7845192951148784, "grad_norm": 0.16541822254657745, "learning_rate": 1.6938811054930237e-05, "loss": 0.4843, "step": 3517 }, { "epoch": 0.7847423600267678, "grad_norm": 0.16056667268276215, "learning_rate": 1.693711618292294e-05, "loss": 0.4992, "step": 3518 }, { "epoch": 0.7849654249386572, "grad_norm": 0.15387259423732758, "learning_rate": 1.693542092669451e-05, "loss": 0.5056, "step": 3519 }, { "epoch": 0.7851884898505465, "grad_norm": 0.15904875099658966, "learning_rate": 1.6933725286338846e-05, "loss": 0.4718, "step": 3520 }, { "epoch": 0.7854115547624358, "grad_norm": 0.16445663571357727, "learning_rate": 1.693202926194986e-05, "loss": 0.4752, "step": 3521 }, { "epoch": 0.7856346196743252, "grad_norm": 0.16833317279815674, "learning_rate": 1.693033285362149e-05, "loss": 0.4626, "step": 3522 }, { "epoch": 0.7858576845862146, "grad_norm": 0.17628587782382965, "learning_rate": 1.692863606144769e-05, "loss": 0.5001, "step": 3523 }, { "epoch": 0.7860807494981039, "grad_norm": 0.16575254499912262, "learning_rate": 1.692693888552245e-05, "loss": 0.4987, "step": 3524 }, { "epoch": 0.7863038144099933, "grad_norm": 0.1609877198934555, "learning_rate": 1.6925241325939756e-05, "loss": 0.4897, "step": 3525 }, { "epoch": 0.7865268793218827, "grad_norm": 0.1630864441394806, "learning_rate": 1.6923543382793636e-05, "loss": 0.473, "step": 3526 }, { "epoch": 0.786749944233772, "grad_norm": 0.16511790454387665, "learning_rate": 1.6921845056178133e-05, "loss": 0.511, "step": 3527 }, { "epoch": 0.7869730091456614, "grad_norm": 0.15955112874507904, "learning_rate": 1.6920146346187312e-05, "loss": 0.4702, "step": 3528 }, { "epoch": 0.7871960740575508, "grad_norm": 0.16080255806446075, "learning_rate": 1.691844725291526e-05, "loss": 0.4897, "step": 3529 }, { "epoch": 0.7874191389694402, "grad_norm": 0.15729734301567078, "learning_rate": 1.6916747776456074e-05, "loss": 0.477, "step": 3530 }, { "epoch": 0.7876422038813294, "grad_norm": 0.18572328984737396, "learning_rate": 1.691504791690389e-05, "loss": 0.4953, "step": 3531 }, { "epoch": 0.7878652687932188, "grad_norm": 0.17920143902301788, "learning_rate": 1.6913347674352855e-05, "loss": 0.4932, "step": 3532 }, { "epoch": 0.7880883337051082, "grad_norm": 0.14651532471179962, "learning_rate": 1.691164704889714e-05, "loss": 0.4531, "step": 3533 }, { "epoch": 0.7883113986169975, "grad_norm": 0.16487324237823486, "learning_rate": 1.6909946040630935e-05, "loss": 0.487, "step": 3534 }, { "epoch": 0.7885344635288869, "grad_norm": 0.16055698692798615, "learning_rate": 1.6908244649648455e-05, "loss": 0.4754, "step": 3535 }, { "epoch": 0.7887575284407763, "grad_norm": 0.1739615648984909, "learning_rate": 1.690654287604393e-05, "loss": 0.5076, "step": 3536 }, { "epoch": 0.7889805933526656, "grad_norm": 0.17668306827545166, "learning_rate": 1.690484071991162e-05, "loss": 0.5177, "step": 3537 }, { "epoch": 0.789203658264555, "grad_norm": 0.17033684253692627, "learning_rate": 1.690313818134579e-05, "loss": 0.4884, "step": 3538 }, { "epoch": 0.7894267231764444, "grad_norm": 0.19986379146575928, "learning_rate": 1.690143526044075e-05, "loss": 0.4885, "step": 3539 }, { "epoch": 0.7896497880883337, "grad_norm": 0.17267881333827972, "learning_rate": 1.6899731957290814e-05, "loss": 0.498, "step": 3540 }, { "epoch": 0.789872853000223, "grad_norm": 0.15866464376449585, "learning_rate": 1.689802827199032e-05, "loss": 0.4856, "step": 3541 }, { "epoch": 0.7900959179121124, "grad_norm": 0.17956510186195374, "learning_rate": 1.689632420463363e-05, "loss": 0.5046, "step": 3542 }, { "epoch": 0.7903189828240018, "grad_norm": 0.17033924162387848, "learning_rate": 1.6894619755315127e-05, "loss": 0.5272, "step": 3543 }, { "epoch": 0.7905420477358911, "grad_norm": 0.15526439249515533, "learning_rate": 1.6892914924129212e-05, "loss": 0.4858, "step": 3544 }, { "epoch": 0.7907651126477805, "grad_norm": 0.16875073313713074, "learning_rate": 1.689120971117031e-05, "loss": 0.4957, "step": 3545 }, { "epoch": 0.7909881775596699, "grad_norm": 0.16168349981307983, "learning_rate": 1.6889504116532868e-05, "loss": 0.5041, "step": 3546 }, { "epoch": 0.7912112424715593, "grad_norm": 0.1654532253742218, "learning_rate": 1.688779814031135e-05, "loss": 0.5032, "step": 3547 }, { "epoch": 0.7914343073834486, "grad_norm": 0.16922220587730408, "learning_rate": 1.6886091782600248e-05, "loss": 0.5098, "step": 3548 }, { "epoch": 0.791657372295338, "grad_norm": 0.17434757947921753, "learning_rate": 1.6884385043494064e-05, "loss": 0.5232, "step": 3549 }, { "epoch": 0.7918804372072273, "grad_norm": 0.1557532101869583, "learning_rate": 1.688267792308733e-05, "loss": 0.4865, "step": 3550 }, { "epoch": 0.7921035021191166, "grad_norm": 0.1586635261774063, "learning_rate": 1.6880970421474604e-05, "loss": 0.4967, "step": 3551 }, { "epoch": 0.792326567031006, "grad_norm": 0.16100507974624634, "learning_rate": 1.6879262538750453e-05, "loss": 0.4804, "step": 3552 }, { "epoch": 0.7925496319428954, "grad_norm": 0.15817134082317352, "learning_rate": 1.6877554275009467e-05, "loss": 0.5066, "step": 3553 }, { "epoch": 0.7927726968547848, "grad_norm": 0.15588125586509705, "learning_rate": 1.6875845630346265e-05, "loss": 0.5074, "step": 3554 }, { "epoch": 0.7929957617666741, "grad_norm": 0.29957878589630127, "learning_rate": 1.687413660485548e-05, "loss": 0.4962, "step": 3555 }, { "epoch": 0.7932188266785635, "grad_norm": 0.15556854009628296, "learning_rate": 1.6872427198631772e-05, "loss": 0.4746, "step": 3556 }, { "epoch": 0.7934418915904529, "grad_norm": 0.1643342226743698, "learning_rate": 1.6870717411769818e-05, "loss": 0.5085, "step": 3557 }, { "epoch": 0.7936649565023421, "grad_norm": 0.16817674040794373, "learning_rate": 1.686900724436431e-05, "loss": 0.5078, "step": 3558 }, { "epoch": 0.7938880214142315, "grad_norm": 0.17203262448310852, "learning_rate": 1.6867296696509978e-05, "loss": 0.4775, "step": 3559 }, { "epoch": 0.7941110863261209, "grad_norm": 0.1592828631401062, "learning_rate": 1.6865585768301556e-05, "loss": 0.487, "step": 3560 }, { "epoch": 0.7943341512380102, "grad_norm": 0.15609320998191833, "learning_rate": 1.6863874459833806e-05, "loss": 0.4696, "step": 3561 }, { "epoch": 0.7945572161498996, "grad_norm": 0.17543616890907288, "learning_rate": 1.6862162771201515e-05, "loss": 0.5098, "step": 3562 }, { "epoch": 0.794780281061789, "grad_norm": 0.17788106203079224, "learning_rate": 1.6860450702499486e-05, "loss": 0.4826, "step": 3563 }, { "epoch": 0.7950033459736784, "grad_norm": 0.16301631927490234, "learning_rate": 1.685873825382254e-05, "loss": 0.5087, "step": 3564 }, { "epoch": 0.7952264108855677, "grad_norm": 0.16601480543613434, "learning_rate": 1.685702542526553e-05, "loss": 0.4624, "step": 3565 }, { "epoch": 0.7954494757974571, "grad_norm": 0.7922313809394836, "learning_rate": 1.6855312216923316e-05, "loss": 0.5045, "step": 3566 }, { "epoch": 0.7956725407093465, "grad_norm": 0.2716256380081177, "learning_rate": 1.685359862889079e-05, "loss": 0.5, "step": 3567 }, { "epoch": 0.7958956056212357, "grad_norm": 0.1633540540933609, "learning_rate": 1.6851884661262864e-05, "loss": 0.4944, "step": 3568 }, { "epoch": 0.7961186705331251, "grad_norm": 0.17535462975502014, "learning_rate": 1.6850170314134465e-05, "loss": 0.5251, "step": 3569 }, { "epoch": 0.7963417354450145, "grad_norm": 0.18369325995445251, "learning_rate": 1.6848455587600542e-05, "loss": 0.5043, "step": 3570 }, { "epoch": 0.7965648003569039, "grad_norm": 0.15560777485370636, "learning_rate": 1.6846740481756072e-05, "loss": 0.4686, "step": 3571 }, { "epoch": 0.7967878652687932, "grad_norm": 0.18801093101501465, "learning_rate": 1.6845024996696047e-05, "loss": 0.4946, "step": 3572 }, { "epoch": 0.7970109301806826, "grad_norm": 0.17000144720077515, "learning_rate": 1.684330913251548e-05, "loss": 0.5, "step": 3573 }, { "epoch": 0.797233995092572, "grad_norm": 0.16179999709129333, "learning_rate": 1.6841592889309405e-05, "loss": 0.4769, "step": 3574 }, { "epoch": 0.7974570600044613, "grad_norm": 0.16142041981220245, "learning_rate": 1.6839876267172883e-05, "loss": 0.4644, "step": 3575 }, { "epoch": 0.7976801249163507, "grad_norm": 0.16446426510810852, "learning_rate": 1.683815926620099e-05, "loss": 0.4835, "step": 3576 }, { "epoch": 0.79790318982824, "grad_norm": 0.16954201459884644, "learning_rate": 1.6836441886488822e-05, "loss": 0.5151, "step": 3577 }, { "epoch": 0.7981262547401293, "grad_norm": 0.16908320784568787, "learning_rate": 1.6834724128131496e-05, "loss": 0.4714, "step": 3578 }, { "epoch": 0.7983493196520187, "grad_norm": 0.1670169085264206, "learning_rate": 1.683300599122416e-05, "loss": 0.5004, "step": 3579 }, { "epoch": 0.7985723845639081, "grad_norm": 0.17246994376182556, "learning_rate": 1.683128747586197e-05, "loss": 0.5066, "step": 3580 }, { "epoch": 0.7987954494757975, "grad_norm": 0.16627082228660583, "learning_rate": 1.6829568582140108e-05, "loss": 0.4813, "step": 3581 }, { "epoch": 0.7990185143876868, "grad_norm": 0.16429170966148376, "learning_rate": 1.6827849310153778e-05, "loss": 0.4916, "step": 3582 }, { "epoch": 0.7992415792995762, "grad_norm": 0.1685648411512375, "learning_rate": 1.6826129659998204e-05, "loss": 0.4821, "step": 3583 }, { "epoch": 0.7994646442114656, "grad_norm": 0.1623144894838333, "learning_rate": 1.6824409631768633e-05, "loss": 0.5092, "step": 3584 }, { "epoch": 0.7996877091233549, "grad_norm": 0.16186010837554932, "learning_rate": 1.6822689225560328e-05, "loss": 0.4647, "step": 3585 }, { "epoch": 0.7999107740352442, "grad_norm": 0.15835130214691162, "learning_rate": 1.682096844146858e-05, "loss": 0.5139, "step": 3586 }, { "epoch": 0.8001338389471336, "grad_norm": 0.1640879362821579, "learning_rate": 1.681924727958869e-05, "loss": 0.4815, "step": 3587 }, { "epoch": 0.800356903859023, "grad_norm": 0.18087539076805115, "learning_rate": 1.681752574001599e-05, "loss": 0.5057, "step": 3588 }, { "epoch": 0.8005799687709123, "grad_norm": 0.16318152844905853, "learning_rate": 1.6815803822845834e-05, "loss": 0.4945, "step": 3589 }, { "epoch": 0.8008030336828017, "grad_norm": 0.16909107565879822, "learning_rate": 1.681408152817359e-05, "loss": 0.5123, "step": 3590 }, { "epoch": 0.8010260985946911, "grad_norm": 0.16913791000843048, "learning_rate": 1.6812358856094652e-05, "loss": 0.4933, "step": 3591 }, { "epoch": 0.8012491635065804, "grad_norm": 0.16670222580432892, "learning_rate": 1.681063580670442e-05, "loss": 0.5109, "step": 3592 }, { "epoch": 0.8014722284184698, "grad_norm": 0.16838552057743073, "learning_rate": 1.680891238009834e-05, "loss": 0.4926, "step": 3593 }, { "epoch": 0.8016952933303592, "grad_norm": 0.15705366432666779, "learning_rate": 1.6807188576371864e-05, "loss": 0.4762, "step": 3594 }, { "epoch": 0.8019183582422484, "grad_norm": 0.15505351126194, "learning_rate": 1.6805464395620465e-05, "loss": 0.4769, "step": 3595 }, { "epoch": 0.8021414231541378, "grad_norm": 0.17056378722190857, "learning_rate": 1.6803739837939642e-05, "loss": 0.5038, "step": 3596 }, { "epoch": 0.8023644880660272, "grad_norm": 0.15908926725387573, "learning_rate": 1.6802014903424905e-05, "loss": 0.5079, "step": 3597 }, { "epoch": 0.8025875529779166, "grad_norm": 0.16009296476840973, "learning_rate": 1.68002895921718e-05, "loss": 0.497, "step": 3598 }, { "epoch": 0.8028106178898059, "grad_norm": 0.17202340066432953, "learning_rate": 1.6798563904275882e-05, "loss": 0.5097, "step": 3599 }, { "epoch": 0.8030336828016953, "grad_norm": 0.16179756820201874, "learning_rate": 1.679683783983273e-05, "loss": 0.4992, "step": 3600 }, { "epoch": 0.8032567477135847, "grad_norm": 0.16165940463542938, "learning_rate": 1.6795111398937944e-05, "loss": 0.4959, "step": 3601 }, { "epoch": 0.803479812625474, "grad_norm": 0.17612482607364655, "learning_rate": 1.679338458168714e-05, "loss": 0.4937, "step": 3602 }, { "epoch": 0.8037028775373634, "grad_norm": 0.16228753328323364, "learning_rate": 1.679165738817597e-05, "loss": 0.5287, "step": 3603 }, { "epoch": 0.8039259424492528, "grad_norm": 0.17843377590179443, "learning_rate": 1.6789929818500096e-05, "loss": 0.4851, "step": 3604 }, { "epoch": 0.8041490073611421, "grad_norm": 0.1637219339609146, "learning_rate": 1.6788201872755196e-05, "loss": 0.4633, "step": 3605 }, { "epoch": 0.8043720722730314, "grad_norm": 0.16961325705051422, "learning_rate": 1.6786473551036978e-05, "loss": 0.5084, "step": 3606 }, { "epoch": 0.8045951371849208, "grad_norm": 0.15785901248455048, "learning_rate": 1.6784744853441167e-05, "loss": 0.4855, "step": 3607 }, { "epoch": 0.8048182020968102, "grad_norm": 0.1613706648349762, "learning_rate": 1.6783015780063503e-05, "loss": 0.4674, "step": 3608 }, { "epoch": 0.8050412670086995, "grad_norm": 0.15770243108272552, "learning_rate": 1.678128633099976e-05, "loss": 0.4802, "step": 3609 }, { "epoch": 0.8052643319205889, "grad_norm": 0.16045016050338745, "learning_rate": 1.677955650634573e-05, "loss": 0.4965, "step": 3610 }, { "epoch": 0.8054873968324783, "grad_norm": 0.15433759987354279, "learning_rate": 1.6777826306197208e-05, "loss": 0.4752, "step": 3611 }, { "epoch": 0.8057104617443676, "grad_norm": 0.16770930588245392, "learning_rate": 1.6776095730650034e-05, "loss": 0.5025, "step": 3612 }, { "epoch": 0.805933526656257, "grad_norm": 0.15930598974227905, "learning_rate": 1.6774364779800057e-05, "loss": 0.4995, "step": 3613 }, { "epoch": 0.8061565915681463, "grad_norm": 0.16089414060115814, "learning_rate": 1.6772633453743142e-05, "loss": 0.4973, "step": 3614 }, { "epoch": 0.8063796564800357, "grad_norm": 0.1723753660917282, "learning_rate": 1.6770901752575186e-05, "loss": 0.5267, "step": 3615 }, { "epoch": 0.806602721391925, "grad_norm": 0.15985791385173798, "learning_rate": 1.6769169676392103e-05, "loss": 0.4873, "step": 3616 }, { "epoch": 0.8068257863038144, "grad_norm": 0.15705527365207672, "learning_rate": 1.676743722528982e-05, "loss": 0.48, "step": 3617 }, { "epoch": 0.8070488512157038, "grad_norm": 0.17080652713775635, "learning_rate": 1.6765704399364297e-05, "loss": 0.507, "step": 3618 }, { "epoch": 0.8072719161275931, "grad_norm": 0.16177797317504883, "learning_rate": 1.6763971198711505e-05, "loss": 0.5168, "step": 3619 }, { "epoch": 0.8074949810394825, "grad_norm": 0.15763692557811737, "learning_rate": 1.6762237623427445e-05, "loss": 0.4736, "step": 3620 }, { "epoch": 0.8077180459513719, "grad_norm": 0.15657658874988556, "learning_rate": 1.6760503673608123e-05, "loss": 0.479, "step": 3621 }, { "epoch": 0.8079411108632613, "grad_norm": 0.15933813154697418, "learning_rate": 1.6758769349349586e-05, "loss": 0.519, "step": 3622 }, { "epoch": 0.8081641757751505, "grad_norm": 0.173330619931221, "learning_rate": 1.675703465074789e-05, "loss": 0.5056, "step": 3623 }, { "epoch": 0.8083872406870399, "grad_norm": 0.16727612912654877, "learning_rate": 1.6755299577899107e-05, "loss": 0.4949, "step": 3624 }, { "epoch": 0.8086103055989293, "grad_norm": 0.23466536402702332, "learning_rate": 1.6753564130899343e-05, "loss": 0.5203, "step": 3625 }, { "epoch": 0.8088333705108186, "grad_norm": 0.16051769256591797, "learning_rate": 1.6751828309844714e-05, "loss": 0.4907, "step": 3626 }, { "epoch": 0.809056435422708, "grad_norm": 0.16254782676696777, "learning_rate": 1.6750092114831368e-05, "loss": 0.4976, "step": 3627 }, { "epoch": 0.8092795003345974, "grad_norm": 0.1722305417060852, "learning_rate": 1.6748355545955456e-05, "loss": 0.4932, "step": 3628 }, { "epoch": 0.8095025652464868, "grad_norm": 0.16842518746852875, "learning_rate": 1.6746618603313165e-05, "loss": 0.4934, "step": 3629 }, { "epoch": 0.8097256301583761, "grad_norm": 0.16397298872470856, "learning_rate": 1.67448812870007e-05, "loss": 0.4824, "step": 3630 }, { "epoch": 0.8099486950702655, "grad_norm": 0.15751919150352478, "learning_rate": 1.674314359711428e-05, "loss": 0.5057, "step": 3631 }, { "epoch": 0.8101717599821548, "grad_norm": 0.15799839794635773, "learning_rate": 1.6741405533750154e-05, "loss": 0.4941, "step": 3632 }, { "epoch": 0.8103948248940441, "grad_norm": 0.16038256883621216, "learning_rate": 1.6739667097004583e-05, "loss": 0.4765, "step": 3633 }, { "epoch": 0.8106178898059335, "grad_norm": 0.18798737227916718, "learning_rate": 1.6737928286973852e-05, "loss": 0.5119, "step": 3634 }, { "epoch": 0.8108409547178229, "grad_norm": 0.1568623036146164, "learning_rate": 1.673618910375427e-05, "loss": 0.4881, "step": 3635 }, { "epoch": 0.8110640196297122, "grad_norm": 0.16038067638874054, "learning_rate": 1.6734449547442165e-05, "loss": 0.491, "step": 3636 }, { "epoch": 0.8112870845416016, "grad_norm": 0.1699703484773636, "learning_rate": 1.6732709618133882e-05, "loss": 0.4772, "step": 3637 }, { "epoch": 0.811510149453491, "grad_norm": 0.16090309619903564, "learning_rate": 1.673096931592579e-05, "loss": 0.4703, "step": 3638 }, { "epoch": 0.8117332143653804, "grad_norm": 0.16035638749599457, "learning_rate": 1.672922864091428e-05, "loss": 0.5076, "step": 3639 }, { "epoch": 0.8119562792772697, "grad_norm": 0.16750063002109528, "learning_rate": 1.6727487593195757e-05, "loss": 0.4961, "step": 3640 }, { "epoch": 0.812179344189159, "grad_norm": 0.15547265112400055, "learning_rate": 1.6725746172866652e-05, "loss": 0.4759, "step": 3641 }, { "epoch": 0.8124024091010484, "grad_norm": 0.15655042231082916, "learning_rate": 1.672400438002342e-05, "loss": 0.511, "step": 3642 }, { "epoch": 0.8126254740129377, "grad_norm": 0.1612492948770523, "learning_rate": 1.6722262214762527e-05, "loss": 0.4901, "step": 3643 }, { "epoch": 0.8128485389248271, "grad_norm": 0.15349607169628143, "learning_rate": 1.6720519677180472e-05, "loss": 0.5094, "step": 3644 }, { "epoch": 0.8130716038367165, "grad_norm": 0.16423405706882477, "learning_rate": 1.671877676737376e-05, "loss": 0.513, "step": 3645 }, { "epoch": 0.8132946687486059, "grad_norm": 0.1583424061536789, "learning_rate": 1.671703348543893e-05, "loss": 0.4795, "step": 3646 }, { "epoch": 0.8135177336604952, "grad_norm": 0.16294583678245544, "learning_rate": 1.671528983147253e-05, "loss": 0.5019, "step": 3647 }, { "epoch": 0.8137407985723846, "grad_norm": 0.1648416668176651, "learning_rate": 1.671354580557114e-05, "loss": 0.4879, "step": 3648 }, { "epoch": 0.813963863484274, "grad_norm": 0.16462133824825287, "learning_rate": 1.6711801407831356e-05, "loss": 0.4911, "step": 3649 }, { "epoch": 0.8141869283961632, "grad_norm": 0.26390138268470764, "learning_rate": 1.671005663834979e-05, "loss": 0.4822, "step": 3650 }, { "epoch": 0.8144099933080526, "grad_norm": 0.17741946876049042, "learning_rate": 1.670831149722308e-05, "loss": 0.5237, "step": 3651 }, { "epoch": 0.814633058219942, "grad_norm": 0.18031084537506104, "learning_rate": 1.670656598454788e-05, "loss": 0.4956, "step": 3652 }, { "epoch": 0.8148561231318313, "grad_norm": 0.1602051854133606, "learning_rate": 1.670482010042087e-05, "loss": 0.4695, "step": 3653 }, { "epoch": 0.8150791880437207, "grad_norm": 0.15887601673603058, "learning_rate": 1.670307384493875e-05, "loss": 0.4907, "step": 3654 }, { "epoch": 0.8153022529556101, "grad_norm": 0.16547198593616486, "learning_rate": 1.6701327218198234e-05, "loss": 0.4944, "step": 3655 }, { "epoch": 0.8155253178674995, "grad_norm": 0.16335895657539368, "learning_rate": 1.6699580220296065e-05, "loss": 0.484, "step": 3656 }, { "epoch": 0.8157483827793888, "grad_norm": 0.1715383529663086, "learning_rate": 1.6697832851329002e-05, "loss": 0.5086, "step": 3657 }, { "epoch": 0.8159714476912782, "grad_norm": 0.16271516680717468, "learning_rate": 1.6696085111393825e-05, "loss": 0.4771, "step": 3658 }, { "epoch": 0.8161945126031676, "grad_norm": 0.16480712592601776, "learning_rate": 1.6694337000587334e-05, "loss": 0.5099, "step": 3659 }, { "epoch": 0.8164175775150568, "grad_norm": 0.15898552536964417, "learning_rate": 1.669258851900635e-05, "loss": 0.4987, "step": 3660 }, { "epoch": 0.8166406424269462, "grad_norm": 0.16049543023109436, "learning_rate": 1.6690839666747717e-05, "loss": 0.4762, "step": 3661 }, { "epoch": 0.8168637073388356, "grad_norm": 0.15958286821842194, "learning_rate": 1.6689090443908296e-05, "loss": 0.4827, "step": 3662 }, { "epoch": 0.817086772250725, "grad_norm": 0.16123978793621063, "learning_rate": 1.668734085058497e-05, "loss": 0.4803, "step": 3663 }, { "epoch": 0.8173098371626143, "grad_norm": 0.1554545909166336, "learning_rate": 1.668559088687464e-05, "loss": 0.4813, "step": 3664 }, { "epoch": 0.8175329020745037, "grad_norm": 0.15960079431533813, "learning_rate": 1.6683840552874235e-05, "loss": 0.4947, "step": 3665 }, { "epoch": 0.8177559669863931, "grad_norm": 0.15964864194393158, "learning_rate": 1.6682089848680698e-05, "loss": 0.4815, "step": 3666 }, { "epoch": 0.8179790318982824, "grad_norm": 0.16264641284942627, "learning_rate": 1.6680338774390993e-05, "loss": 0.509, "step": 3667 }, { "epoch": 0.8182020968101718, "grad_norm": 0.164497509598732, "learning_rate": 1.6678587330102103e-05, "loss": 0.5247, "step": 3668 }, { "epoch": 0.8184251617220611, "grad_norm": 0.16061222553253174, "learning_rate": 1.667683551591104e-05, "loss": 0.5161, "step": 3669 }, { "epoch": 0.8186482266339504, "grad_norm": 0.19114501774311066, "learning_rate": 1.6675083331914823e-05, "loss": 0.5119, "step": 3670 }, { "epoch": 0.8188712915458398, "grad_norm": 0.18399201333522797, "learning_rate": 1.6673330778210508e-05, "loss": 0.4907, "step": 3671 }, { "epoch": 0.8190943564577292, "grad_norm": 0.15932810306549072, "learning_rate": 1.6671577854895153e-05, "loss": 0.5102, "step": 3672 }, { "epoch": 0.8193174213696186, "grad_norm": 0.1565459966659546, "learning_rate": 1.6669824562065856e-05, "loss": 0.4752, "step": 3673 }, { "epoch": 0.8195404862815079, "grad_norm": 0.1646818071603775, "learning_rate": 1.6668070899819714e-05, "loss": 0.5099, "step": 3674 }, { "epoch": 0.8197635511933973, "grad_norm": 0.1591091752052307, "learning_rate": 1.6666316868253867e-05, "loss": 0.4969, "step": 3675 }, { "epoch": 0.8199866161052867, "grad_norm": 0.16319642961025238, "learning_rate": 1.6664562467465455e-05, "loss": 0.5129, "step": 3676 }, { "epoch": 0.820209681017176, "grad_norm": 0.16143767535686493, "learning_rate": 1.6662807697551654e-05, "loss": 0.4952, "step": 3677 }, { "epoch": 0.8204327459290653, "grad_norm": 0.15746799111366272, "learning_rate": 1.666105255860965e-05, "loss": 0.4947, "step": 3678 }, { "epoch": 0.8206558108409547, "grad_norm": 0.16585499048233032, "learning_rate": 1.6659297050736657e-05, "loss": 0.5134, "step": 3679 }, { "epoch": 0.8208788757528441, "grad_norm": 0.16004452109336853, "learning_rate": 1.6657541174029902e-05, "loss": 0.4995, "step": 3680 }, { "epoch": 0.8211019406647334, "grad_norm": 0.1682739406824112, "learning_rate": 1.665578492858664e-05, "loss": 0.4796, "step": 3681 }, { "epoch": 0.8213250055766228, "grad_norm": 0.1618606299161911, "learning_rate": 1.6654028314504147e-05, "loss": 0.4849, "step": 3682 }, { "epoch": 0.8215480704885122, "grad_norm": 0.1632058471441269, "learning_rate": 1.6652271331879706e-05, "loss": 0.499, "step": 3683 }, { "epoch": 0.8217711354004015, "grad_norm": 0.16575373709201813, "learning_rate": 1.665051398081064e-05, "loss": 0.5193, "step": 3684 }, { "epoch": 0.8219942003122909, "grad_norm": 0.16228437423706055, "learning_rate": 1.664875626139427e-05, "loss": 0.5065, "step": 3685 }, { "epoch": 0.8222172652241803, "grad_norm": 0.1525840312242508, "learning_rate": 1.6646998173727955e-05, "loss": 0.4904, "step": 3686 }, { "epoch": 0.8224403301360695, "grad_norm": 0.16036051511764526, "learning_rate": 1.6645239717909074e-05, "loss": 0.4798, "step": 3687 }, { "epoch": 0.8226633950479589, "grad_norm": 0.16049040853977203, "learning_rate": 1.6643480894035015e-05, "loss": 0.4985, "step": 3688 }, { "epoch": 0.8228864599598483, "grad_norm": 0.16121259331703186, "learning_rate": 1.6641721702203196e-05, "loss": 0.4586, "step": 3689 }, { "epoch": 0.8231095248717377, "grad_norm": 0.1610342562198639, "learning_rate": 1.663996214251105e-05, "loss": 0.4781, "step": 3690 }, { "epoch": 0.823332589783627, "grad_norm": 0.17599108815193176, "learning_rate": 1.6638202215056036e-05, "loss": 0.509, "step": 3691 }, { "epoch": 0.8235556546955164, "grad_norm": 0.15026211738586426, "learning_rate": 1.6636441919935627e-05, "loss": 0.4718, "step": 3692 }, { "epoch": 0.8237787196074058, "grad_norm": 0.1567082703113556, "learning_rate": 1.6634681257247314e-05, "loss": 0.5084, "step": 3693 }, { "epoch": 0.8240017845192951, "grad_norm": 0.1553567349910736, "learning_rate": 1.6632920227088628e-05, "loss": 0.481, "step": 3694 }, { "epoch": 0.8242248494311845, "grad_norm": 0.16551898419857025, "learning_rate": 1.663115882955709e-05, "loss": 0.5145, "step": 3695 }, { "epoch": 0.8244479143430739, "grad_norm": 0.1742102950811386, "learning_rate": 1.6629397064750267e-05, "loss": 0.5073, "step": 3696 }, { "epoch": 0.8246709792549632, "grad_norm": 0.16208507120609283, "learning_rate": 1.6627634932765735e-05, "loss": 0.4637, "step": 3697 }, { "epoch": 0.8248940441668525, "grad_norm": 0.1664377748966217, "learning_rate": 1.662587243370109e-05, "loss": 0.4927, "step": 3698 }, { "epoch": 0.8251171090787419, "grad_norm": 0.16273631155490875, "learning_rate": 1.662410956765395e-05, "loss": 0.4861, "step": 3699 }, { "epoch": 0.8253401739906313, "grad_norm": 0.16939525306224823, "learning_rate": 1.6622346334721956e-05, "loss": 0.4973, "step": 3700 }, { "epoch": 0.8255632389025206, "grad_norm": 0.16064801812171936, "learning_rate": 1.6620582735002762e-05, "loss": 0.4815, "step": 3701 }, { "epoch": 0.82578630381441, "grad_norm": 0.17662890255451202, "learning_rate": 1.6618818768594058e-05, "loss": 0.4851, "step": 3702 }, { "epoch": 0.8260093687262994, "grad_norm": 0.16551104187965393, "learning_rate": 1.6617054435593535e-05, "loss": 0.4872, "step": 3703 }, { "epoch": 0.8262324336381888, "grad_norm": 0.16171106696128845, "learning_rate": 1.6615289736098912e-05, "loss": 0.4801, "step": 3704 }, { "epoch": 0.826455498550078, "grad_norm": 0.15718747675418854, "learning_rate": 1.6613524670207933e-05, "loss": 0.4687, "step": 3705 }, { "epoch": 0.8266785634619674, "grad_norm": 0.16042965650558472, "learning_rate": 1.6611759238018356e-05, "loss": 0.5026, "step": 3706 }, { "epoch": 0.8269016283738568, "grad_norm": 0.1648973822593689, "learning_rate": 1.660999343962796e-05, "loss": 0.5002, "step": 3707 }, { "epoch": 0.8271246932857461, "grad_norm": 0.16398411989212036, "learning_rate": 1.6608227275134555e-05, "loss": 0.4601, "step": 3708 }, { "epoch": 0.8273477581976355, "grad_norm": 0.16151902079582214, "learning_rate": 1.6606460744635952e-05, "loss": 0.5171, "step": 3709 }, { "epoch": 0.8275708231095249, "grad_norm": 0.15724126994609833, "learning_rate": 1.660469384823e-05, "loss": 0.4794, "step": 3710 }, { "epoch": 0.8277938880214142, "grad_norm": 0.16321682929992676, "learning_rate": 1.6602926586014555e-05, "loss": 0.4987, "step": 3711 }, { "epoch": 0.8280169529333036, "grad_norm": 0.17783483862876892, "learning_rate": 1.66011589580875e-05, "loss": 0.4795, "step": 3712 }, { "epoch": 0.828240017845193, "grad_norm": 0.16518917679786682, "learning_rate": 1.659939096454674e-05, "loss": 0.5131, "step": 3713 }, { "epoch": 0.8284630827570824, "grad_norm": 0.1636318564414978, "learning_rate": 1.6597622605490198e-05, "loss": 0.4772, "step": 3714 }, { "epoch": 0.8286861476689716, "grad_norm": 0.16061849892139435, "learning_rate": 1.6595853881015814e-05, "loss": 0.5001, "step": 3715 }, { "epoch": 0.828909212580861, "grad_norm": 0.15872059762477875, "learning_rate": 1.6594084791221554e-05, "loss": 0.4987, "step": 3716 }, { "epoch": 0.8291322774927504, "grad_norm": 0.15593315660953522, "learning_rate": 1.65923153362054e-05, "loss": 0.4752, "step": 3717 }, { "epoch": 0.8293553424046397, "grad_norm": 0.15845991671085358, "learning_rate": 1.6590545516065353e-05, "loss": 0.5206, "step": 3718 }, { "epoch": 0.8295784073165291, "grad_norm": 0.15479490160942078, "learning_rate": 1.658877533089944e-05, "loss": 0.4706, "step": 3719 }, { "epoch": 0.8298014722284185, "grad_norm": 0.16462849080562592, "learning_rate": 1.6587004780805704e-05, "loss": 0.4816, "step": 3720 }, { "epoch": 0.8300245371403079, "grad_norm": 0.17479833960533142, "learning_rate": 1.658523386588221e-05, "loss": 0.5083, "step": 3721 }, { "epoch": 0.8302476020521972, "grad_norm": 0.16387352347373962, "learning_rate": 1.658346258622704e-05, "loss": 0.4897, "step": 3722 }, { "epoch": 0.8304706669640866, "grad_norm": 0.16821224987506866, "learning_rate": 1.6581690941938307e-05, "loss": 0.4928, "step": 3723 }, { "epoch": 0.830693731875976, "grad_norm": 0.15833254158496857, "learning_rate": 1.657991893311412e-05, "loss": 0.4523, "step": 3724 }, { "epoch": 0.8309167967878652, "grad_norm": 0.15844541788101196, "learning_rate": 1.657814655985264e-05, "loss": 0.4989, "step": 3725 }, { "epoch": 0.8311398616997546, "grad_norm": 0.16405586898326874, "learning_rate": 1.657637382225202e-05, "loss": 0.512, "step": 3726 }, { "epoch": 0.831362926611644, "grad_norm": 0.16231343150138855, "learning_rate": 1.6574600720410455e-05, "loss": 0.4856, "step": 3727 }, { "epoch": 0.8315859915235333, "grad_norm": 0.16474612057209015, "learning_rate": 1.6572827254426145e-05, "loss": 0.495, "step": 3728 }, { "epoch": 0.8318090564354227, "grad_norm": 0.16462446749210358, "learning_rate": 1.6571053424397316e-05, "loss": 0.4931, "step": 3729 }, { "epoch": 0.8320321213473121, "grad_norm": 0.1530153751373291, "learning_rate": 1.6569279230422215e-05, "loss": 0.4752, "step": 3730 }, { "epoch": 0.8322551862592015, "grad_norm": 0.15689024329185486, "learning_rate": 1.656750467259911e-05, "loss": 0.4899, "step": 3731 }, { "epoch": 0.8324782511710908, "grad_norm": 0.17177921533584595, "learning_rate": 1.656572975102628e-05, "loss": 0.4941, "step": 3732 }, { "epoch": 0.8327013160829801, "grad_norm": 0.15630176663398743, "learning_rate": 1.6563954465802042e-05, "loss": 0.4768, "step": 3733 }, { "epoch": 0.8329243809948695, "grad_norm": 0.17884239554405212, "learning_rate": 1.6562178817024713e-05, "loss": 0.5076, "step": 3734 }, { "epoch": 0.8331474459067588, "grad_norm": 0.15849219262599945, "learning_rate": 1.6560402804792644e-05, "loss": 0.4948, "step": 3735 }, { "epoch": 0.8333705108186482, "grad_norm": 0.15310537815093994, "learning_rate": 1.65586264292042e-05, "loss": 0.467, "step": 3736 }, { "epoch": 0.8335935757305376, "grad_norm": 0.1629456728696823, "learning_rate": 1.6556849690357776e-05, "loss": 0.4794, "step": 3737 }, { "epoch": 0.833816640642427, "grad_norm": 0.16876748204231262, "learning_rate": 1.6555072588351765e-05, "loss": 0.497, "step": 3738 }, { "epoch": 0.8340397055543163, "grad_norm": 0.17010155320167542, "learning_rate": 1.6553295123284605e-05, "loss": 0.5199, "step": 3739 }, { "epoch": 0.8342627704662057, "grad_norm": 0.16115908324718475, "learning_rate": 1.6551517295254732e-05, "loss": 0.5094, "step": 3740 }, { "epoch": 0.8344858353780951, "grad_norm": 0.18651996552944183, "learning_rate": 1.6549739104360627e-05, "loss": 0.4963, "step": 3741 }, { "epoch": 0.8347089002899843, "grad_norm": 0.16856126487255096, "learning_rate": 1.6547960550700766e-05, "loss": 0.4729, "step": 3742 }, { "epoch": 0.8349319652018737, "grad_norm": 0.16244658827781677, "learning_rate": 1.6546181634373666e-05, "loss": 0.4964, "step": 3743 }, { "epoch": 0.8351550301137631, "grad_norm": 0.15980856120586395, "learning_rate": 1.654440235547785e-05, "loss": 0.4741, "step": 3744 }, { "epoch": 0.8353780950256524, "grad_norm": 0.15810216963291168, "learning_rate": 1.6542622714111865e-05, "loss": 0.4905, "step": 3745 }, { "epoch": 0.8356011599375418, "grad_norm": 0.15649911761283875, "learning_rate": 1.654084271037428e-05, "loss": 0.4883, "step": 3746 }, { "epoch": 0.8358242248494312, "grad_norm": 0.16376326978206635, "learning_rate": 1.653906234436368e-05, "loss": 0.4788, "step": 3747 }, { "epoch": 0.8360472897613206, "grad_norm": 0.17131556570529938, "learning_rate": 1.6537281616178674e-05, "loss": 0.5121, "step": 3748 }, { "epoch": 0.8362703546732099, "grad_norm": 0.1694851517677307, "learning_rate": 1.6535500525917893e-05, "loss": 0.5165, "step": 3749 }, { "epoch": 0.8364934195850993, "grad_norm": 0.16306069493293762, "learning_rate": 1.653371907367998e-05, "loss": 0.493, "step": 3750 }, { "epoch": 0.8367164844969887, "grad_norm": 0.1630067676305771, "learning_rate": 1.6531937259563612e-05, "loss": 0.4992, "step": 3751 }, { "epoch": 0.8369395494088779, "grad_norm": 0.15753702819347382, "learning_rate": 1.6530155083667468e-05, "loss": 0.4767, "step": 3752 }, { "epoch": 0.8371626143207673, "grad_norm": 0.1651187390089035, "learning_rate": 1.6528372546090258e-05, "loss": 0.4987, "step": 3753 }, { "epoch": 0.8373856792326567, "grad_norm": 0.1615830808877945, "learning_rate": 1.6526589646930712e-05, "loss": 0.4932, "step": 3754 }, { "epoch": 0.8376087441445461, "grad_norm": 0.1610192209482193, "learning_rate": 1.6524806386287578e-05, "loss": 0.5062, "step": 3755 }, { "epoch": 0.8378318090564354, "grad_norm": 0.16058041155338287, "learning_rate": 1.652302276425962e-05, "loss": 0.4937, "step": 3756 }, { "epoch": 0.8380548739683248, "grad_norm": 0.15800607204437256, "learning_rate": 1.6521238780945635e-05, "loss": 0.4814, "step": 3757 }, { "epoch": 0.8382779388802142, "grad_norm": 0.16751538217067719, "learning_rate": 1.6519454436444423e-05, "loss": 0.4904, "step": 3758 }, { "epoch": 0.8385010037921035, "grad_norm": 0.16399426758289337, "learning_rate": 1.651766973085482e-05, "loss": 0.5086, "step": 3759 }, { "epoch": 0.8387240687039929, "grad_norm": 0.16619375348091125, "learning_rate": 1.6515884664275663e-05, "loss": 0.5192, "step": 3760 }, { "epoch": 0.8389471336158822, "grad_norm": 0.1645645648241043, "learning_rate": 1.651409923680583e-05, "loss": 0.4894, "step": 3761 }, { "epoch": 0.8391701985277715, "grad_norm": 0.16611702740192413, "learning_rate": 1.6512313448544207e-05, "loss": 0.4858, "step": 3762 }, { "epoch": 0.8393932634396609, "grad_norm": 0.15403629839420319, "learning_rate": 1.6510527299589696e-05, "loss": 0.4844, "step": 3763 }, { "epoch": 0.8396163283515503, "grad_norm": 0.15380257368087769, "learning_rate": 1.6508740790041236e-05, "loss": 0.491, "step": 3764 }, { "epoch": 0.8398393932634397, "grad_norm": 0.1628074049949646, "learning_rate": 1.650695391999777e-05, "loss": 0.4666, "step": 3765 }, { "epoch": 0.840062458175329, "grad_norm": 0.16377408802509308, "learning_rate": 1.650516668955826e-05, "loss": 0.4695, "step": 3766 }, { "epoch": 0.8402855230872184, "grad_norm": 0.16199228167533875, "learning_rate": 1.6503379098821705e-05, "loss": 0.48, "step": 3767 }, { "epoch": 0.8405085879991078, "grad_norm": 0.17420238256454468, "learning_rate": 1.6501591147887108e-05, "loss": 0.4983, "step": 3768 }, { "epoch": 0.840731652910997, "grad_norm": 0.1635866016149521, "learning_rate": 1.649980283685349e-05, "loss": 0.4877, "step": 3769 }, { "epoch": 0.8409547178228864, "grad_norm": 0.16857163608074188, "learning_rate": 1.6498014165819908e-05, "loss": 0.4956, "step": 3770 }, { "epoch": 0.8411777827347758, "grad_norm": 0.17237527668476105, "learning_rate": 1.649622513488543e-05, "loss": 0.515, "step": 3771 }, { "epoch": 0.8414008476466652, "grad_norm": 0.1601458191871643, "learning_rate": 1.6494435744149142e-05, "loss": 0.4839, "step": 3772 }, { "epoch": 0.8416239125585545, "grad_norm": 0.1580028235912323, "learning_rate": 1.6492645993710148e-05, "loss": 0.4529, "step": 3773 }, { "epoch": 0.8418469774704439, "grad_norm": 0.16191108524799347, "learning_rate": 1.649085588366758e-05, "loss": 0.495, "step": 3774 }, { "epoch": 0.8420700423823333, "grad_norm": 0.16524077951908112, "learning_rate": 1.6489065414120583e-05, "loss": 0.4832, "step": 3775 }, { "epoch": 0.8422931072942226, "grad_norm": 0.16424553096294403, "learning_rate": 1.6487274585168327e-05, "loss": 0.4811, "step": 3776 }, { "epoch": 0.842516172206112, "grad_norm": 0.16102652251720428, "learning_rate": 1.6485483396909997e-05, "loss": 0.486, "step": 3777 }, { "epoch": 0.8427392371180014, "grad_norm": 0.16395264863967896, "learning_rate": 1.64836918494448e-05, "loss": 0.4956, "step": 3778 }, { "epoch": 0.8429623020298908, "grad_norm": 0.17286796867847443, "learning_rate": 1.6481899942871967e-05, "loss": 0.4859, "step": 3779 }, { "epoch": 0.84318536694178, "grad_norm": 0.16825571656227112, "learning_rate": 1.648010767729074e-05, "loss": 0.4873, "step": 3780 }, { "epoch": 0.8434084318536694, "grad_norm": 0.14970429241657257, "learning_rate": 1.647831505280039e-05, "loss": 0.4725, "step": 3781 }, { "epoch": 0.8436314967655588, "grad_norm": 0.1689714640378952, "learning_rate": 1.64765220695002e-05, "loss": 0.4865, "step": 3782 }, { "epoch": 0.8438545616774481, "grad_norm": 0.18679800629615784, "learning_rate": 1.647472872748948e-05, "loss": 0.5144, "step": 3783 }, { "epoch": 0.8440776265893375, "grad_norm": 0.15957824885845184, "learning_rate": 1.6472935026867555e-05, "loss": 0.4969, "step": 3784 }, { "epoch": 0.8443006915012269, "grad_norm": 0.1743471622467041, "learning_rate": 1.6471140967733772e-05, "loss": 0.4945, "step": 3785 }, { "epoch": 0.8445237564131162, "grad_norm": 0.17243602871894836, "learning_rate": 1.64693465501875e-05, "loss": 0.4963, "step": 3786 }, { "epoch": 0.8447468213250056, "grad_norm": 0.1854105442762375, "learning_rate": 1.646755177432812e-05, "loss": 0.4655, "step": 3787 }, { "epoch": 0.844969886236895, "grad_norm": 0.1698131114244461, "learning_rate": 1.6465756640255038e-05, "loss": 0.5208, "step": 3788 }, { "epoch": 0.8451929511487843, "grad_norm": 0.17987042665481567, "learning_rate": 1.6463961148067685e-05, "loss": 0.5031, "step": 3789 }, { "epoch": 0.8454160160606736, "grad_norm": 0.15817251801490784, "learning_rate": 1.6462165297865503e-05, "loss": 0.4892, "step": 3790 }, { "epoch": 0.845639080972563, "grad_norm": 0.16425161063671112, "learning_rate": 1.6460369089747956e-05, "loss": 0.4867, "step": 3791 }, { "epoch": 0.8458621458844524, "grad_norm": 0.164134219288826, "learning_rate": 1.6458572523814535e-05, "loss": 0.5063, "step": 3792 }, { "epoch": 0.8460852107963417, "grad_norm": 0.1651495099067688, "learning_rate": 1.6456775600164737e-05, "loss": 0.5168, "step": 3793 }, { "epoch": 0.8463082757082311, "grad_norm": 0.1596038043498993, "learning_rate": 1.6454978318898093e-05, "loss": 0.4948, "step": 3794 }, { "epoch": 0.8465313406201205, "grad_norm": 0.16954916715621948, "learning_rate": 1.645318068011415e-05, "loss": 0.5364, "step": 3795 }, { "epoch": 0.8467544055320099, "grad_norm": 0.16513592004776, "learning_rate": 1.6451382683912468e-05, "loss": 0.4905, "step": 3796 }, { "epoch": 0.8469774704438991, "grad_norm": 0.17096330225467682, "learning_rate": 1.6449584330392627e-05, "loss": 0.4687, "step": 3797 }, { "epoch": 0.8472005353557885, "grad_norm": 0.16846898198127747, "learning_rate": 1.644778561965424e-05, "loss": 0.4803, "step": 3798 }, { "epoch": 0.8474236002676779, "grad_norm": 0.16746586561203003, "learning_rate": 1.644598655179693e-05, "loss": 0.5015, "step": 3799 }, { "epoch": 0.8476466651795672, "grad_norm": 0.16072294116020203, "learning_rate": 1.6444187126920334e-05, "loss": 0.4993, "step": 3800 }, { "epoch": 0.8478697300914566, "grad_norm": 0.18298223614692688, "learning_rate": 1.644238734512412e-05, "loss": 0.4923, "step": 3801 }, { "epoch": 0.848092795003346, "grad_norm": 0.1531648337841034, "learning_rate": 1.6440587206507972e-05, "loss": 0.4731, "step": 3802 }, { "epoch": 0.8483158599152353, "grad_norm": 0.16385571658611298, "learning_rate": 1.6438786711171588e-05, "loss": 0.4718, "step": 3803 }, { "epoch": 0.8485389248271247, "grad_norm": 0.15524934232234955, "learning_rate": 1.6436985859214698e-05, "loss": 0.4886, "step": 3804 }, { "epoch": 0.8487619897390141, "grad_norm": 0.16199147701263428, "learning_rate": 1.643518465073704e-05, "loss": 0.5056, "step": 3805 }, { "epoch": 0.8489850546509035, "grad_norm": 0.17070268094539642, "learning_rate": 1.6433383085838378e-05, "loss": 0.5157, "step": 3806 }, { "epoch": 0.8492081195627927, "grad_norm": 0.19035401940345764, "learning_rate": 1.643158116461849e-05, "loss": 0.5135, "step": 3807 }, { "epoch": 0.8494311844746821, "grad_norm": 0.16606462001800537, "learning_rate": 1.6429778887177182e-05, "loss": 0.5058, "step": 3808 }, { "epoch": 0.8496542493865715, "grad_norm": 0.16338834166526794, "learning_rate": 1.6427976253614275e-05, "loss": 0.4914, "step": 3809 }, { "epoch": 0.8498773142984608, "grad_norm": 0.16145442426204681, "learning_rate": 1.6426173264029614e-05, "loss": 0.4999, "step": 3810 }, { "epoch": 0.8501003792103502, "grad_norm": 0.16404959559440613, "learning_rate": 1.642436991852305e-05, "loss": 0.5014, "step": 3811 }, { "epoch": 0.8503234441222396, "grad_norm": 0.16108281910419464, "learning_rate": 1.642256621719447e-05, "loss": 0.4702, "step": 3812 }, { "epoch": 0.850546509034129, "grad_norm": 0.15342526137828827, "learning_rate": 1.642076216014377e-05, "loss": 0.4686, "step": 3813 }, { "epoch": 0.8507695739460183, "grad_norm": 0.16050684452056885, "learning_rate": 1.6418957747470877e-05, "loss": 0.4729, "step": 3814 }, { "epoch": 0.8509926388579077, "grad_norm": 0.17533640563488007, "learning_rate": 1.641715297927573e-05, "loss": 0.4926, "step": 3815 }, { "epoch": 0.851215703769797, "grad_norm": 0.16416087746620178, "learning_rate": 1.641534785565828e-05, "loss": 0.513, "step": 3816 }, { "epoch": 0.8514387686816863, "grad_norm": 0.1623440384864807, "learning_rate": 1.6413542376718513e-05, "loss": 0.49, "step": 3817 }, { "epoch": 0.8516618335935757, "grad_norm": 0.1671840250492096, "learning_rate": 1.641173654255643e-05, "loss": 0.5044, "step": 3818 }, { "epoch": 0.8518848985054651, "grad_norm": 0.1557224541902542, "learning_rate": 1.640993035327204e-05, "loss": 0.4984, "step": 3819 }, { "epoch": 0.8521079634173544, "grad_norm": 0.15791340172290802, "learning_rate": 1.6408123808965392e-05, "loss": 0.4801, "step": 3820 }, { "epoch": 0.8523310283292438, "grad_norm": 0.1637008786201477, "learning_rate": 1.6406316909736536e-05, "loss": 0.4989, "step": 3821 }, { "epoch": 0.8525540932411332, "grad_norm": 0.16228072345256805, "learning_rate": 1.6404509655685555e-05, "loss": 0.4568, "step": 3822 }, { "epoch": 0.8527771581530226, "grad_norm": 0.16301074624061584, "learning_rate": 1.640270204691254e-05, "loss": 0.4741, "step": 3823 }, { "epoch": 0.8530002230649119, "grad_norm": 0.15945035219192505, "learning_rate": 1.6400894083517612e-05, "loss": 0.4885, "step": 3824 }, { "epoch": 0.8532232879768012, "grad_norm": 0.16819296777248383, "learning_rate": 1.639908576560091e-05, "loss": 0.5147, "step": 3825 }, { "epoch": 0.8534463528886906, "grad_norm": 0.16270814836025238, "learning_rate": 1.6397277093262583e-05, "loss": 0.5147, "step": 3826 }, { "epoch": 0.8536694178005799, "grad_norm": 0.16252903640270233, "learning_rate": 1.6395468066602812e-05, "loss": 0.5019, "step": 3827 }, { "epoch": 0.8538924827124693, "grad_norm": 0.16933606564998627, "learning_rate": 1.6393658685721787e-05, "loss": 0.5085, "step": 3828 }, { "epoch": 0.8541155476243587, "grad_norm": 0.17316611111164093, "learning_rate": 1.639184895071973e-05, "loss": 0.5141, "step": 3829 }, { "epoch": 0.8543386125362481, "grad_norm": 0.17270614206790924, "learning_rate": 1.6390038861696868e-05, "loss": 0.4894, "step": 3830 }, { "epoch": 0.8545616774481374, "grad_norm": 0.1740892380475998, "learning_rate": 1.638822841875346e-05, "loss": 0.5061, "step": 3831 }, { "epoch": 0.8547847423600268, "grad_norm": 0.18361349403858185, "learning_rate": 1.638641762198978e-05, "loss": 0.4868, "step": 3832 }, { "epoch": 0.8550078072719162, "grad_norm": 0.18127956986427307, "learning_rate": 1.638460647150612e-05, "loss": 0.5208, "step": 3833 }, { "epoch": 0.8552308721838054, "grad_norm": 0.1610213816165924, "learning_rate": 1.6382794967402792e-05, "loss": 0.4749, "step": 3834 }, { "epoch": 0.8554539370956948, "grad_norm": 0.16226479411125183, "learning_rate": 1.638098310978013e-05, "loss": 0.4872, "step": 3835 }, { "epoch": 0.8556770020075842, "grad_norm": 0.15979908406734467, "learning_rate": 1.6379170898738483e-05, "loss": 0.5036, "step": 3836 }, { "epoch": 0.8559000669194735, "grad_norm": 0.1549946367740631, "learning_rate": 1.6377358334378228e-05, "loss": 0.4858, "step": 3837 }, { "epoch": 0.8561231318313629, "grad_norm": 0.16230742633342743, "learning_rate": 1.6375545416799756e-05, "loss": 0.5084, "step": 3838 }, { "epoch": 0.8563461967432523, "grad_norm": 0.15997640788555145, "learning_rate": 1.6373732146103466e-05, "loss": 0.5162, "step": 3839 }, { "epoch": 0.8565692616551417, "grad_norm": 0.1616683304309845, "learning_rate": 1.6371918522389804e-05, "loss": 0.471, "step": 3840 }, { "epoch": 0.856792326567031, "grad_norm": 0.15914194285869598, "learning_rate": 1.637010454575921e-05, "loss": 0.5039, "step": 3841 }, { "epoch": 0.8570153914789204, "grad_norm": 0.16176219284534454, "learning_rate": 1.636829021631216e-05, "loss": 0.4903, "step": 3842 }, { "epoch": 0.8572384563908098, "grad_norm": 0.1591506451368332, "learning_rate": 1.636647553414914e-05, "loss": 0.4833, "step": 3843 }, { "epoch": 0.857461521302699, "grad_norm": 0.16955386102199554, "learning_rate": 1.6364660499370656e-05, "loss": 0.5088, "step": 3844 }, { "epoch": 0.8576845862145884, "grad_norm": 0.15826305747032166, "learning_rate": 1.636284511207724e-05, "loss": 0.4985, "step": 3845 }, { "epoch": 0.8579076511264778, "grad_norm": 0.16054044663906097, "learning_rate": 1.6361029372369433e-05, "loss": 0.4991, "step": 3846 }, { "epoch": 0.8581307160383672, "grad_norm": 0.16432225704193115, "learning_rate": 1.6359213280347814e-05, "loss": 0.4927, "step": 3847 }, { "epoch": 0.8583537809502565, "grad_norm": 0.17022813856601715, "learning_rate": 1.6357396836112957e-05, "loss": 0.4863, "step": 3848 }, { "epoch": 0.8585768458621459, "grad_norm": 0.15264475345611572, "learning_rate": 1.6355580039765478e-05, "loss": 0.4891, "step": 3849 }, { "epoch": 0.8587999107740353, "grad_norm": 0.1790938675403595, "learning_rate": 1.6353762891405993e-05, "loss": 0.517, "step": 3850 }, { "epoch": 0.8590229756859246, "grad_norm": 0.15989623963832855, "learning_rate": 1.6351945391135154e-05, "loss": 0.4684, "step": 3851 }, { "epoch": 0.859246040597814, "grad_norm": 0.16801711916923523, "learning_rate": 1.6350127539053626e-05, "loss": 0.4782, "step": 3852 }, { "epoch": 0.8594691055097033, "grad_norm": 0.15932013094425201, "learning_rate": 1.634830933526209e-05, "loss": 0.4805, "step": 3853 }, { "epoch": 0.8596921704215927, "grad_norm": 0.16254423558712006, "learning_rate": 1.6346490779861252e-05, "loss": 0.5063, "step": 3854 }, { "epoch": 0.859915235333482, "grad_norm": 0.1786786913871765, "learning_rate": 1.634467187295183e-05, "loss": 0.5457, "step": 3855 }, { "epoch": 0.8601383002453714, "grad_norm": 0.15343046188354492, "learning_rate": 1.6342852614634575e-05, "loss": 0.4681, "step": 3856 }, { "epoch": 0.8603613651572608, "grad_norm": 0.16010713577270508, "learning_rate": 1.634103300501024e-05, "loss": 0.5131, "step": 3857 }, { "epoch": 0.8605844300691501, "grad_norm": 0.1694883406162262, "learning_rate": 1.6339213044179612e-05, "loss": 0.5117, "step": 3858 }, { "epoch": 0.8608074949810395, "grad_norm": 0.1592147946357727, "learning_rate": 1.6337392732243488e-05, "loss": 0.5026, "step": 3859 }, { "epoch": 0.8610305598929289, "grad_norm": 0.15784434974193573, "learning_rate": 1.6335572069302694e-05, "loss": 0.4705, "step": 3860 }, { "epoch": 0.8612536248048182, "grad_norm": 0.15071116387844086, "learning_rate": 1.6333751055458065e-05, "loss": 0.4781, "step": 3861 }, { "epoch": 0.8614766897167075, "grad_norm": 0.16125838458538055, "learning_rate": 1.6331929690810464e-05, "loss": 0.5079, "step": 3862 }, { "epoch": 0.8616997546285969, "grad_norm": 0.16050268709659576, "learning_rate": 1.6330107975460764e-05, "loss": 0.4983, "step": 3863 }, { "epoch": 0.8619228195404863, "grad_norm": 0.15100105106830597, "learning_rate": 1.632828590950987e-05, "loss": 0.46, "step": 3864 }, { "epoch": 0.8621458844523756, "grad_norm": 0.17107254266738892, "learning_rate": 1.632646349305869e-05, "loss": 0.4708, "step": 3865 }, { "epoch": 0.862368949364265, "grad_norm": 0.15665309131145477, "learning_rate": 1.6324640726208172e-05, "loss": 0.4786, "step": 3866 }, { "epoch": 0.8625920142761544, "grad_norm": 0.1617649793624878, "learning_rate": 1.6322817609059267e-05, "loss": 0.5078, "step": 3867 }, { "epoch": 0.8628150791880437, "grad_norm": 0.15603747963905334, "learning_rate": 1.6320994141712948e-05, "loss": 0.4939, "step": 3868 }, { "epoch": 0.8630381440999331, "grad_norm": 0.14941559731960297, "learning_rate": 1.6319170324270212e-05, "loss": 0.4637, "step": 3869 }, { "epoch": 0.8632612090118225, "grad_norm": 0.1626354604959488, "learning_rate": 1.631734615683208e-05, "loss": 0.4715, "step": 3870 }, { "epoch": 0.8634842739237119, "grad_norm": 0.15691153705120087, "learning_rate": 1.6315521639499573e-05, "loss": 0.5017, "step": 3871 }, { "epoch": 0.8637073388356011, "grad_norm": 0.19187089800834656, "learning_rate": 1.6313696772373754e-05, "loss": 0.5369, "step": 3872 }, { "epoch": 0.8639304037474905, "grad_norm": 0.16823086142539978, "learning_rate": 1.6311871555555696e-05, "loss": 0.4892, "step": 3873 }, { "epoch": 0.8641534686593799, "grad_norm": 0.17205961048603058, "learning_rate": 1.6310045989146486e-05, "loss": 0.5472, "step": 3874 }, { "epoch": 0.8643765335712692, "grad_norm": 0.1610354632139206, "learning_rate": 1.6308220073247237e-05, "loss": 0.4887, "step": 3875 }, { "epoch": 0.8645995984831586, "grad_norm": 0.21832773089408875, "learning_rate": 1.6306393807959078e-05, "loss": 0.5081, "step": 3876 }, { "epoch": 0.864822663395048, "grad_norm": 0.17104685306549072, "learning_rate": 1.6304567193383164e-05, "loss": 0.5179, "step": 3877 }, { "epoch": 0.8650457283069373, "grad_norm": 0.15571469068527222, "learning_rate": 1.6302740229620662e-05, "loss": 0.4773, "step": 3878 }, { "epoch": 0.8652687932188267, "grad_norm": 0.16199520230293274, "learning_rate": 1.630091291677276e-05, "loss": 0.4972, "step": 3879 }, { "epoch": 0.865491858130716, "grad_norm": 0.17316444218158722, "learning_rate": 1.6299085254940664e-05, "loss": 0.4961, "step": 3880 }, { "epoch": 0.8657149230426054, "grad_norm": 0.16597145795822144, "learning_rate": 1.6297257244225602e-05, "loss": 0.4893, "step": 3881 }, { "epoch": 0.8659379879544947, "grad_norm": 0.1571418046951294, "learning_rate": 1.6295428884728827e-05, "loss": 0.4957, "step": 3882 }, { "epoch": 0.8661610528663841, "grad_norm": 0.15437713265419006, "learning_rate": 1.62936001765516e-05, "loss": 0.4912, "step": 3883 }, { "epoch": 0.8663841177782735, "grad_norm": 0.15803179144859314, "learning_rate": 1.6291771119795202e-05, "loss": 0.5012, "step": 3884 }, { "epoch": 0.8666071826901628, "grad_norm": 0.1526332050561905, "learning_rate": 1.628994171456095e-05, "loss": 0.48, "step": 3885 }, { "epoch": 0.8668302476020522, "grad_norm": 0.16080142557621002, "learning_rate": 1.628811196095016e-05, "loss": 0.4885, "step": 3886 }, { "epoch": 0.8670533125139416, "grad_norm": 0.16212552785873413, "learning_rate": 1.628628185906417e-05, "loss": 0.5049, "step": 3887 }, { "epoch": 0.867276377425831, "grad_norm": 0.1618081033229828, "learning_rate": 1.6284451409004352e-05, "loss": 0.4955, "step": 3888 }, { "epoch": 0.8674994423377203, "grad_norm": 0.15760111808776855, "learning_rate": 1.628262061087208e-05, "loss": 0.4808, "step": 3889 }, { "epoch": 0.8677225072496096, "grad_norm": 0.15690281987190247, "learning_rate": 1.6280789464768765e-05, "loss": 0.5198, "step": 3890 }, { "epoch": 0.867945572161499, "grad_norm": 0.17230992019176483, "learning_rate": 1.6278957970795818e-05, "loss": 0.4762, "step": 3891 }, { "epoch": 0.8681686370733883, "grad_norm": 0.15698418021202087, "learning_rate": 1.6277126129054687e-05, "loss": 0.4969, "step": 3892 }, { "epoch": 0.8683917019852777, "grad_norm": 0.15693382918834686, "learning_rate": 1.6275293939646822e-05, "loss": 0.4856, "step": 3893 }, { "epoch": 0.8686147668971671, "grad_norm": 0.1605808138847351, "learning_rate": 1.6273461402673706e-05, "loss": 0.4739, "step": 3894 }, { "epoch": 0.8688378318090564, "grad_norm": 0.2007637321949005, "learning_rate": 1.6271628518236836e-05, "loss": 0.5115, "step": 3895 }, { "epoch": 0.8690608967209458, "grad_norm": 0.17741620540618896, "learning_rate": 1.6269795286437728e-05, "loss": 0.5034, "step": 3896 }, { "epoch": 0.8692839616328352, "grad_norm": 0.1685972362756729, "learning_rate": 1.6267961707377923e-05, "loss": 0.5316, "step": 3897 }, { "epoch": 0.8695070265447246, "grad_norm": 0.16251643002033234, "learning_rate": 1.6266127781158965e-05, "loss": 0.5018, "step": 3898 }, { "epoch": 0.8697300914566138, "grad_norm": 0.15942230820655823, "learning_rate": 1.626429350788244e-05, "loss": 0.4799, "step": 3899 }, { "epoch": 0.8699531563685032, "grad_norm": 0.16205990314483643, "learning_rate": 1.6262458887649933e-05, "loss": 0.4721, "step": 3900 }, { "epoch": 0.8701762212803926, "grad_norm": 0.16467221081256866, "learning_rate": 1.6260623920563062e-05, "loss": 0.5137, "step": 3901 }, { "epoch": 0.8703992861922819, "grad_norm": 0.16359251737594604, "learning_rate": 1.6258788606723457e-05, "loss": 0.5063, "step": 3902 }, { "epoch": 0.8706223511041713, "grad_norm": 0.16454778611660004, "learning_rate": 1.625695294623277e-05, "loss": 0.4847, "step": 3903 }, { "epoch": 0.8708454160160607, "grad_norm": 0.1625732183456421, "learning_rate": 1.625511693919267e-05, "loss": 0.5324, "step": 3904 }, { "epoch": 0.8710684809279501, "grad_norm": 0.16343548893928528, "learning_rate": 1.625328058570485e-05, "loss": 0.5012, "step": 3905 }, { "epoch": 0.8712915458398394, "grad_norm": 0.15868264436721802, "learning_rate": 1.6251443885871013e-05, "loss": 0.4921, "step": 3906 }, { "epoch": 0.8715146107517288, "grad_norm": 0.1692667156457901, "learning_rate": 1.6249606839792897e-05, "loss": 0.4861, "step": 3907 }, { "epoch": 0.8717376756636182, "grad_norm": 0.15825539827346802, "learning_rate": 1.6247769447572235e-05, "loss": 0.4828, "step": 3908 }, { "epoch": 0.8719607405755074, "grad_norm": 0.1585260033607483, "learning_rate": 1.6245931709310806e-05, "loss": 0.4885, "step": 3909 }, { "epoch": 0.8721838054873968, "grad_norm": 0.1593310832977295, "learning_rate": 1.624409362511039e-05, "loss": 0.5158, "step": 3910 }, { "epoch": 0.8724068703992862, "grad_norm": 0.15775549411773682, "learning_rate": 1.624225519507279e-05, "loss": 0.4812, "step": 3911 }, { "epoch": 0.8726299353111755, "grad_norm": 0.16018901765346527, "learning_rate": 1.624041641929983e-05, "loss": 0.5123, "step": 3912 }, { "epoch": 0.8728530002230649, "grad_norm": 0.16512146592140198, "learning_rate": 1.6238577297893357e-05, "loss": 0.4886, "step": 3913 }, { "epoch": 0.8730760651349543, "grad_norm": 0.16052314639091492, "learning_rate": 1.6236737830955233e-05, "loss": 0.5078, "step": 3914 }, { "epoch": 0.8732991300468437, "grad_norm": 0.16342321038246155, "learning_rate": 1.6234898018587336e-05, "loss": 0.5151, "step": 3915 }, { "epoch": 0.873522194958733, "grad_norm": 0.16125398874282837, "learning_rate": 1.6233057860891566e-05, "loss": 0.4786, "step": 3916 }, { "epoch": 0.8737452598706223, "grad_norm": 0.17200647294521332, "learning_rate": 1.623121735796985e-05, "loss": 0.5013, "step": 3917 }, { "epoch": 0.8739683247825117, "grad_norm": 0.17798694968223572, "learning_rate": 1.6229376509924116e-05, "loss": 0.5257, "step": 3918 }, { "epoch": 0.874191389694401, "grad_norm": 0.15707524120807648, "learning_rate": 1.6227535316856326e-05, "loss": 0.4889, "step": 3919 }, { "epoch": 0.8744144546062904, "grad_norm": 0.16776537895202637, "learning_rate": 1.622569377886846e-05, "loss": 0.5196, "step": 3920 }, { "epoch": 0.8746375195181798, "grad_norm": 0.16779324412345886, "learning_rate": 1.622385189606251e-05, "loss": 0.512, "step": 3921 }, { "epoch": 0.8748605844300692, "grad_norm": 0.16673165559768677, "learning_rate": 1.622200966854049e-05, "loss": 0.4888, "step": 3922 }, { "epoch": 0.8750836493419585, "grad_norm": 0.18797388672828674, "learning_rate": 1.622016709640444e-05, "loss": 0.4789, "step": 3923 }, { "epoch": 0.8753067142538479, "grad_norm": 0.21027544140815735, "learning_rate": 1.621832417975641e-05, "loss": 0.4818, "step": 3924 }, { "epoch": 0.8755297791657373, "grad_norm": 0.16859135031700134, "learning_rate": 1.621648091869847e-05, "loss": 0.4949, "step": 3925 }, { "epoch": 0.8757528440776265, "grad_norm": 0.1632431447505951, "learning_rate": 1.6214637313332714e-05, "loss": 0.4869, "step": 3926 }, { "epoch": 0.8759759089895159, "grad_norm": 0.1545240879058838, "learning_rate": 1.6212793363761253e-05, "loss": 0.495, "step": 3927 }, { "epoch": 0.8761989739014053, "grad_norm": 0.162541463971138, "learning_rate": 1.621094907008621e-05, "loss": 0.507, "step": 3928 }, { "epoch": 0.8764220388132947, "grad_norm": 0.16767984628677368, "learning_rate": 1.6209104432409745e-05, "loss": 0.5098, "step": 3929 }, { "epoch": 0.876645103725184, "grad_norm": 0.17964600026607513, "learning_rate": 1.6207259450834022e-05, "loss": 0.5055, "step": 3930 }, { "epoch": 0.8768681686370734, "grad_norm": 0.16593998670578003, "learning_rate": 1.620541412546122e-05, "loss": 0.4944, "step": 3931 }, { "epoch": 0.8770912335489628, "grad_norm": 0.15954278409481049, "learning_rate": 1.6203568456393554e-05, "loss": 0.5158, "step": 3932 }, { "epoch": 0.8773142984608521, "grad_norm": 0.1770762950181961, "learning_rate": 1.620172244373324e-05, "loss": 0.4886, "step": 3933 }, { "epoch": 0.8775373633727415, "grad_norm": 0.16538506746292114, "learning_rate": 1.619987608758253e-05, "loss": 0.4563, "step": 3934 }, { "epoch": 0.8777604282846309, "grad_norm": 0.1733943372964859, "learning_rate": 1.6198029388043685e-05, "loss": 0.5131, "step": 3935 }, { "epoch": 0.8779834931965201, "grad_norm": 0.15919508039951324, "learning_rate": 1.619618234521898e-05, "loss": 0.4904, "step": 3936 }, { "epoch": 0.8782065581084095, "grad_norm": 0.1546212136745453, "learning_rate": 1.6194334959210726e-05, "loss": 0.4827, "step": 3937 }, { "epoch": 0.8784296230202989, "grad_norm": 0.16514717042446136, "learning_rate": 1.6192487230121236e-05, "loss": 0.501, "step": 3938 }, { "epoch": 0.8786526879321883, "grad_norm": 0.1485379934310913, "learning_rate": 1.6190639158052852e-05, "loss": 0.47, "step": 3939 }, { "epoch": 0.8788757528440776, "grad_norm": 0.1519124060869217, "learning_rate": 1.618879074310793e-05, "loss": 0.484, "step": 3940 }, { "epoch": 0.879098817755967, "grad_norm": 0.16282908618450165, "learning_rate": 1.618694198538885e-05, "loss": 0.5071, "step": 3941 }, { "epoch": 0.8793218826678564, "grad_norm": 0.16473978757858276, "learning_rate": 1.6185092884998e-05, "loss": 0.5203, "step": 3942 }, { "epoch": 0.8795449475797457, "grad_norm": 0.15753906965255737, "learning_rate": 1.6183243442037807e-05, "loss": 0.4989, "step": 3943 }, { "epoch": 0.879768012491635, "grad_norm": 0.1654537469148636, "learning_rate": 1.6181393656610693e-05, "loss": 0.4749, "step": 3944 }, { "epoch": 0.8799910774035244, "grad_norm": 0.17788641154766083, "learning_rate": 1.6179543528819116e-05, "loss": 0.4961, "step": 3945 }, { "epoch": 0.8802141423154138, "grad_norm": 0.16031092405319214, "learning_rate": 1.617769305876555e-05, "loss": 0.5006, "step": 3946 }, { "epoch": 0.8804372072273031, "grad_norm": 0.16216473281383514, "learning_rate": 1.6175842246552484e-05, "loss": 0.5012, "step": 3947 }, { "epoch": 0.8806602721391925, "grad_norm": 0.3736167848110199, "learning_rate": 1.6173991092282424e-05, "loss": 0.481, "step": 3948 }, { "epoch": 0.8808833370510819, "grad_norm": 0.16511203348636627, "learning_rate": 1.6172139596057902e-05, "loss": 0.4837, "step": 3949 }, { "epoch": 0.8811064019629712, "grad_norm": 0.16273178160190582, "learning_rate": 1.6170287757981468e-05, "loss": 0.494, "step": 3950 }, { "epoch": 0.8813294668748606, "grad_norm": 0.17184849083423615, "learning_rate": 1.616843557815568e-05, "loss": 0.5257, "step": 3951 }, { "epoch": 0.88155253178675, "grad_norm": 0.16385000944137573, "learning_rate": 1.6166583056683132e-05, "loss": 0.5086, "step": 3952 }, { "epoch": 0.8817755966986393, "grad_norm": 0.16628234088420868, "learning_rate": 1.6164730193666423e-05, "loss": 0.5035, "step": 3953 }, { "epoch": 0.8819986616105286, "grad_norm": 0.16098076105117798, "learning_rate": 1.616287698920818e-05, "loss": 0.4979, "step": 3954 }, { "epoch": 0.882221726522418, "grad_norm": 0.16197021305561066, "learning_rate": 1.6161023443411044e-05, "loss": 0.4883, "step": 3955 }, { "epoch": 0.8824447914343074, "grad_norm": 0.16163140535354614, "learning_rate": 1.6159169556377672e-05, "loss": 0.4899, "step": 3956 }, { "epoch": 0.8826678563461967, "grad_norm": 0.16022367775440216, "learning_rate": 1.615731532821075e-05, "loss": 0.4832, "step": 3957 }, { "epoch": 0.8828909212580861, "grad_norm": 0.16164539754390717, "learning_rate": 1.615546075901297e-05, "loss": 0.5016, "step": 3958 }, { "epoch": 0.8831139861699755, "grad_norm": 0.17649579048156738, "learning_rate": 1.615360584888706e-05, "loss": 0.4918, "step": 3959 }, { "epoch": 0.8833370510818648, "grad_norm": 0.16985541582107544, "learning_rate": 1.6151750597935746e-05, "loss": 0.4947, "step": 3960 }, { "epoch": 0.8835601159937542, "grad_norm": 0.1681845337152481, "learning_rate": 1.6149895006261788e-05, "loss": 0.5169, "step": 3961 }, { "epoch": 0.8837831809056436, "grad_norm": 0.1974424421787262, "learning_rate": 1.6148039073967964e-05, "loss": 0.5077, "step": 3962 }, { "epoch": 0.884006245817533, "grad_norm": 0.1644642949104309, "learning_rate": 1.614618280115706e-05, "loss": 0.4981, "step": 3963 }, { "epoch": 0.8842293107294222, "grad_norm": 0.16741390526294708, "learning_rate": 1.6144326187931893e-05, "loss": 0.4821, "step": 3964 }, { "epoch": 0.8844523756413116, "grad_norm": 0.17722152173519135, "learning_rate": 1.614246923439529e-05, "loss": 0.4782, "step": 3965 }, { "epoch": 0.884675440553201, "grad_norm": 0.17287255823612213, "learning_rate": 1.6140611940650104e-05, "loss": 0.4996, "step": 3966 }, { "epoch": 0.8848985054650903, "grad_norm": 0.17982237040996552, "learning_rate": 1.6138754306799206e-05, "loss": 0.4567, "step": 3967 }, { "epoch": 0.8851215703769797, "grad_norm": 0.1712944060564041, "learning_rate": 1.6136896332945474e-05, "loss": 0.4789, "step": 3968 }, { "epoch": 0.8853446352888691, "grad_norm": 0.16616348922252655, "learning_rate": 1.6135038019191823e-05, "loss": 0.509, "step": 3969 }, { "epoch": 0.8855677002007584, "grad_norm": 0.16956603527069092, "learning_rate": 1.6133179365641178e-05, "loss": 0.4906, "step": 3970 }, { "epoch": 0.8857907651126478, "grad_norm": 0.15731406211853027, "learning_rate": 1.613132037239648e-05, "loss": 0.4911, "step": 3971 }, { "epoch": 0.8860138300245372, "grad_norm": 0.16865961253643036, "learning_rate": 1.6129461039560693e-05, "loss": 0.5072, "step": 3972 }, { "epoch": 0.8862368949364265, "grad_norm": 0.16574735939502716, "learning_rate": 1.6127601367236793e-05, "loss": 0.4807, "step": 3973 }, { "epoch": 0.8864599598483158, "grad_norm": 0.16178300976753235, "learning_rate": 1.6125741355527788e-05, "loss": 0.5138, "step": 3974 }, { "epoch": 0.8866830247602052, "grad_norm": 0.163814976811409, "learning_rate": 1.6123881004536696e-05, "loss": 0.4999, "step": 3975 }, { "epoch": 0.8869060896720946, "grad_norm": 0.1648682951927185, "learning_rate": 1.612202031436655e-05, "loss": 0.4621, "step": 3976 }, { "epoch": 0.8871291545839839, "grad_norm": 0.1537158489227295, "learning_rate": 1.6120159285120417e-05, "loss": 0.4678, "step": 3977 }, { "epoch": 0.8873522194958733, "grad_norm": 0.17030583322048187, "learning_rate": 1.6118297916901357e-05, "loss": 0.5142, "step": 3978 }, { "epoch": 0.8875752844077627, "grad_norm": 0.15777969360351562, "learning_rate": 1.6116436209812476e-05, "loss": 0.4857, "step": 3979 }, { "epoch": 0.8877983493196521, "grad_norm": 0.16635335981845856, "learning_rate": 1.6114574163956883e-05, "loss": 0.4945, "step": 3980 }, { "epoch": 0.8880214142315414, "grad_norm": 0.16205914318561554, "learning_rate": 1.611271177943771e-05, "loss": 0.5013, "step": 3981 }, { "epoch": 0.8882444791434307, "grad_norm": 0.16518031060695648, "learning_rate": 1.6110849056358112e-05, "loss": 0.492, "step": 3982 }, { "epoch": 0.8884675440553201, "grad_norm": 0.17391857504844666, "learning_rate": 1.610898599482125e-05, "loss": 0.5147, "step": 3983 }, { "epoch": 0.8886906089672094, "grad_norm": 0.16679394245147705, "learning_rate": 1.610712259493032e-05, "loss": 0.5079, "step": 3984 }, { "epoch": 0.8889136738790988, "grad_norm": 0.15235210955142975, "learning_rate": 1.6105258856788525e-05, "loss": 0.4616, "step": 3985 }, { "epoch": 0.8891367387909882, "grad_norm": 0.15558315813541412, "learning_rate": 1.6103394780499088e-05, "loss": 0.5129, "step": 3986 }, { "epoch": 0.8893598037028775, "grad_norm": 0.16309772431850433, "learning_rate": 1.610153036616526e-05, "loss": 0.4981, "step": 3987 }, { "epoch": 0.8895828686147669, "grad_norm": 0.1611076146364212, "learning_rate": 1.60996656138903e-05, "loss": 0.4858, "step": 3988 }, { "epoch": 0.8898059335266563, "grad_norm": 0.16238349676132202, "learning_rate": 1.6097800523777487e-05, "loss": 0.5327, "step": 3989 }, { "epoch": 0.8900289984385457, "grad_norm": 0.1624278724193573, "learning_rate": 1.6095935095930125e-05, "loss": 0.4988, "step": 3990 }, { "epoch": 0.8902520633504349, "grad_norm": 0.16069476306438446, "learning_rate": 1.609406933045153e-05, "loss": 0.5061, "step": 3991 }, { "epoch": 0.8904751282623243, "grad_norm": 0.17323002219200134, "learning_rate": 1.6092203227445046e-05, "loss": 0.5126, "step": 3992 }, { "epoch": 0.8906981931742137, "grad_norm": 0.16106821596622467, "learning_rate": 1.6090336787014028e-05, "loss": 0.5075, "step": 3993 }, { "epoch": 0.890921258086103, "grad_norm": 0.20618608593940735, "learning_rate": 1.6088470009261846e-05, "loss": 0.4934, "step": 3994 }, { "epoch": 0.8911443229979924, "grad_norm": 0.15415968000888824, "learning_rate": 1.6086602894291895e-05, "loss": 0.4685, "step": 3995 }, { "epoch": 0.8913673879098818, "grad_norm": 0.16283094882965088, "learning_rate": 1.608473544220759e-05, "loss": 0.4893, "step": 3996 }, { "epoch": 0.8915904528217712, "grad_norm": 0.15671685338020325, "learning_rate": 1.6082867653112365e-05, "loss": 0.4866, "step": 3997 }, { "epoch": 0.8918135177336605, "grad_norm": 0.14689774811267853, "learning_rate": 1.6080999527109665e-05, "loss": 0.4596, "step": 3998 }, { "epoch": 0.8920365826455499, "grad_norm": 0.1579175591468811, "learning_rate": 1.6079131064302958e-05, "loss": 0.5135, "step": 3999 }, { "epoch": 0.8922596475574393, "grad_norm": 0.19000330567359924, "learning_rate": 1.6077262264795735e-05, "loss": 0.4955, "step": 4000 }, { "epoch": 0.8924827124693285, "grad_norm": 0.17095667123794556, "learning_rate": 1.6075393128691497e-05, "loss": 0.492, "step": 4001 }, { "epoch": 0.8927057773812179, "grad_norm": 0.1627168506383896, "learning_rate": 1.6073523656093778e-05, "loss": 0.4954, "step": 4002 }, { "epoch": 0.8929288422931073, "grad_norm": 0.16945117712020874, "learning_rate": 1.6071653847106113e-05, "loss": 0.5073, "step": 4003 }, { "epoch": 0.8931519072049967, "grad_norm": 0.16621045768260956, "learning_rate": 1.6069783701832066e-05, "loss": 0.509, "step": 4004 }, { "epoch": 0.893374972116886, "grad_norm": 0.16034524142742157, "learning_rate": 1.6067913220375216e-05, "loss": 0.4839, "step": 4005 }, { "epoch": 0.8935980370287754, "grad_norm": 0.15669786930084229, "learning_rate": 1.6066042402839163e-05, "loss": 0.4953, "step": 4006 }, { "epoch": 0.8938211019406648, "grad_norm": 0.15480557084083557, "learning_rate": 1.606417124932752e-05, "loss": 0.459, "step": 4007 }, { "epoch": 0.8940441668525541, "grad_norm": 0.15904684364795685, "learning_rate": 1.6062299759943938e-05, "loss": 0.4977, "step": 4008 }, { "epoch": 0.8942672317644434, "grad_norm": 0.16506803035736084, "learning_rate": 1.6060427934792056e-05, "loss": 0.4792, "step": 4009 }, { "epoch": 0.8944902966763328, "grad_norm": 0.16208292543888092, "learning_rate": 1.6058555773975552e-05, "loss": 0.5113, "step": 4010 }, { "epoch": 0.8947133615882221, "grad_norm": 0.15986153483390808, "learning_rate": 1.6056683277598123e-05, "loss": 0.4839, "step": 4011 }, { "epoch": 0.8949364265001115, "grad_norm": 0.17647914588451385, "learning_rate": 1.6054810445763474e-05, "loss": 0.4845, "step": 4012 }, { "epoch": 0.8951594914120009, "grad_norm": 0.16568545997142792, "learning_rate": 1.6052937278575338e-05, "loss": 0.5114, "step": 4013 }, { "epoch": 0.8953825563238903, "grad_norm": 0.16345763206481934, "learning_rate": 1.605106377613746e-05, "loss": 0.5004, "step": 4014 }, { "epoch": 0.8956056212357796, "grad_norm": 0.15674430131912231, "learning_rate": 1.6049189938553606e-05, "loss": 0.4745, "step": 4015 }, { "epoch": 0.895828686147669, "grad_norm": 0.25220704078674316, "learning_rate": 1.6047315765927566e-05, "loss": 0.4988, "step": 4016 }, { "epoch": 0.8960517510595584, "grad_norm": 0.16052506864070892, "learning_rate": 1.6045441258363138e-05, "loss": 0.468, "step": 4017 }, { "epoch": 0.8962748159714476, "grad_norm": 0.1760246306657791, "learning_rate": 1.6043566415964145e-05, "loss": 0.5213, "step": 4018 }, { "epoch": 0.896497880883337, "grad_norm": 0.16761453449726105, "learning_rate": 1.6041691238834426e-05, "loss": 0.5223, "step": 4019 }, { "epoch": 0.8967209457952264, "grad_norm": 0.1628294140100479, "learning_rate": 1.6039815727077845e-05, "loss": 0.4642, "step": 4020 }, { "epoch": 0.8969440107071158, "grad_norm": 0.16300807893276215, "learning_rate": 1.6037939880798277e-05, "loss": 0.4907, "step": 4021 }, { "epoch": 0.8971670756190051, "grad_norm": 0.16601742804050446, "learning_rate": 1.603606370009962e-05, "loss": 0.465, "step": 4022 }, { "epoch": 0.8973901405308945, "grad_norm": 0.16860070824623108, "learning_rate": 1.6034187185085783e-05, "loss": 0.4817, "step": 4023 }, { "epoch": 0.8976132054427839, "grad_norm": 0.16450795531272888, "learning_rate": 1.6032310335860706e-05, "loss": 0.4991, "step": 4024 }, { "epoch": 0.8978362703546732, "grad_norm": 0.18083292245864868, "learning_rate": 1.603043315252834e-05, "loss": 0.49, "step": 4025 }, { "epoch": 0.8980593352665626, "grad_norm": 0.16025912761688232, "learning_rate": 1.6028555635192648e-05, "loss": 0.492, "step": 4026 }, { "epoch": 0.898282400178452, "grad_norm": 0.17619994282722473, "learning_rate": 1.6026677783957626e-05, "loss": 0.5244, "step": 4027 }, { "epoch": 0.8985054650903412, "grad_norm": 0.16059327125549316, "learning_rate": 1.602479959892728e-05, "loss": 0.4875, "step": 4028 }, { "epoch": 0.8987285300022306, "grad_norm": 0.166759192943573, "learning_rate": 1.6022921080205634e-05, "loss": 0.4953, "step": 4029 }, { "epoch": 0.89895159491412, "grad_norm": 0.17170590162277222, "learning_rate": 1.602104222789673e-05, "loss": 0.5209, "step": 4030 }, { "epoch": 0.8991746598260094, "grad_norm": 0.17768704891204834, "learning_rate": 1.601916304210464e-05, "loss": 0.4861, "step": 4031 }, { "epoch": 0.8993977247378987, "grad_norm": 0.1798562854528427, "learning_rate": 1.6017283522933432e-05, "loss": 0.5049, "step": 4032 }, { "epoch": 0.8996207896497881, "grad_norm": 0.15688154101371765, "learning_rate": 1.6015403670487216e-05, "loss": 0.4527, "step": 4033 }, { "epoch": 0.8998438545616775, "grad_norm": 0.1658952832221985, "learning_rate": 1.6013523484870107e-05, "loss": 0.4687, "step": 4034 }, { "epoch": 0.9000669194735668, "grad_norm": 0.20666684210300446, "learning_rate": 1.6011642966186237e-05, "loss": 0.4883, "step": 4035 }, { "epoch": 0.9002899843854562, "grad_norm": 0.1706281453371048, "learning_rate": 1.600976211453977e-05, "loss": 0.4989, "step": 4036 }, { "epoch": 0.9005130492973455, "grad_norm": 0.16239850223064423, "learning_rate": 1.600788093003487e-05, "loss": 0.4944, "step": 4037 }, { "epoch": 0.9007361142092349, "grad_norm": 0.17332103848457336, "learning_rate": 1.6005999412775736e-05, "loss": 0.5225, "step": 4038 }, { "epoch": 0.9009591791211242, "grad_norm": 0.16437414288520813, "learning_rate": 1.600411756286657e-05, "loss": 0.5197, "step": 4039 }, { "epoch": 0.9011822440330136, "grad_norm": 0.1621236801147461, "learning_rate": 1.6002235380411614e-05, "loss": 0.4943, "step": 4040 }, { "epoch": 0.901405308944903, "grad_norm": 0.17484545707702637, "learning_rate": 1.60003528655151e-05, "loss": 0.5032, "step": 4041 }, { "epoch": 0.9016283738567923, "grad_norm": 0.16205301880836487, "learning_rate": 1.5998470018281303e-05, "loss": 0.4848, "step": 4042 }, { "epoch": 0.9018514387686817, "grad_norm": 0.15417253971099854, "learning_rate": 1.5996586838814505e-05, "loss": 0.4656, "step": 4043 }, { "epoch": 0.9020745036805711, "grad_norm": 0.15824946761131287, "learning_rate": 1.5994703327219008e-05, "loss": 0.4926, "step": 4044 }, { "epoch": 0.9022975685924604, "grad_norm": 0.17363391816616058, "learning_rate": 1.5992819483599132e-05, "loss": 0.5254, "step": 4045 }, { "epoch": 0.9025206335043497, "grad_norm": 0.16660279035568237, "learning_rate": 1.599093530805922e-05, "loss": 0.5396, "step": 4046 }, { "epoch": 0.9027436984162391, "grad_norm": 0.16047632694244385, "learning_rate": 1.5989050800703622e-05, "loss": 0.4782, "step": 4047 }, { "epoch": 0.9029667633281285, "grad_norm": 0.1667635589838028, "learning_rate": 1.5987165961636718e-05, "loss": 0.5084, "step": 4048 }, { "epoch": 0.9031898282400178, "grad_norm": 0.16391973197460175, "learning_rate": 1.5985280790962903e-05, "loss": 0.4967, "step": 4049 }, { "epoch": 0.9034128931519072, "grad_norm": 0.15799559652805328, "learning_rate": 1.598339528878659e-05, "loss": 0.4929, "step": 4050 }, { "epoch": 0.9036359580637966, "grad_norm": 0.15577222406864166, "learning_rate": 1.5981509455212207e-05, "loss": 0.4766, "step": 4051 }, { "epoch": 0.9038590229756859, "grad_norm": 0.15642912685871124, "learning_rate": 1.5979623290344207e-05, "loss": 0.4714, "step": 4052 }, { "epoch": 0.9040820878875753, "grad_norm": 0.1563492864370346, "learning_rate": 1.5977736794287057e-05, "loss": 0.4891, "step": 4053 }, { "epoch": 0.9043051527994647, "grad_norm": 0.16570942103862762, "learning_rate": 1.597584996714524e-05, "loss": 0.4857, "step": 4054 }, { "epoch": 0.904528217711354, "grad_norm": 0.16158944368362427, "learning_rate": 1.5973962809023258e-05, "loss": 0.487, "step": 4055 }, { "epoch": 0.9047512826232433, "grad_norm": 0.16446934640407562, "learning_rate": 1.5972075320025643e-05, "loss": 0.5012, "step": 4056 }, { "epoch": 0.9049743475351327, "grad_norm": 0.20039531588554382, "learning_rate": 1.597018750025693e-05, "loss": 0.5092, "step": 4057 }, { "epoch": 0.9051974124470221, "grad_norm": 0.16635029017925262, "learning_rate": 1.5968299349821678e-05, "loss": 0.5425, "step": 4058 }, { "epoch": 0.9054204773589114, "grad_norm": 0.162612184882164, "learning_rate": 1.596641086882447e-05, "loss": 0.4561, "step": 4059 }, { "epoch": 0.9056435422708008, "grad_norm": 0.15874888002872467, "learning_rate": 1.5964522057369897e-05, "loss": 0.503, "step": 4060 }, { "epoch": 0.9058666071826902, "grad_norm": 0.16984137892723083, "learning_rate": 1.596263291556257e-05, "loss": 0.4931, "step": 4061 }, { "epoch": 0.9060896720945795, "grad_norm": 0.15440818667411804, "learning_rate": 1.5960743443507128e-05, "loss": 0.4834, "step": 4062 }, { "epoch": 0.9063127370064689, "grad_norm": 0.17957209050655365, "learning_rate": 1.595885364130822e-05, "loss": 0.5049, "step": 4063 }, { "epoch": 0.9065358019183583, "grad_norm": 0.15549324452877045, "learning_rate": 1.5956963509070513e-05, "loss": 0.4733, "step": 4064 }, { "epoch": 0.9067588668302476, "grad_norm": 0.1653992384672165, "learning_rate": 1.59550730468987e-05, "loss": 0.4925, "step": 4065 }, { "epoch": 0.9069819317421369, "grad_norm": 0.1693269908428192, "learning_rate": 1.5953182254897478e-05, "loss": 0.4908, "step": 4066 }, { "epoch": 0.9072049966540263, "grad_norm": 0.16473425924777985, "learning_rate": 1.5951291133171577e-05, "loss": 0.4741, "step": 4067 }, { "epoch": 0.9074280615659157, "grad_norm": 0.16339989006519318, "learning_rate": 1.5949399681825738e-05, "loss": 0.4574, "step": 4068 }, { "epoch": 0.907651126477805, "grad_norm": 0.16389763355255127, "learning_rate": 1.5947507900964723e-05, "loss": 0.4951, "step": 4069 }, { "epoch": 0.9078741913896944, "grad_norm": 0.16346803307533264, "learning_rate": 1.594561579069331e-05, "loss": 0.4794, "step": 4070 }, { "epoch": 0.9080972563015838, "grad_norm": 0.16627097129821777, "learning_rate": 1.5943723351116293e-05, "loss": 0.4726, "step": 4071 }, { "epoch": 0.9083203212134732, "grad_norm": 0.17151188850402832, "learning_rate": 1.5941830582338488e-05, "loss": 0.5246, "step": 4072 }, { "epoch": 0.9085433861253625, "grad_norm": 0.15683230757713318, "learning_rate": 1.593993748446473e-05, "loss": 0.4771, "step": 4073 }, { "epoch": 0.9087664510372518, "grad_norm": 0.1879899501800537, "learning_rate": 1.5938044057599873e-05, "loss": 0.5102, "step": 4074 }, { "epoch": 0.9089895159491412, "grad_norm": 0.16234296560287476, "learning_rate": 1.593615030184878e-05, "loss": 0.4906, "step": 4075 }, { "epoch": 0.9092125808610305, "grad_norm": 0.15749451518058777, "learning_rate": 1.593425621731635e-05, "loss": 0.4867, "step": 4076 }, { "epoch": 0.9094356457729199, "grad_norm": 0.1519862711429596, "learning_rate": 1.593236180410748e-05, "loss": 0.4644, "step": 4077 }, { "epoch": 0.9096587106848093, "grad_norm": 0.20612524449825287, "learning_rate": 1.5930467062327096e-05, "loss": 0.5017, "step": 4078 }, { "epoch": 0.9098817755966987, "grad_norm": 0.16134196519851685, "learning_rate": 1.5928571992080142e-05, "loss": 0.4801, "step": 4079 }, { "epoch": 0.910104840508588, "grad_norm": 0.15148447453975677, "learning_rate": 1.592667659347158e-05, "loss": 0.4697, "step": 4080 }, { "epoch": 0.9103279054204774, "grad_norm": 0.1598382443189621, "learning_rate": 1.5924780866606387e-05, "loss": 0.4976, "step": 4081 }, { "epoch": 0.9105509703323668, "grad_norm": 0.15935340523719788, "learning_rate": 1.592288481158956e-05, "loss": 0.4806, "step": 4082 }, { "epoch": 0.910774035244256, "grad_norm": 0.16446979343891144, "learning_rate": 1.5920988428526117e-05, "loss": 0.4966, "step": 4083 }, { "epoch": 0.9109971001561454, "grad_norm": 0.15920083224773407, "learning_rate": 1.591909171752109e-05, "loss": 0.4917, "step": 4084 }, { "epoch": 0.9112201650680348, "grad_norm": 0.16009603440761566, "learning_rate": 1.5917194678679532e-05, "loss": 0.5169, "step": 4085 }, { "epoch": 0.9114432299799241, "grad_norm": 0.17751926183700562, "learning_rate": 1.5915297312106513e-05, "loss": 0.4804, "step": 4086 }, { "epoch": 0.9116662948918135, "grad_norm": 0.18244434893131256, "learning_rate": 1.5913399617907116e-05, "loss": 0.487, "step": 4087 }, { "epoch": 0.9118893598037029, "grad_norm": 0.15786734223365784, "learning_rate": 1.5911501596186455e-05, "loss": 0.4986, "step": 4088 }, { "epoch": 0.9121124247155923, "grad_norm": 0.15740512311458588, "learning_rate": 1.5909603247049654e-05, "loss": 0.4895, "step": 4089 }, { "epoch": 0.9123354896274816, "grad_norm": 0.1518784463405609, "learning_rate": 1.5907704570601845e-05, "loss": 0.4522, "step": 4090 }, { "epoch": 0.912558554539371, "grad_norm": 0.1830586940050125, "learning_rate": 1.59058055669482e-05, "loss": 0.513, "step": 4091 }, { "epoch": 0.9127816194512604, "grad_norm": 0.16128897666931152, "learning_rate": 1.5903906236193892e-05, "loss": 0.4706, "step": 4092 }, { "epoch": 0.9130046843631496, "grad_norm": 0.17136983573436737, "learning_rate": 1.5902006578444123e-05, "loss": 0.4867, "step": 4093 }, { "epoch": 0.913227749275039, "grad_norm": 0.15806463360786438, "learning_rate": 1.59001065938041e-05, "loss": 0.5066, "step": 4094 }, { "epoch": 0.9134508141869284, "grad_norm": 0.17267145216464996, "learning_rate": 1.5898206282379063e-05, "loss": 0.4805, "step": 4095 }, { "epoch": 0.9136738790988178, "grad_norm": 0.15600056946277618, "learning_rate": 1.5896305644274262e-05, "loss": 0.4865, "step": 4096 }, { "epoch": 0.9138969440107071, "grad_norm": 0.1608215868473053, "learning_rate": 1.5894404679594963e-05, "loss": 0.5102, "step": 4097 }, { "epoch": 0.9141200089225965, "grad_norm": 0.15523095428943634, "learning_rate": 1.5892503388446456e-05, "loss": 0.4642, "step": 4098 }, { "epoch": 0.9143430738344859, "grad_norm": 0.16346514225006104, "learning_rate": 1.589060177093405e-05, "loss": 0.4655, "step": 4099 }, { "epoch": 0.9145661387463752, "grad_norm": 0.1508352905511856, "learning_rate": 1.588869982716306e-05, "loss": 0.4826, "step": 4100 }, { "epoch": 0.9147892036582645, "grad_norm": 0.1622077375650406, "learning_rate": 1.5886797557238832e-05, "loss": 0.4876, "step": 4101 }, { "epoch": 0.9150122685701539, "grad_norm": 0.17161275446414948, "learning_rate": 1.588489496126673e-05, "loss": 0.4963, "step": 4102 }, { "epoch": 0.9152353334820432, "grad_norm": 0.15936551988124847, "learning_rate": 1.5882992039352122e-05, "loss": 0.4768, "step": 4103 }, { "epoch": 0.9154583983939326, "grad_norm": 0.1664397269487381, "learning_rate": 1.588108879160041e-05, "loss": 0.4927, "step": 4104 }, { "epoch": 0.915681463305822, "grad_norm": 0.16500705480575562, "learning_rate": 1.5879185218117012e-05, "loss": 0.4909, "step": 4105 }, { "epoch": 0.9159045282177114, "grad_norm": 0.16480299830436707, "learning_rate": 1.5877281319007352e-05, "loss": 0.468, "step": 4106 }, { "epoch": 0.9161275931296007, "grad_norm": 0.17092856764793396, "learning_rate": 1.5875377094376883e-05, "loss": 0.5188, "step": 4107 }, { "epoch": 0.9163506580414901, "grad_norm": 0.16929516196250916, "learning_rate": 1.5873472544331073e-05, "loss": 0.4932, "step": 4108 }, { "epoch": 0.9165737229533795, "grad_norm": 0.1531393676996231, "learning_rate": 1.5871567668975406e-05, "loss": 0.4512, "step": 4109 }, { "epoch": 0.9167967878652687, "grad_norm": 0.15918409824371338, "learning_rate": 1.586966246841539e-05, "loss": 0.5019, "step": 4110 }, { "epoch": 0.9170198527771581, "grad_norm": 0.19668515026569366, "learning_rate": 1.5867756942756548e-05, "loss": 0.5106, "step": 4111 }, { "epoch": 0.9172429176890475, "grad_norm": 0.15807564556598663, "learning_rate": 1.5865851092104414e-05, "loss": 0.4738, "step": 4112 }, { "epoch": 0.9174659826009369, "grad_norm": 0.16922055184841156, "learning_rate": 1.586394491656455e-05, "loss": 0.5086, "step": 4113 }, { "epoch": 0.9176890475128262, "grad_norm": 0.18552662432193756, "learning_rate": 1.586203841624253e-05, "loss": 0.5246, "step": 4114 }, { "epoch": 0.9179121124247156, "grad_norm": 0.16202445328235626, "learning_rate": 1.5860131591243945e-05, "loss": 0.5113, "step": 4115 }, { "epoch": 0.918135177336605, "grad_norm": 0.16352488100528717, "learning_rate": 1.5858224441674416e-05, "loss": 0.4739, "step": 4116 }, { "epoch": 0.9183582422484943, "grad_norm": 0.16634705662727356, "learning_rate": 1.5856316967639566e-05, "loss": 0.5221, "step": 4117 }, { "epoch": 0.9185813071603837, "grad_norm": 0.16568778455257416, "learning_rate": 1.5854409169245043e-05, "loss": 0.4476, "step": 4118 }, { "epoch": 0.9188043720722731, "grad_norm": 0.160440593957901, "learning_rate": 1.5852501046596516e-05, "loss": 0.5054, "step": 4119 }, { "epoch": 0.9190274369841623, "grad_norm": 0.1603170782327652, "learning_rate": 1.5850592599799668e-05, "loss": 0.4947, "step": 4120 }, { "epoch": 0.9192505018960517, "grad_norm": 0.16584816575050354, "learning_rate": 1.5848683828960195e-05, "loss": 0.5014, "step": 4121 }, { "epoch": 0.9194735668079411, "grad_norm": 0.1588207632303238, "learning_rate": 1.584677473418383e-05, "loss": 0.4973, "step": 4122 }, { "epoch": 0.9196966317198305, "grad_norm": 0.1574103981256485, "learning_rate": 1.5844865315576296e-05, "loss": 0.5214, "step": 4123 }, { "epoch": 0.9199196966317198, "grad_norm": 0.17080260813236237, "learning_rate": 1.584295557324336e-05, "loss": 0.5067, "step": 4124 }, { "epoch": 0.9201427615436092, "grad_norm": 0.15937574207782745, "learning_rate": 1.584104550729079e-05, "loss": 0.4971, "step": 4125 }, { "epoch": 0.9203658264554986, "grad_norm": 0.15593832731246948, "learning_rate": 1.5839135117824375e-05, "loss": 0.4844, "step": 4126 }, { "epoch": 0.9205888913673879, "grad_norm": 0.18794505298137665, "learning_rate": 1.583722440494993e-05, "loss": 0.5135, "step": 4127 }, { "epoch": 0.9208119562792773, "grad_norm": 0.183371901512146, "learning_rate": 1.5835313368773276e-05, "loss": 0.4861, "step": 4128 }, { "epoch": 0.9210350211911666, "grad_norm": 0.153816357254982, "learning_rate": 1.583340200940027e-05, "loss": 0.4921, "step": 4129 }, { "epoch": 0.921258086103056, "grad_norm": 0.16074557602405548, "learning_rate": 1.583149032693676e-05, "loss": 0.4566, "step": 4130 }, { "epoch": 0.9214811510149453, "grad_norm": 0.15913258492946625, "learning_rate": 1.5829578321488636e-05, "loss": 0.4838, "step": 4131 }, { "epoch": 0.9217042159268347, "grad_norm": 0.15266257524490356, "learning_rate": 1.58276659931618e-05, "loss": 0.4704, "step": 4132 }, { "epoch": 0.9219272808387241, "grad_norm": 0.15182676911354065, "learning_rate": 1.5825753342062155e-05, "loss": 0.4825, "step": 4133 }, { "epoch": 0.9221503457506134, "grad_norm": 0.15525878965854645, "learning_rate": 1.582384036829565e-05, "loss": 0.4636, "step": 4134 }, { "epoch": 0.9223734106625028, "grad_norm": 0.15554243326187134, "learning_rate": 1.582192707196823e-05, "loss": 0.4945, "step": 4135 }, { "epoch": 0.9225964755743922, "grad_norm": 0.15928377211093903, "learning_rate": 1.582001345318587e-05, "loss": 0.4772, "step": 4136 }, { "epoch": 0.9228195404862815, "grad_norm": 0.15914230048656464, "learning_rate": 1.581809951205455e-05, "loss": 0.4723, "step": 4137 }, { "epoch": 0.9230426053981708, "grad_norm": 0.16770337522029877, "learning_rate": 1.581618524868029e-05, "loss": 0.4579, "step": 4138 }, { "epoch": 0.9232656703100602, "grad_norm": 0.16558465361595154, "learning_rate": 1.58142706631691e-05, "loss": 0.5186, "step": 4139 }, { "epoch": 0.9234887352219496, "grad_norm": 0.16763773560523987, "learning_rate": 1.5812355755627028e-05, "loss": 0.4887, "step": 4140 }, { "epoch": 0.9237118001338389, "grad_norm": 0.1562553197145462, "learning_rate": 1.5810440526160133e-05, "loss": 0.4953, "step": 4141 }, { "epoch": 0.9239348650457283, "grad_norm": 0.1571418195962906, "learning_rate": 1.5808524974874493e-05, "loss": 0.4753, "step": 4142 }, { "epoch": 0.9241579299576177, "grad_norm": 0.17362381517887115, "learning_rate": 1.5806609101876203e-05, "loss": 0.4704, "step": 4143 }, { "epoch": 0.924380994869507, "grad_norm": 0.1619691401720047, "learning_rate": 1.580469290727138e-05, "loss": 0.476, "step": 4144 }, { "epoch": 0.9246040597813964, "grad_norm": 0.16508683562278748, "learning_rate": 1.5802776391166146e-05, "loss": 0.5118, "step": 4145 }, { "epoch": 0.9248271246932858, "grad_norm": 0.16421370208263397, "learning_rate": 1.5800859553666655e-05, "loss": 0.4876, "step": 4146 }, { "epoch": 0.9250501896051752, "grad_norm": 0.16035978496074677, "learning_rate": 1.5798942394879073e-05, "loss": 0.5048, "step": 4147 }, { "epoch": 0.9252732545170644, "grad_norm": 0.16965007781982422, "learning_rate": 1.5797024914909584e-05, "loss": 0.4932, "step": 4148 }, { "epoch": 0.9254963194289538, "grad_norm": 0.16695380210876465, "learning_rate": 1.5795107113864393e-05, "loss": 0.5188, "step": 4149 }, { "epoch": 0.9257193843408432, "grad_norm": 0.1638556569814682, "learning_rate": 1.5793188991849717e-05, "loss": 0.4764, "step": 4150 }, { "epoch": 0.9259424492527325, "grad_norm": 0.16190040111541748, "learning_rate": 1.579127054897179e-05, "loss": 0.4877, "step": 4151 }, { "epoch": 0.9261655141646219, "grad_norm": 0.16219079494476318, "learning_rate": 1.5789351785336874e-05, "loss": 0.4823, "step": 4152 }, { "epoch": 0.9263885790765113, "grad_norm": 0.15557697415351868, "learning_rate": 1.5787432701051242e-05, "loss": 0.4826, "step": 4153 }, { "epoch": 0.9266116439884007, "grad_norm": 0.15617480874061584, "learning_rate": 1.578551329622118e-05, "loss": 0.5012, "step": 4154 }, { "epoch": 0.92683470890029, "grad_norm": 0.16417647898197174, "learning_rate": 1.5783593570953e-05, "loss": 0.4692, "step": 4155 }, { "epoch": 0.9270577738121794, "grad_norm": 0.16066023707389832, "learning_rate": 1.578167352535303e-05, "loss": 0.4932, "step": 4156 }, { "epoch": 0.9272808387240687, "grad_norm": 0.15020349621772766, "learning_rate": 1.577975315952761e-05, "loss": 0.4593, "step": 4157 }, { "epoch": 0.927503903635958, "grad_norm": 0.15949909389019012, "learning_rate": 1.57778324735831e-05, "loss": 0.4952, "step": 4158 }, { "epoch": 0.9277269685478474, "grad_norm": 0.169538676738739, "learning_rate": 1.577591146762589e-05, "loss": 0.4989, "step": 4159 }, { "epoch": 0.9279500334597368, "grad_norm": 0.1595795601606369, "learning_rate": 1.5773990141762366e-05, "loss": 0.4886, "step": 4160 }, { "epoch": 0.9281730983716261, "grad_norm": 0.17213213443756104, "learning_rate": 1.577206849609895e-05, "loss": 0.4383, "step": 4161 }, { "epoch": 0.9283961632835155, "grad_norm": 0.15668156743049622, "learning_rate": 1.5770146530742075e-05, "loss": 0.4672, "step": 4162 }, { "epoch": 0.9286192281954049, "grad_norm": 0.15910851955413818, "learning_rate": 1.576822424579819e-05, "loss": 0.5136, "step": 4163 }, { "epoch": 0.9288422931072943, "grad_norm": 0.1524379998445511, "learning_rate": 1.5766301641373755e-05, "loss": 0.4752, "step": 4164 }, { "epoch": 0.9290653580191836, "grad_norm": 0.1620722860097885, "learning_rate": 1.5764378717575272e-05, "loss": 0.46, "step": 4165 }, { "epoch": 0.9292884229310729, "grad_norm": 0.16785915195941925, "learning_rate": 1.576245547450923e-05, "loss": 0.5097, "step": 4166 }, { "epoch": 0.9295114878429623, "grad_norm": 0.15819083154201508, "learning_rate": 1.5760531912282163e-05, "loss": 0.5042, "step": 4167 }, { "epoch": 0.9297345527548516, "grad_norm": 0.15982572734355927, "learning_rate": 1.57586080310006e-05, "loss": 0.4832, "step": 4168 }, { "epoch": 0.929957617666741, "grad_norm": 0.16882368922233582, "learning_rate": 1.57566838307711e-05, "loss": 0.495, "step": 4169 }, { "epoch": 0.9301806825786304, "grad_norm": 0.15450328588485718, "learning_rate": 1.575475931170024e-05, "loss": 0.4831, "step": 4170 }, { "epoch": 0.9304037474905198, "grad_norm": 0.15884611010551453, "learning_rate": 1.575283447389461e-05, "loss": 0.4447, "step": 4171 }, { "epoch": 0.9306268124024091, "grad_norm": 0.16536371409893036, "learning_rate": 1.575090931746082e-05, "loss": 0.4708, "step": 4172 }, { "epoch": 0.9308498773142985, "grad_norm": 0.16749803721904755, "learning_rate": 1.57489838425055e-05, "loss": 0.4927, "step": 4173 }, { "epoch": 0.9310729422261879, "grad_norm": 0.17306271195411682, "learning_rate": 1.5747058049135286e-05, "loss": 0.5355, "step": 4174 }, { "epoch": 0.9312960071380771, "grad_norm": 0.1781659871339798, "learning_rate": 1.5745131937456853e-05, "loss": 0.5054, "step": 4175 }, { "epoch": 0.9315190720499665, "grad_norm": 0.16959606111049652, "learning_rate": 1.5743205507576873e-05, "loss": 0.4799, "step": 4176 }, { "epoch": 0.9317421369618559, "grad_norm": 0.16488516330718994, "learning_rate": 1.5741278759602045e-05, "loss": 0.4835, "step": 4177 }, { "epoch": 0.9319652018737452, "grad_norm": 0.1639404445886612, "learning_rate": 1.5739351693639085e-05, "loss": 0.4757, "step": 4178 }, { "epoch": 0.9321882667856346, "grad_norm": 0.17086337506771088, "learning_rate": 1.573742430979473e-05, "loss": 0.4982, "step": 4179 }, { "epoch": 0.932411331697524, "grad_norm": 0.162103071808815, "learning_rate": 1.5735496608175722e-05, "loss": 0.4593, "step": 4180 }, { "epoch": 0.9326343966094134, "grad_norm": 0.1637817919254303, "learning_rate": 1.5733568588888835e-05, "loss": 0.4776, "step": 4181 }, { "epoch": 0.9328574615213027, "grad_norm": 0.1821950078010559, "learning_rate": 1.5731640252040857e-05, "loss": 0.5159, "step": 4182 }, { "epoch": 0.9330805264331921, "grad_norm": 0.21807077527046204, "learning_rate": 1.5729711597738587e-05, "loss": 0.4721, "step": 4183 }, { "epoch": 0.9333035913450815, "grad_norm": 0.1628894805908203, "learning_rate": 1.5727782626088844e-05, "loss": 0.5033, "step": 4184 }, { "epoch": 0.9335266562569707, "grad_norm": 0.158598393201828, "learning_rate": 1.5725853337198476e-05, "loss": 0.4743, "step": 4185 }, { "epoch": 0.9337497211688601, "grad_norm": 0.16272245347499847, "learning_rate": 1.5723923731174327e-05, "loss": 0.4952, "step": 4186 }, { "epoch": 0.9339727860807495, "grad_norm": 0.16067437827587128, "learning_rate": 1.5721993808123283e-05, "loss": 0.4879, "step": 4187 }, { "epoch": 0.9341958509926389, "grad_norm": 0.16409751772880554, "learning_rate": 1.5720063568152222e-05, "loss": 0.5051, "step": 4188 }, { "epoch": 0.9344189159045282, "grad_norm": 0.15659625828266144, "learning_rate": 1.5718133011368065e-05, "loss": 0.4497, "step": 4189 }, { "epoch": 0.9346419808164176, "grad_norm": 0.17328283190727234, "learning_rate": 1.5716202137877732e-05, "loss": 0.4927, "step": 4190 }, { "epoch": 0.934865045728307, "grad_norm": 0.15970586240291595, "learning_rate": 1.5714270947788168e-05, "loss": 0.4706, "step": 4191 }, { "epoch": 0.9350881106401963, "grad_norm": 0.16526605188846588, "learning_rate": 1.5712339441206335e-05, "loss": 0.5266, "step": 4192 }, { "epoch": 0.9353111755520856, "grad_norm": 0.16756314039230347, "learning_rate": 1.5710407618239215e-05, "loss": 0.4818, "step": 4193 }, { "epoch": 0.935534240463975, "grad_norm": 0.16548283398151398, "learning_rate": 1.57084754789938e-05, "loss": 0.4586, "step": 4194 }, { "epoch": 0.9357573053758643, "grad_norm": 0.1704016625881195, "learning_rate": 1.57065430235771e-05, "loss": 0.4999, "step": 4195 }, { "epoch": 0.9359803702877537, "grad_norm": 0.16027678549289703, "learning_rate": 1.5704610252096158e-05, "loss": 0.5155, "step": 4196 }, { "epoch": 0.9362034351996431, "grad_norm": 0.16171659529209137, "learning_rate": 1.5702677164658013e-05, "loss": 0.482, "step": 4197 }, { "epoch": 0.9364265001115325, "grad_norm": 0.15903575718402863, "learning_rate": 1.5700743761369735e-05, "loss": 0.4862, "step": 4198 }, { "epoch": 0.9366495650234218, "grad_norm": 0.16277752816677094, "learning_rate": 1.569881004233841e-05, "loss": 0.5011, "step": 4199 }, { "epoch": 0.9368726299353112, "grad_norm": 0.18112027645111084, "learning_rate": 1.5696876007671137e-05, "loss": 0.5124, "step": 4200 }, { "epoch": 0.9370956948472006, "grad_norm": 0.1702015995979309, "learning_rate": 1.5694941657475037e-05, "loss": 0.5233, "step": 4201 }, { "epoch": 0.9373187597590898, "grad_norm": 0.15752531588077545, "learning_rate": 1.5693006991857248e-05, "loss": 0.486, "step": 4202 }, { "epoch": 0.9375418246709792, "grad_norm": 0.16015774011611938, "learning_rate": 1.5691072010924915e-05, "loss": 0.4816, "step": 4203 }, { "epoch": 0.9377648895828686, "grad_norm": 0.17005962133407593, "learning_rate": 1.568913671478522e-05, "loss": 0.4986, "step": 4204 }, { "epoch": 0.937987954494758, "grad_norm": 0.15905898809432983, "learning_rate": 1.5687201103545343e-05, "loss": 0.5031, "step": 4205 }, { "epoch": 0.9382110194066473, "grad_norm": 0.16087795794010162, "learning_rate": 1.56852651773125e-05, "loss": 0.4957, "step": 4206 }, { "epoch": 0.9384340843185367, "grad_norm": 0.16135592758655548, "learning_rate": 1.5683328936193908e-05, "loss": 0.5094, "step": 4207 }, { "epoch": 0.9386571492304261, "grad_norm": 0.18367910385131836, "learning_rate": 1.568139238029681e-05, "loss": 0.515, "step": 4208 }, { "epoch": 0.9388802141423154, "grad_norm": 0.16034136712551117, "learning_rate": 1.5679455509728468e-05, "loss": 0.5168, "step": 4209 }, { "epoch": 0.9391032790542048, "grad_norm": 0.17193713784217834, "learning_rate": 1.567751832459615e-05, "loss": 0.5078, "step": 4210 }, { "epoch": 0.9393263439660942, "grad_norm": 0.15431612730026245, "learning_rate": 1.5675580825007158e-05, "loss": 0.4718, "step": 4211 }, { "epoch": 0.9395494088779834, "grad_norm": 0.16114309430122375, "learning_rate": 1.5673643011068796e-05, "loss": 0.4779, "step": 4212 }, { "epoch": 0.9397724737898728, "grad_norm": 0.15799474716186523, "learning_rate": 1.5671704882888396e-05, "loss": 0.5086, "step": 4213 }, { "epoch": 0.9399955387017622, "grad_norm": 0.1653827577829361, "learning_rate": 1.5669766440573302e-05, "loss": 0.488, "step": 4214 }, { "epoch": 0.9402186036136516, "grad_norm": 0.16851937770843506, "learning_rate": 1.566782768423088e-05, "loss": 0.4806, "step": 4215 }, { "epoch": 0.9404416685255409, "grad_norm": 0.17142115533351898, "learning_rate": 1.566588861396851e-05, "loss": 0.5131, "step": 4216 }, { "epoch": 0.9406647334374303, "grad_norm": 0.15409086644649506, "learning_rate": 1.5663949229893587e-05, "loss": 0.4624, "step": 4217 }, { "epoch": 0.9408877983493197, "grad_norm": 0.16042175889015198, "learning_rate": 1.566200953211353e-05, "loss": 0.4816, "step": 4218 }, { "epoch": 0.941110863261209, "grad_norm": 0.15703439712524414, "learning_rate": 1.5660069520735766e-05, "loss": 0.504, "step": 4219 }, { "epoch": 0.9413339281730984, "grad_norm": 0.16937057673931122, "learning_rate": 1.565812919586775e-05, "loss": 0.4766, "step": 4220 }, { "epoch": 0.9415569930849877, "grad_norm": 0.17867791652679443, "learning_rate": 1.565618855761695e-05, "loss": 0.4872, "step": 4221 }, { "epoch": 0.9417800579968771, "grad_norm": 0.15998175740242004, "learning_rate": 1.5654247606090846e-05, "loss": 0.4844, "step": 4222 }, { "epoch": 0.9420031229087664, "grad_norm": 0.15649773180484772, "learning_rate": 1.5652306341396943e-05, "loss": 0.4774, "step": 4223 }, { "epoch": 0.9422261878206558, "grad_norm": 0.15873044729232788, "learning_rate": 1.5650364763642764e-05, "loss": 0.5239, "step": 4224 }, { "epoch": 0.9424492527325452, "grad_norm": 0.15709929168224335, "learning_rate": 1.564842287293584e-05, "loss": 0.4875, "step": 4225 }, { "epoch": 0.9426723176444345, "grad_norm": 0.1562758833169937, "learning_rate": 1.5646480669383726e-05, "loss": 0.5132, "step": 4226 }, { "epoch": 0.9428953825563239, "grad_norm": 0.162166565656662, "learning_rate": 1.5644538153093995e-05, "loss": 0.4901, "step": 4227 }, { "epoch": 0.9431184474682133, "grad_norm": 0.1618107706308365, "learning_rate": 1.564259532417424e-05, "loss": 0.4722, "step": 4228 }, { "epoch": 0.9433415123801027, "grad_norm": 0.15450115501880646, "learning_rate": 1.5640652182732057e-05, "loss": 0.4498, "step": 4229 }, { "epoch": 0.943564577291992, "grad_norm": 0.1606828272342682, "learning_rate": 1.563870872887508e-05, "loss": 0.4798, "step": 4230 }, { "epoch": 0.9437876422038813, "grad_norm": 0.15939275920391083, "learning_rate": 1.5636764962710936e-05, "loss": 0.4773, "step": 4231 }, { "epoch": 0.9440107071157707, "grad_norm": 0.16131429374217987, "learning_rate": 1.5634820884347303e-05, "loss": 0.4861, "step": 4232 }, { "epoch": 0.94423377202766, "grad_norm": 0.15643377602100372, "learning_rate": 1.563287649389184e-05, "loss": 0.5172, "step": 4233 }, { "epoch": 0.9444568369395494, "grad_norm": 0.1604384183883667, "learning_rate": 1.5630931791452246e-05, "loss": 0.4808, "step": 4234 }, { "epoch": 0.9446799018514388, "grad_norm": 0.15961483120918274, "learning_rate": 1.5628986777136223e-05, "loss": 0.4648, "step": 4235 }, { "epoch": 0.9449029667633281, "grad_norm": 0.16450265049934387, "learning_rate": 1.562704145105151e-05, "loss": 0.5006, "step": 4236 }, { "epoch": 0.9451260316752175, "grad_norm": 0.1608218103647232, "learning_rate": 1.5625095813305847e-05, "loss": 0.4983, "step": 4237 }, { "epoch": 0.9453490965871069, "grad_norm": 0.1569865345954895, "learning_rate": 1.5623149864006993e-05, "loss": 0.4969, "step": 4238 }, { "epoch": 0.9455721614989963, "grad_norm": 0.1610192060470581, "learning_rate": 1.5621203603262727e-05, "loss": 0.4842, "step": 4239 }, { "epoch": 0.9457952264108855, "grad_norm": 0.18263404071331024, "learning_rate": 1.561925703118085e-05, "loss": 0.5053, "step": 4240 }, { "epoch": 0.9460182913227749, "grad_norm": 0.16298796236515045, "learning_rate": 1.561731014786917e-05, "loss": 0.4906, "step": 4241 }, { "epoch": 0.9462413562346643, "grad_norm": 0.16036361455917358, "learning_rate": 1.5615362953435517e-05, "loss": 0.4647, "step": 4242 }, { "epoch": 0.9464644211465536, "grad_norm": 0.16408760845661163, "learning_rate": 1.5613415447987743e-05, "loss": 0.4836, "step": 4243 }, { "epoch": 0.946687486058443, "grad_norm": 0.1685844510793686, "learning_rate": 1.5611467631633713e-05, "loss": 0.5509, "step": 4244 }, { "epoch": 0.9469105509703324, "grad_norm": 0.16026711463928223, "learning_rate": 1.5609519504481306e-05, "loss": 0.4926, "step": 4245 }, { "epoch": 0.9471336158822218, "grad_norm": 0.19063352048397064, "learning_rate": 1.560757106663843e-05, "loss": 0.4975, "step": 4246 }, { "epoch": 0.9473566807941111, "grad_norm": 0.1648816168308258, "learning_rate": 1.560562231821299e-05, "loss": 0.5142, "step": 4247 }, { "epoch": 0.9475797457060005, "grad_norm": 0.1545959860086441, "learning_rate": 1.5603673259312927e-05, "loss": 0.4862, "step": 4248 }, { "epoch": 0.9478028106178898, "grad_norm": 0.15768404304981232, "learning_rate": 1.5601723890046188e-05, "loss": 0.4708, "step": 4249 }, { "epoch": 0.9480258755297791, "grad_norm": 0.1632533222436905, "learning_rate": 1.5599774210520747e-05, "loss": 0.5113, "step": 4250 }, { "epoch": 0.9482489404416685, "grad_norm": 0.1614227592945099, "learning_rate": 1.5597824220844583e-05, "loss": 0.5081, "step": 4251 }, { "epoch": 0.9484720053535579, "grad_norm": 0.15559621155261993, "learning_rate": 1.55958739211257e-05, "loss": 0.5012, "step": 4252 }, { "epoch": 0.9486950702654472, "grad_norm": 0.16258159279823303, "learning_rate": 1.5593923311472127e-05, "loss": 0.5018, "step": 4253 }, { "epoch": 0.9489181351773366, "grad_norm": 0.15811897814273834, "learning_rate": 1.559197239199189e-05, "loss": 0.4694, "step": 4254 }, { "epoch": 0.949141200089226, "grad_norm": 0.15983614325523376, "learning_rate": 1.5590021162793047e-05, "loss": 0.4758, "step": 4255 }, { "epoch": 0.9493642650011154, "grad_norm": 0.16571156680583954, "learning_rate": 1.558806962398367e-05, "loss": 0.4952, "step": 4256 }, { "epoch": 0.9495873299130047, "grad_norm": 0.17233605682849884, "learning_rate": 1.5586117775671844e-05, "loss": 0.5252, "step": 4257 }, { "epoch": 0.949810394824894, "grad_norm": 0.17330588400363922, "learning_rate": 1.558416561796568e-05, "loss": 0.5099, "step": 4258 }, { "epoch": 0.9500334597367834, "grad_norm": 0.15399448573589325, "learning_rate": 1.5582213150973296e-05, "loss": 0.4697, "step": 4259 }, { "epoch": 0.9502565246486727, "grad_norm": 0.17017517983913422, "learning_rate": 1.5580260374802837e-05, "loss": 0.4981, "step": 4260 }, { "epoch": 0.9504795895605621, "grad_norm": 0.1677882969379425, "learning_rate": 1.5578307289562457e-05, "loss": 0.5042, "step": 4261 }, { "epoch": 0.9507026544724515, "grad_norm": 0.15946677327156067, "learning_rate": 1.557635389536033e-05, "loss": 0.4697, "step": 4262 }, { "epoch": 0.9509257193843409, "grad_norm": 1.8394982814788818, "learning_rate": 1.557440019230465e-05, "loss": 0.5449, "step": 4263 }, { "epoch": 0.9511487842962302, "grad_norm": 0.16612844169139862, "learning_rate": 1.5572446180503618e-05, "loss": 0.4826, "step": 4264 }, { "epoch": 0.9513718492081196, "grad_norm": 0.15885433554649353, "learning_rate": 1.557049186006547e-05, "loss": 0.4837, "step": 4265 }, { "epoch": 0.951594914120009, "grad_norm": 0.16099439561367035, "learning_rate": 1.5568537231098438e-05, "loss": 0.4863, "step": 4266 }, { "epoch": 0.9518179790318982, "grad_norm": 0.15327222645282745, "learning_rate": 1.5566582293710787e-05, "loss": 0.4865, "step": 4267 }, { "epoch": 0.9520410439437876, "grad_norm": 0.1621992439031601, "learning_rate": 1.5564627048010797e-05, "loss": 0.5082, "step": 4268 }, { "epoch": 0.952264108855677, "grad_norm": 0.16586080193519592, "learning_rate": 1.5562671494106756e-05, "loss": 0.5066, "step": 4269 }, { "epoch": 0.9524871737675663, "grad_norm": 0.16953197121620178, "learning_rate": 1.5560715632106976e-05, "loss": 0.5127, "step": 4270 }, { "epoch": 0.9527102386794557, "grad_norm": 0.1806926429271698, "learning_rate": 1.555875946211979e-05, "loss": 0.5091, "step": 4271 }, { "epoch": 0.9529333035913451, "grad_norm": 0.16707123816013336, "learning_rate": 1.5556802984253534e-05, "loss": 0.5048, "step": 4272 }, { "epoch": 0.9531563685032345, "grad_norm": 0.29071560502052307, "learning_rate": 1.5554846198616576e-05, "loss": 0.4998, "step": 4273 }, { "epoch": 0.9533794334151238, "grad_norm": 0.19042329490184784, "learning_rate": 1.5552889105317296e-05, "loss": 0.4883, "step": 4274 }, { "epoch": 0.9536024983270132, "grad_norm": 0.1610180288553238, "learning_rate": 1.555093170446409e-05, "loss": 0.4816, "step": 4275 }, { "epoch": 0.9538255632389026, "grad_norm": 0.16079892218112946, "learning_rate": 1.5548973996165365e-05, "loss": 0.5118, "step": 4276 }, { "epoch": 0.9540486281507918, "grad_norm": 0.1768450289964676, "learning_rate": 1.5547015980529558e-05, "loss": 0.5249, "step": 4277 }, { "epoch": 0.9542716930626812, "grad_norm": 0.1556175947189331, "learning_rate": 1.5545057657665115e-05, "loss": 0.5081, "step": 4278 }, { "epoch": 0.9544947579745706, "grad_norm": 0.16020707786083221, "learning_rate": 1.5543099027680496e-05, "loss": 0.4846, "step": 4279 }, { "epoch": 0.95471782288646, "grad_norm": 0.1518724262714386, "learning_rate": 1.554114009068419e-05, "loss": 0.4866, "step": 4280 }, { "epoch": 0.9549408877983493, "grad_norm": 0.1620427668094635, "learning_rate": 1.5539180846784686e-05, "loss": 0.5054, "step": 4281 }, { "epoch": 0.9551639527102387, "grad_norm": 0.1567380130290985, "learning_rate": 1.5537221296090506e-05, "loss": 0.4857, "step": 4282 }, { "epoch": 0.9553870176221281, "grad_norm": 0.1572207808494568, "learning_rate": 1.553526143871018e-05, "loss": 0.4803, "step": 4283 }, { "epoch": 0.9556100825340174, "grad_norm": 0.1630953699350357, "learning_rate": 1.553330127475226e-05, "loss": 0.5076, "step": 4284 }, { "epoch": 0.9558331474459068, "grad_norm": 0.16126613318920135, "learning_rate": 1.5531340804325303e-05, "loss": 0.4566, "step": 4285 }, { "epoch": 0.9560562123577961, "grad_norm": 0.16429363191127777, "learning_rate": 1.5529380027537904e-05, "loss": 0.4905, "step": 4286 }, { "epoch": 0.9562792772696854, "grad_norm": 0.16202637553215027, "learning_rate": 1.5527418944498656e-05, "loss": 0.4962, "step": 4287 }, { "epoch": 0.9565023421815748, "grad_norm": 0.16903194785118103, "learning_rate": 1.5525457555316177e-05, "loss": 0.4917, "step": 4288 }, { "epoch": 0.9567254070934642, "grad_norm": 0.16751469671726227, "learning_rate": 1.5523495860099102e-05, "loss": 0.4762, "step": 4289 }, { "epoch": 0.9569484720053536, "grad_norm": 0.16918997466564178, "learning_rate": 1.5521533858956085e-05, "loss": 0.5005, "step": 4290 }, { "epoch": 0.9571715369172429, "grad_norm": 0.268764466047287, "learning_rate": 1.551957155199579e-05, "loss": 0.4734, "step": 4291 }, { "epoch": 0.9573946018291323, "grad_norm": 0.1583995223045349, "learning_rate": 1.55176089393269e-05, "loss": 0.4776, "step": 4292 }, { "epoch": 0.9576176667410217, "grad_norm": 0.16260802745819092, "learning_rate": 1.5515646021058124e-05, "loss": 0.487, "step": 4293 }, { "epoch": 0.957840731652911, "grad_norm": 0.16519910097122192, "learning_rate": 1.5513682797298172e-05, "loss": 0.4916, "step": 4294 }, { "epoch": 0.9580637965648003, "grad_norm": 0.15430989861488342, "learning_rate": 1.551171926815579e-05, "loss": 0.4645, "step": 4295 }, { "epoch": 0.9582868614766897, "grad_norm": 0.15664781630039215, "learning_rate": 1.5509755433739723e-05, "loss": 0.4824, "step": 4296 }, { "epoch": 0.9585099263885791, "grad_norm": 0.16775622963905334, "learning_rate": 1.550779129415874e-05, "loss": 0.4993, "step": 4297 }, { "epoch": 0.9587329913004684, "grad_norm": 0.163412943482399, "learning_rate": 1.550582684952163e-05, "loss": 0.5176, "step": 4298 }, { "epoch": 0.9589560562123578, "grad_norm": 0.1565362513065338, "learning_rate": 1.5503862099937198e-05, "loss": 0.4667, "step": 4299 }, { "epoch": 0.9591791211242472, "grad_norm": 0.16091248393058777, "learning_rate": 1.550189704551426e-05, "loss": 0.4823, "step": 4300 }, { "epoch": 0.9594021860361365, "grad_norm": 0.1619873195886612, "learning_rate": 1.5499931686361658e-05, "loss": 0.4753, "step": 4301 }, { "epoch": 0.9596252509480259, "grad_norm": 0.18374527990818024, "learning_rate": 1.549796602258824e-05, "loss": 0.465, "step": 4302 }, { "epoch": 0.9598483158599153, "grad_norm": 0.16035234928131104, "learning_rate": 1.549600005430288e-05, "loss": 0.4913, "step": 4303 }, { "epoch": 0.9600713807718046, "grad_norm": 0.16635017096996307, "learning_rate": 1.549403378161447e-05, "loss": 0.4869, "step": 4304 }, { "epoch": 0.9602944456836939, "grad_norm": 0.16243912279605865, "learning_rate": 1.5492067204631908e-05, "loss": 0.4787, "step": 4305 }, { "epoch": 0.9605175105955833, "grad_norm": 0.1565091758966446, "learning_rate": 1.5490100323464118e-05, "loss": 0.4805, "step": 4306 }, { "epoch": 0.9607405755074727, "grad_norm": 0.1620541661977768, "learning_rate": 1.5488133138220038e-05, "loss": 0.5016, "step": 4307 }, { "epoch": 0.960963640419362, "grad_norm": 0.16399456560611725, "learning_rate": 1.5486165649008623e-05, "loss": 0.4977, "step": 4308 }, { "epoch": 0.9611867053312514, "grad_norm": 0.1687907576560974, "learning_rate": 1.5484197855938847e-05, "loss": 0.4966, "step": 4309 }, { "epoch": 0.9614097702431408, "grad_norm": 0.16494713723659515, "learning_rate": 1.548222975911969e-05, "loss": 0.5044, "step": 4310 }, { "epoch": 0.9616328351550301, "grad_norm": 0.16072864830493927, "learning_rate": 1.5480261358660172e-05, "loss": 0.4969, "step": 4311 }, { "epoch": 0.9618559000669195, "grad_norm": 0.1517607718706131, "learning_rate": 1.5478292654669304e-05, "loss": 0.4789, "step": 4312 }, { "epoch": 0.9620789649788088, "grad_norm": 0.15903200209140778, "learning_rate": 1.547632364725613e-05, "loss": 0.4877, "step": 4313 }, { "epoch": 0.9623020298906982, "grad_norm": 0.2090669870376587, "learning_rate": 1.5474354336529706e-05, "loss": 0.4699, "step": 4314 }, { "epoch": 0.9625250948025875, "grad_norm": 0.17890144884586334, "learning_rate": 1.5472384722599102e-05, "loss": 0.4921, "step": 4315 }, { "epoch": 0.9627481597144769, "grad_norm": 0.1629069298505783, "learning_rate": 1.547041480557341e-05, "loss": 0.5049, "step": 4316 }, { "epoch": 0.9629712246263663, "grad_norm": 0.1475018411874771, "learning_rate": 1.5468444585561736e-05, "loss": 0.4795, "step": 4317 }, { "epoch": 0.9631942895382556, "grad_norm": 0.15612593293190002, "learning_rate": 1.54664740626732e-05, "loss": 0.4402, "step": 4318 }, { "epoch": 0.963417354450145, "grad_norm": 0.15412123501300812, "learning_rate": 1.546450323701695e-05, "loss": 0.4493, "step": 4319 }, { "epoch": 0.9636404193620344, "grad_norm": 0.16448107361793518, "learning_rate": 1.5462532108702134e-05, "loss": 0.4979, "step": 4320 }, { "epoch": 0.9638634842739238, "grad_norm": 0.16529002785682678, "learning_rate": 1.546056067783793e-05, "loss": 0.5127, "step": 4321 }, { "epoch": 0.964086549185813, "grad_norm": 0.15807853639125824, "learning_rate": 1.545858894453353e-05, "loss": 0.4638, "step": 4322 }, { "epoch": 0.9643096140977024, "grad_norm": 0.16824275255203247, "learning_rate": 1.5456616908898134e-05, "loss": 0.4877, "step": 4323 }, { "epoch": 0.9645326790095918, "grad_norm": 0.15887348353862762, "learning_rate": 1.5454644571040973e-05, "loss": 0.4688, "step": 4324 }, { "epoch": 0.9647557439214811, "grad_norm": 0.16340267658233643, "learning_rate": 1.545267193107128e-05, "loss": 0.4779, "step": 4325 }, { "epoch": 0.9649788088333705, "grad_norm": 0.16982871294021606, "learning_rate": 1.545069898909832e-05, "loss": 0.5058, "step": 4326 }, { "epoch": 0.9652018737452599, "grad_norm": 0.15784378349781036, "learning_rate": 1.544872574523137e-05, "loss": 0.4563, "step": 4327 }, { "epoch": 0.9654249386571492, "grad_norm": 0.15926995873451233, "learning_rate": 1.5446752199579703e-05, "loss": 0.4893, "step": 4328 }, { "epoch": 0.9656480035690386, "grad_norm": 0.16018277406692505, "learning_rate": 1.544477835225265e-05, "loss": 0.4876, "step": 4329 }, { "epoch": 0.965871068480928, "grad_norm": 0.16839507222175598, "learning_rate": 1.544280420335951e-05, "loss": 0.5184, "step": 4330 }, { "epoch": 0.9660941333928174, "grad_norm": 0.1582273244857788, "learning_rate": 1.5440829753009646e-05, "loss": 0.4895, "step": 4331 }, { "epoch": 0.9663171983047066, "grad_norm": 0.15812306106090546, "learning_rate": 1.5438855001312402e-05, "loss": 0.5244, "step": 4332 }, { "epoch": 0.966540263216596, "grad_norm": 0.15881675481796265, "learning_rate": 1.5436879948377157e-05, "loss": 0.4541, "step": 4333 }, { "epoch": 0.9667633281284854, "grad_norm": 0.1649298518896103, "learning_rate": 1.5434904594313303e-05, "loss": 0.4898, "step": 4334 }, { "epoch": 0.9669863930403747, "grad_norm": 0.1670350432395935, "learning_rate": 1.5432928939230243e-05, "loss": 0.4961, "step": 4335 }, { "epoch": 0.9672094579522641, "grad_norm": 0.17490597069263458, "learning_rate": 1.5430952983237404e-05, "loss": 0.4996, "step": 4336 }, { "epoch": 0.9674325228641535, "grad_norm": 0.18037396669387817, "learning_rate": 1.542897672644423e-05, "loss": 0.4856, "step": 4337 }, { "epoch": 0.9676555877760429, "grad_norm": 0.16085660457611084, "learning_rate": 1.5427000168960172e-05, "loss": 0.4741, "step": 4338 }, { "epoch": 0.9678786526879322, "grad_norm": 0.16227804124355316, "learning_rate": 1.5425023310894707e-05, "loss": 0.4858, "step": 4339 }, { "epoch": 0.9681017175998216, "grad_norm": 0.21843907237052917, "learning_rate": 1.5423046152357328e-05, "loss": 0.4705, "step": 4340 }, { "epoch": 0.968324782511711, "grad_norm": 0.16902711987495422, "learning_rate": 1.542106869345754e-05, "loss": 0.4673, "step": 4341 }, { "epoch": 0.9685478474236002, "grad_norm": 0.15176647901535034, "learning_rate": 1.5419090934304865e-05, "loss": 0.477, "step": 4342 }, { "epoch": 0.9687709123354896, "grad_norm": 0.15946295857429504, "learning_rate": 1.5417112875008854e-05, "loss": 0.4768, "step": 4343 }, { "epoch": 0.968993977247379, "grad_norm": 0.1705644428730011, "learning_rate": 1.5415134515679053e-05, "loss": 0.4806, "step": 4344 }, { "epoch": 0.9692170421592683, "grad_norm": 0.1598884016275406, "learning_rate": 1.541315585642504e-05, "loss": 0.5018, "step": 4345 }, { "epoch": 0.9694401070711577, "grad_norm": 0.15806621313095093, "learning_rate": 1.54111768973564e-05, "loss": 0.4854, "step": 4346 }, { "epoch": 0.9696631719830471, "grad_norm": 0.15730160474777222, "learning_rate": 1.5409197638582753e-05, "loss": 0.499, "step": 4347 }, { "epoch": 0.9698862368949365, "grad_norm": 0.16367729008197784, "learning_rate": 1.540721808021371e-05, "loss": 0.4784, "step": 4348 }, { "epoch": 0.9701093018068258, "grad_norm": 0.16696509718894958, "learning_rate": 1.5405238222358925e-05, "loss": 0.4887, "step": 4349 }, { "epoch": 0.9703323667187151, "grad_norm": 0.1621687263250351, "learning_rate": 1.5403258065128042e-05, "loss": 0.5113, "step": 4350 }, { "epoch": 0.9705554316306045, "grad_norm": 0.1656588315963745, "learning_rate": 1.5401277608630742e-05, "loss": 0.5021, "step": 4351 }, { "epoch": 0.9707784965424938, "grad_norm": 0.17137381434440613, "learning_rate": 1.539929685297671e-05, "loss": 0.4965, "step": 4352 }, { "epoch": 0.9710015614543832, "grad_norm": 0.16810309886932373, "learning_rate": 1.5397315798275654e-05, "loss": 0.4961, "step": 4353 }, { "epoch": 0.9712246263662726, "grad_norm": 0.17059317231178284, "learning_rate": 1.5395334444637306e-05, "loss": 0.4708, "step": 4354 }, { "epoch": 0.971447691278162, "grad_norm": 0.16224807500839233, "learning_rate": 1.539335279217139e-05, "loss": 0.4593, "step": 4355 }, { "epoch": 0.9716707561900513, "grad_norm": 0.17209084331989288, "learning_rate": 1.5391370840987674e-05, "loss": 0.4959, "step": 4356 }, { "epoch": 0.9718938211019407, "grad_norm": 0.16699501872062683, "learning_rate": 1.5389388591195928e-05, "loss": 0.4897, "step": 4357 }, { "epoch": 0.9721168860138301, "grad_norm": 0.15935075283050537, "learning_rate": 1.538740604290594e-05, "loss": 0.4833, "step": 4358 }, { "epoch": 0.9723399509257193, "grad_norm": 0.16760936379432678, "learning_rate": 1.538542319622752e-05, "loss": 0.4707, "step": 4359 }, { "epoch": 0.9725630158376087, "grad_norm": 0.19774286448955536, "learning_rate": 1.5383440051270486e-05, "loss": 0.4799, "step": 4360 }, { "epoch": 0.9727860807494981, "grad_norm": 0.1727793663740158, "learning_rate": 1.5381456608144677e-05, "loss": 0.4761, "step": 4361 }, { "epoch": 0.9730091456613874, "grad_norm": 0.1611672341823578, "learning_rate": 1.5379472866959954e-05, "loss": 0.4984, "step": 4362 }, { "epoch": 0.9732322105732768, "grad_norm": 0.15909035503864288, "learning_rate": 1.537748882782618e-05, "loss": 0.487, "step": 4363 }, { "epoch": 0.9734552754851662, "grad_norm": 0.17754100263118744, "learning_rate": 1.5375504490853255e-05, "loss": 0.4782, "step": 4364 }, { "epoch": 0.9736783403970556, "grad_norm": 0.19527114927768707, "learning_rate": 1.5373519856151077e-05, "loss": 0.4783, "step": 4365 }, { "epoch": 0.9739014053089449, "grad_norm": 0.1601899415254593, "learning_rate": 1.5371534923829562e-05, "loss": 0.5042, "step": 4366 }, { "epoch": 0.9741244702208343, "grad_norm": 0.16773121058940887, "learning_rate": 1.536954969399866e-05, "loss": 0.5025, "step": 4367 }, { "epoch": 0.9743475351327237, "grad_norm": 0.1618761271238327, "learning_rate": 1.5367564166768322e-05, "loss": 0.5125, "step": 4368 }, { "epoch": 0.9745706000446129, "grad_norm": 0.1555018573999405, "learning_rate": 1.5365578342248515e-05, "loss": 0.4754, "step": 4369 }, { "epoch": 0.9747936649565023, "grad_norm": 0.16692039370536804, "learning_rate": 1.5363592220549227e-05, "loss": 0.5025, "step": 4370 }, { "epoch": 0.9750167298683917, "grad_norm": 0.16272681951522827, "learning_rate": 1.5361605801780465e-05, "loss": 0.513, "step": 4371 }, { "epoch": 0.9752397947802811, "grad_norm": 0.17153267562389374, "learning_rate": 1.535961908605225e-05, "loss": 0.4716, "step": 4372 }, { "epoch": 0.9754628596921704, "grad_norm": 0.162004753947258, "learning_rate": 1.5357632073474614e-05, "loss": 0.4752, "step": 4373 }, { "epoch": 0.9756859246040598, "grad_norm": 0.17653700709342957, "learning_rate": 1.535564476415761e-05, "loss": 0.4875, "step": 4374 }, { "epoch": 0.9759089895159492, "grad_norm": 0.16345418989658356, "learning_rate": 1.535365715821132e-05, "loss": 0.5036, "step": 4375 }, { "epoch": 0.9761320544278385, "grad_norm": 0.16799689829349518, "learning_rate": 1.535166925574581e-05, "loss": 0.5035, "step": 4376 }, { "epoch": 0.9763551193397279, "grad_norm": 0.204716295003891, "learning_rate": 1.53496810568712e-05, "loss": 0.501, "step": 4377 }, { "epoch": 0.9765781842516172, "grad_norm": 0.15521782636642456, "learning_rate": 1.53476925616976e-05, "loss": 0.4864, "step": 4378 }, { "epoch": 0.9768012491635066, "grad_norm": 0.16439463198184967, "learning_rate": 1.5345703770335147e-05, "loss": 0.491, "step": 4379 }, { "epoch": 0.9770243140753959, "grad_norm": 0.16255317628383636, "learning_rate": 1.5343714682893997e-05, "loss": 0.5108, "step": 4380 }, { "epoch": 0.9772473789872853, "grad_norm": 0.15555396676063538, "learning_rate": 1.534172529948431e-05, "loss": 0.4506, "step": 4381 }, { "epoch": 0.9774704438991747, "grad_norm": 0.16245706379413605, "learning_rate": 1.5339735620216275e-05, "loss": 0.4892, "step": 4382 }, { "epoch": 0.977693508811064, "grad_norm": 0.16190017759799957, "learning_rate": 1.5337745645200097e-05, "loss": 0.4716, "step": 4383 }, { "epoch": 0.9779165737229534, "grad_norm": 0.1614699810743332, "learning_rate": 1.5335755374545985e-05, "loss": 0.5041, "step": 4384 }, { "epoch": 0.9781396386348428, "grad_norm": 0.1543981432914734, "learning_rate": 1.533376480836418e-05, "loss": 0.4837, "step": 4385 }, { "epoch": 0.978362703546732, "grad_norm": 0.15510593354701996, "learning_rate": 1.5331773946764928e-05, "loss": 0.5073, "step": 4386 }, { "epoch": 0.9785857684586214, "grad_norm": 0.16036750376224518, "learning_rate": 1.5329782789858495e-05, "loss": 0.511, "step": 4387 }, { "epoch": 0.9788088333705108, "grad_norm": 0.1586884707212448, "learning_rate": 1.532779133775517e-05, "loss": 0.4783, "step": 4388 }, { "epoch": 0.9790318982824002, "grad_norm": 0.1595693677663803, "learning_rate": 1.5325799590565247e-05, "loss": 0.4847, "step": 4389 }, { "epoch": 0.9792549631942895, "grad_norm": 0.1676463484764099, "learning_rate": 1.532380754839904e-05, "loss": 0.4857, "step": 4390 }, { "epoch": 0.9794780281061789, "grad_norm": 0.1569330245256424, "learning_rate": 1.532181521136688e-05, "loss": 0.5038, "step": 4391 }, { "epoch": 0.9797010930180683, "grad_norm": 0.15425747632980347, "learning_rate": 1.5319822579579125e-05, "loss": 0.4687, "step": 4392 }, { "epoch": 0.9799241579299576, "grad_norm": 0.1658424586057663, "learning_rate": 1.5317829653146127e-05, "loss": 0.4727, "step": 4393 }, { "epoch": 0.980147222841847, "grad_norm": 0.16950277984142303, "learning_rate": 1.5315836432178275e-05, "loss": 0.5056, "step": 4394 }, { "epoch": 0.9803702877537364, "grad_norm": 0.1796022653579712, "learning_rate": 1.5313842916785965e-05, "loss": 0.5062, "step": 4395 }, { "epoch": 0.9805933526656258, "grad_norm": 0.16499702632427216, "learning_rate": 1.5311849107079603e-05, "loss": 0.5142, "step": 4396 }, { "epoch": 0.980816417577515, "grad_norm": 0.1709393560886383, "learning_rate": 1.5309855003169632e-05, "loss": 0.5001, "step": 4397 }, { "epoch": 0.9810394824894044, "grad_norm": 0.16077296435832977, "learning_rate": 1.5307860605166487e-05, "loss": 0.4908, "step": 4398 }, { "epoch": 0.9812625474012938, "grad_norm": 0.17582206428050995, "learning_rate": 1.5305865913180633e-05, "loss": 0.4844, "step": 4399 }, { "epoch": 0.9814856123131831, "grad_norm": 0.16168825328350067, "learning_rate": 1.5303870927322552e-05, "loss": 0.4856, "step": 4400 }, { "epoch": 0.9817086772250725, "grad_norm": 0.15929754078388214, "learning_rate": 1.5301875647702732e-05, "loss": 0.4878, "step": 4401 }, { "epoch": 0.9819317421369619, "grad_norm": 0.16380812227725983, "learning_rate": 1.5299880074431693e-05, "loss": 0.5009, "step": 4402 }, { "epoch": 0.9821548070488512, "grad_norm": 0.17140789330005646, "learning_rate": 1.5297884207619957e-05, "loss": 0.4604, "step": 4403 }, { "epoch": 0.9823778719607406, "grad_norm": 0.17358756065368652, "learning_rate": 1.5295888047378064e-05, "loss": 0.4905, "step": 4404 }, { "epoch": 0.98260093687263, "grad_norm": 0.16201473772525787, "learning_rate": 1.5293891593816583e-05, "loss": 0.5156, "step": 4405 }, { "epoch": 0.9828240017845193, "grad_norm": 0.16091175377368927, "learning_rate": 1.529189484704608e-05, "loss": 0.4883, "step": 4406 }, { "epoch": 0.9830470666964086, "grad_norm": 0.16278314590454102, "learning_rate": 1.528989780717716e-05, "loss": 0.4727, "step": 4407 }, { "epoch": 0.983270131608298, "grad_norm": 0.17446744441986084, "learning_rate": 1.5287900474320422e-05, "loss": 0.4824, "step": 4408 }, { "epoch": 0.9834931965201874, "grad_norm": 0.3287087678909302, "learning_rate": 1.5285902848586495e-05, "loss": 0.49, "step": 4409 }, { "epoch": 0.9837162614320767, "grad_norm": 0.16612185537815094, "learning_rate": 1.5283904930086017e-05, "loss": 0.4911, "step": 4410 }, { "epoch": 0.9839393263439661, "grad_norm": 0.16220030188560486, "learning_rate": 1.528190671892965e-05, "loss": 0.4537, "step": 4411 }, { "epoch": 0.9841623912558555, "grad_norm": 0.1631203591823578, "learning_rate": 1.5279908215228058e-05, "loss": 0.4725, "step": 4412 }, { "epoch": 0.9843854561677449, "grad_norm": 0.15897974371910095, "learning_rate": 1.5277909419091942e-05, "loss": 0.4794, "step": 4413 }, { "epoch": 0.9846085210796341, "grad_norm": 0.18077795207500458, "learning_rate": 1.5275910330632e-05, "loss": 0.4764, "step": 4414 }, { "epoch": 0.9848315859915235, "grad_norm": 0.1682848483324051, "learning_rate": 1.5273910949958963e-05, "loss": 0.5024, "step": 4415 }, { "epoch": 0.9850546509034129, "grad_norm": 0.1584632247686386, "learning_rate": 1.527191127718356e-05, "loss": 0.5007, "step": 4416 }, { "epoch": 0.9852777158153022, "grad_norm": 0.1735336184501648, "learning_rate": 1.5269911312416547e-05, "loss": 0.4887, "step": 4417 }, { "epoch": 0.9855007807271916, "grad_norm": 0.16184300184249878, "learning_rate": 1.5267911055768697e-05, "loss": 0.5064, "step": 4418 }, { "epoch": 0.985723845639081, "grad_norm": 0.15283328294754028, "learning_rate": 1.5265910507350797e-05, "loss": 0.5012, "step": 4419 }, { "epoch": 0.9859469105509703, "grad_norm": 0.1493072211742401, "learning_rate": 1.526390966727365e-05, "loss": 0.4763, "step": 4420 }, { "epoch": 0.9861699754628597, "grad_norm": 0.15723635256290436, "learning_rate": 1.526190853564807e-05, "loss": 0.4893, "step": 4421 }, { "epoch": 0.9863930403747491, "grad_norm": 0.17012642323970795, "learning_rate": 1.52599071125849e-05, "loss": 0.4905, "step": 4422 }, { "epoch": 0.9866161052866385, "grad_norm": 0.15649664402008057, "learning_rate": 1.5257905398194988e-05, "loss": 0.4841, "step": 4423 }, { "epoch": 0.9868391701985277, "grad_norm": 0.1696402132511139, "learning_rate": 1.5255903392589204e-05, "loss": 0.4792, "step": 4424 }, { "epoch": 0.9870622351104171, "grad_norm": 0.17853881418704987, "learning_rate": 1.5253901095878423e-05, "loss": 0.5112, "step": 4425 }, { "epoch": 0.9872853000223065, "grad_norm": 0.1598142683506012, "learning_rate": 1.5251898508173558e-05, "loss": 0.4907, "step": 4426 }, { "epoch": 0.9875083649341958, "grad_norm": 0.1565473973751068, "learning_rate": 1.5249895629585511e-05, "loss": 0.4922, "step": 4427 }, { "epoch": 0.9877314298460852, "grad_norm": 0.16049052774906158, "learning_rate": 1.5247892460225226e-05, "loss": 0.4947, "step": 4428 }, { "epoch": 0.9879544947579746, "grad_norm": 0.16537557542324066, "learning_rate": 1.5245889000203644e-05, "loss": 0.4547, "step": 4429 }, { "epoch": 0.988177559669864, "grad_norm": 0.16105584800243378, "learning_rate": 1.5243885249631732e-05, "loss": 0.4697, "step": 4430 }, { "epoch": 0.9884006245817533, "grad_norm": 0.5197194814682007, "learning_rate": 1.5241881208620468e-05, "loss": 0.4723, "step": 4431 }, { "epoch": 0.9886236894936427, "grad_norm": 0.15987588465213776, "learning_rate": 1.5239876877280852e-05, "loss": 0.5388, "step": 4432 }, { "epoch": 0.988846754405532, "grad_norm": 0.16223685443401337, "learning_rate": 1.5237872255723894e-05, "loss": 0.5185, "step": 4433 }, { "epoch": 0.9890698193174213, "grad_norm": 0.16801123321056366, "learning_rate": 1.5235867344060622e-05, "loss": 0.4825, "step": 4434 }, { "epoch": 0.9892928842293107, "grad_norm": 0.16588987410068512, "learning_rate": 1.523386214240208e-05, "loss": 0.4933, "step": 4435 }, { "epoch": 0.9895159491412001, "grad_norm": 0.1642870157957077, "learning_rate": 1.5231856650859334e-05, "loss": 0.4572, "step": 4436 }, { "epoch": 0.9897390140530894, "grad_norm": 0.1569276601076126, "learning_rate": 1.5229850869543454e-05, "loss": 0.4844, "step": 4437 }, { "epoch": 0.9899620789649788, "grad_norm": 0.16232363879680634, "learning_rate": 1.5227844798565538e-05, "loss": 0.5041, "step": 4438 }, { "epoch": 0.9901851438768682, "grad_norm": 0.18178559839725494, "learning_rate": 1.5225838438036693e-05, "loss": 0.5203, "step": 4439 }, { "epoch": 0.9904082087887576, "grad_norm": 0.1483062505722046, "learning_rate": 1.5223831788068039e-05, "loss": 0.4601, "step": 4440 }, { "epoch": 0.9906312737006469, "grad_norm": 0.16904570162296295, "learning_rate": 1.5221824848770728e-05, "loss": 0.4692, "step": 4441 }, { "epoch": 0.9908543386125362, "grad_norm": 0.16170957684516907, "learning_rate": 1.5219817620255906e-05, "loss": 0.476, "step": 4442 }, { "epoch": 0.9910774035244256, "grad_norm": 0.15581966936588287, "learning_rate": 1.521781010263475e-05, "loss": 0.4685, "step": 4443 }, { "epoch": 0.9913004684363149, "grad_norm": 0.17382118105888367, "learning_rate": 1.521580229601845e-05, "loss": 0.511, "step": 4444 }, { "epoch": 0.9915235333482043, "grad_norm": 0.16696421802043915, "learning_rate": 1.521379420051821e-05, "loss": 0.5072, "step": 4445 }, { "epoch": 0.9917465982600937, "grad_norm": 0.16701167821884155, "learning_rate": 1.521178581624525e-05, "loss": 0.521, "step": 4446 }, { "epoch": 0.9919696631719831, "grad_norm": 0.16650201380252838, "learning_rate": 1.520977714331081e-05, "loss": 0.4945, "step": 4447 }, { "epoch": 0.9921927280838724, "grad_norm": 0.1575930416584015, "learning_rate": 1.5207768181826138e-05, "loss": 0.4668, "step": 4448 }, { "epoch": 0.9924157929957618, "grad_norm": 0.16714414954185486, "learning_rate": 1.5205758931902507e-05, "loss": 0.4739, "step": 4449 }, { "epoch": 0.9926388579076512, "grad_norm": 0.1591663658618927, "learning_rate": 1.5203749393651204e-05, "loss": 0.5024, "step": 4450 }, { "epoch": 0.9928619228195404, "grad_norm": 0.16330404579639435, "learning_rate": 1.5201739567183525e-05, "loss": 0.5104, "step": 4451 }, { "epoch": 0.9930849877314298, "grad_norm": 0.16072112321853638, "learning_rate": 1.519972945261079e-05, "loss": 0.4722, "step": 4452 }, { "epoch": 0.9933080526433192, "grad_norm": 0.1594439148902893, "learning_rate": 1.5197719050044328e-05, "loss": 0.4951, "step": 4453 }, { "epoch": 0.9935311175552086, "grad_norm": 0.17731733620166779, "learning_rate": 1.519570835959549e-05, "loss": 0.4799, "step": 4454 }, { "epoch": 0.9937541824670979, "grad_norm": 0.1658770889043808, "learning_rate": 1.5193697381375641e-05, "loss": 0.471, "step": 4455 }, { "epoch": 0.9939772473789873, "grad_norm": 0.168108731508255, "learning_rate": 1.5191686115496163e-05, "loss": 0.4856, "step": 4456 }, { "epoch": 0.9942003122908767, "grad_norm": 0.1631343513727188, "learning_rate": 1.5189674562068448e-05, "loss": 0.4795, "step": 4457 }, { "epoch": 0.994423377202766, "grad_norm": 0.17591005563735962, "learning_rate": 1.5187662721203916e-05, "loss": 0.4431, "step": 4458 }, { "epoch": 0.9946464421146554, "grad_norm": 0.16236771643161774, "learning_rate": 1.5185650593013984e-05, "loss": 0.5097, "step": 4459 }, { "epoch": 0.9948695070265448, "grad_norm": 0.15711717307567596, "learning_rate": 1.5183638177610109e-05, "loss": 0.4896, "step": 4460 }, { "epoch": 0.995092571938434, "grad_norm": 0.16357070207595825, "learning_rate": 1.5181625475103744e-05, "loss": 0.4878, "step": 4461 }, { "epoch": 0.9953156368503234, "grad_norm": 0.1608823835849762, "learning_rate": 1.5179612485606366e-05, "loss": 0.4924, "step": 4462 }, { "epoch": 0.9955387017622128, "grad_norm": 0.1610344648361206, "learning_rate": 1.5177599209229468e-05, "loss": 0.4564, "step": 4463 }, { "epoch": 0.9957617666741022, "grad_norm": 0.1570533961057663, "learning_rate": 1.5175585646084557e-05, "loss": 0.4665, "step": 4464 }, { "epoch": 0.9959848315859915, "grad_norm": 0.16878454387187958, "learning_rate": 1.5173571796283155e-05, "loss": 0.4832, "step": 4465 }, { "epoch": 0.9962078964978809, "grad_norm": 0.15934909880161285, "learning_rate": 1.5171557659936806e-05, "loss": 0.4629, "step": 4466 }, { "epoch": 0.9964309614097703, "grad_norm": 0.17097942531108856, "learning_rate": 1.5169543237157062e-05, "loss": 0.5009, "step": 4467 }, { "epoch": 0.9966540263216596, "grad_norm": 0.1563706398010254, "learning_rate": 1.5167528528055498e-05, "loss": 0.4864, "step": 4468 }, { "epoch": 0.996877091233549, "grad_norm": 0.16761906445026398, "learning_rate": 1.5165513532743696e-05, "loss": 0.4688, "step": 4469 }, { "epoch": 0.9971001561454383, "grad_norm": 0.16788896918296814, "learning_rate": 1.5163498251333267e-05, "loss": 0.4975, "step": 4470 }, { "epoch": 0.9973232210573277, "grad_norm": 0.16318921744823456, "learning_rate": 1.516148268393582e-05, "loss": 0.483, "step": 4471 }, { "epoch": 0.997546285969217, "grad_norm": 0.1533585637807846, "learning_rate": 1.5159466830662997e-05, "loss": 0.4888, "step": 4472 }, { "epoch": 0.9977693508811064, "grad_norm": 0.16081759333610535, "learning_rate": 1.515745069162645e-05, "loss": 0.4997, "step": 4473 }, { "epoch": 0.9979924157929958, "grad_norm": 0.17346571385860443, "learning_rate": 1.5155434266937836e-05, "loss": 0.4918, "step": 4474 }, { "epoch": 0.9982154807048851, "grad_norm": 0.16051939129829407, "learning_rate": 1.515341755670885e-05, "loss": 0.4911, "step": 4475 }, { "epoch": 0.9984385456167745, "grad_norm": 0.15489095449447632, "learning_rate": 1.5151400561051177e-05, "loss": 0.4573, "step": 4476 }, { "epoch": 0.9986616105286639, "grad_norm": 0.17051677405834198, "learning_rate": 1.5149383280076544e-05, "loss": 0.47, "step": 4477 }, { "epoch": 0.9988846754405531, "grad_norm": 0.16898037493228912, "learning_rate": 1.5147365713896669e-05, "loss": 0.5265, "step": 4478 }, { "epoch": 0.9991077403524425, "grad_norm": 0.1665852814912796, "learning_rate": 1.5145347862623303e-05, "loss": 0.4967, "step": 4479 }, { "epoch": 0.9993308052643319, "grad_norm": 0.16627469658851624, "learning_rate": 1.5143329726368205e-05, "loss": 0.4755, "step": 4480 }, { "epoch": 0.9995538701762213, "grad_norm": 0.15166249871253967, "learning_rate": 1.5141311305243158e-05, "loss": 0.4762, "step": 4481 }, { "epoch": 0.9997769350881106, "grad_norm": 0.15809613466262817, "learning_rate": 1.513929259935995e-05, "loss": 0.4825, "step": 4482 }, { "epoch": 1.0, "grad_norm": 0.24375846982002258, "learning_rate": 1.5137273608830387e-05, "loss": 0.4902, "step": 4483 }, { "epoch": 1.0, "eval_loss": 0.3229251801967621, "eval_runtime": 666.2343, "eval_samples_per_second": 94.726, "eval_steps_per_second": 1.481, "step": 4483 } ], "logging_steps": 1, "max_steps": 13449, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.4381131957442655e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }