| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 711, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004232804232804233, | |
| "grad_norm": 54.30406078717587, | |
| "learning_rate": 0.0, | |
| "loss": 1.2873, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.008465608465608466, | |
| "grad_norm": 56.84874016239752, | |
| "learning_rate": 2.0833333333333333e-07, | |
| "loss": 1.278, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.012698412698412698, | |
| "grad_norm": 54.878082362253245, | |
| "learning_rate": 4.1666666666666667e-07, | |
| "loss": 1.2261, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.016931216931216932, | |
| "grad_norm": 54.66242016872322, | |
| "learning_rate": 6.25e-07, | |
| "loss": 1.1957, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.021164021164021163, | |
| "grad_norm": 54.63578365153812, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 1.3671, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.025396825396825397, | |
| "grad_norm": 52.69479916022209, | |
| "learning_rate": 1.0416666666666667e-06, | |
| "loss": 1.0852, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.02962962962962963, | |
| "grad_norm": 51.80015106662938, | |
| "learning_rate": 1.25e-06, | |
| "loss": 1.1174, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.033862433862433865, | |
| "grad_norm": 46.94049102872614, | |
| "learning_rate": 1.4583333333333335e-06, | |
| "loss": 1.0722, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0380952380952381, | |
| "grad_norm": 42.75063943291744, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 0.998, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.042328042328042326, | |
| "grad_norm": 43.214149555747724, | |
| "learning_rate": 1.8750000000000003e-06, | |
| "loss": 0.942, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.04656084656084656, | |
| "grad_norm": 33.48353944926147, | |
| "learning_rate": 2.0833333333333334e-06, | |
| "loss": 0.6964, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.050793650793650794, | |
| "grad_norm": 33.50023841667552, | |
| "learning_rate": 2.2916666666666666e-06, | |
| "loss": 0.8639, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.05502645502645503, | |
| "grad_norm": 28.838714912036302, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.8166, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.05925925925925926, | |
| "grad_norm": 13.200524499309642, | |
| "learning_rate": 2.7083333333333334e-06, | |
| "loss": 0.6199, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.06349206349206349, | |
| "grad_norm": 12.22641881931881, | |
| "learning_rate": 2.916666666666667e-06, | |
| "loss": 0.5748, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.06772486772486773, | |
| "grad_norm": 11.38831601039375, | |
| "learning_rate": 3.125e-06, | |
| "loss": 0.6128, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.07195767195767196, | |
| "grad_norm": 9.044088753747445, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.5587, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0761904761904762, | |
| "grad_norm": 7.844366385332237, | |
| "learning_rate": 3.5416666666666673e-06, | |
| "loss": 0.5319, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.08042328042328042, | |
| "grad_norm": 5.731160381356699, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.4898, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.08465608465608465, | |
| "grad_norm": 4.726984164569137, | |
| "learning_rate": 3.958333333333333e-06, | |
| "loss": 0.5398, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08888888888888889, | |
| "grad_norm": 4.5781542459814535, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 0.3968, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.09312169312169312, | |
| "grad_norm": 3.7775510520748368, | |
| "learning_rate": 4.3750000000000005e-06, | |
| "loss": 0.3964, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.09735449735449736, | |
| "grad_norm": 3.05477842541282, | |
| "learning_rate": 4.583333333333333e-06, | |
| "loss": 0.4362, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.10158730158730159, | |
| "grad_norm": 3.438531926670594, | |
| "learning_rate": 4.791666666666668e-06, | |
| "loss": 0.458, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.10582010582010581, | |
| "grad_norm": 3.8238853915379662, | |
| "learning_rate": 5e-06, | |
| "loss": 0.4659, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.11005291005291006, | |
| "grad_norm": 2.8947366693177288, | |
| "learning_rate": 5.208333333333334e-06, | |
| "loss": 0.3804, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 3.6390451997485584, | |
| "learning_rate": 5.416666666666667e-06, | |
| "loss": 0.4531, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.11851851851851852, | |
| "grad_norm": 2.9482232685782703, | |
| "learning_rate": 5.625e-06, | |
| "loss": 0.3826, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.12275132275132275, | |
| "grad_norm": 3.674602280740873, | |
| "learning_rate": 5.833333333333334e-06, | |
| "loss": 0.4131, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.12698412698412698, | |
| "grad_norm": 3.123169330914684, | |
| "learning_rate": 6.041666666666667e-06, | |
| "loss": 0.3337, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1312169312169312, | |
| "grad_norm": 2.8563692549958866, | |
| "learning_rate": 6.25e-06, | |
| "loss": 0.3496, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.13544973544973546, | |
| "grad_norm": 2.5685868294187646, | |
| "learning_rate": 6.458333333333334e-06, | |
| "loss": 0.3464, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.13968253968253969, | |
| "grad_norm": 2.7561118762037933, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.3553, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1439153439153439, | |
| "grad_norm": 2.799437451043737, | |
| "learning_rate": 6.875e-06, | |
| "loss": 0.3412, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.14814814814814814, | |
| "grad_norm": 3.292845577989745, | |
| "learning_rate": 7.083333333333335e-06, | |
| "loss": 0.3276, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.1523809523809524, | |
| "grad_norm": 2.3306789119643936, | |
| "learning_rate": 7.291666666666667e-06, | |
| "loss": 0.2384, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.15661375661375662, | |
| "grad_norm": 2.803053991451241, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.3443, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.16084656084656085, | |
| "grad_norm": 2.8701427259214065, | |
| "learning_rate": 7.708333333333334e-06, | |
| "loss": 0.3013, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.16507936507936508, | |
| "grad_norm": 2.537652900045535, | |
| "learning_rate": 7.916666666666667e-06, | |
| "loss": 0.3267, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.1693121693121693, | |
| "grad_norm": 2.6768501900523223, | |
| "learning_rate": 8.125000000000001e-06, | |
| "loss": 0.2639, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.17354497354497356, | |
| "grad_norm": 2.6577601401008333, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 0.3324, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.17777777777777778, | |
| "grad_norm": 2.5512677479663632, | |
| "learning_rate": 8.541666666666666e-06, | |
| "loss": 0.2672, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.182010582010582, | |
| "grad_norm": 2.8990280872171805, | |
| "learning_rate": 8.750000000000001e-06, | |
| "loss": 0.2439, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.18624338624338624, | |
| "grad_norm": 2.889097227933124, | |
| "learning_rate": 8.958333333333334e-06, | |
| "loss": 0.3163, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 2.9104235083784102, | |
| "learning_rate": 9.166666666666666e-06, | |
| "loss": 0.3205, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.19470899470899472, | |
| "grad_norm": 3.083107801252212, | |
| "learning_rate": 9.375000000000001e-06, | |
| "loss": 0.3172, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.19894179894179895, | |
| "grad_norm": 2.3884460518664374, | |
| "learning_rate": 9.583333333333335e-06, | |
| "loss": 0.1934, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.20317460317460317, | |
| "grad_norm": 3.226585957430524, | |
| "learning_rate": 9.791666666666666e-06, | |
| "loss": 0.2701, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.2074074074074074, | |
| "grad_norm": 2.7423891340389615, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2527, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.21164021164021163, | |
| "grad_norm": 2.820228218301591, | |
| "learning_rate": 9.999969538288953e-06, | |
| "loss": 0.2192, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.21587301587301588, | |
| "grad_norm": 2.467968682909313, | |
| "learning_rate": 9.999878153526974e-06, | |
| "loss": 0.2493, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.2201058201058201, | |
| "grad_norm": 2.66662023011886, | |
| "learning_rate": 9.999725846827562e-06, | |
| "loss": 0.2806, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.22433862433862434, | |
| "grad_norm": 3.129707362043057, | |
| "learning_rate": 9.999512620046523e-06, | |
| "loss": 0.309, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 2.56704704796148, | |
| "learning_rate": 9.999238475781957e-06, | |
| "loss": 0.2664, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.2328042328042328, | |
| "grad_norm": 2.351657358687772, | |
| "learning_rate": 9.998903417374228e-06, | |
| "loss": 0.2318, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.23703703703703705, | |
| "grad_norm": 2.616314308180943, | |
| "learning_rate": 9.998507448905917e-06, | |
| "loss": 0.2548, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.24126984126984127, | |
| "grad_norm": 2.740317144109295, | |
| "learning_rate": 9.998050575201772e-06, | |
| "loss": 0.2506, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.2455026455026455, | |
| "grad_norm": 2.1871378697009396, | |
| "learning_rate": 9.997532801828659e-06, | |
| "loss": 0.2187, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.24973544973544973, | |
| "grad_norm": 3.3209760924838423, | |
| "learning_rate": 9.99695413509548e-06, | |
| "loss": 0.2841, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.25396825396825395, | |
| "grad_norm": 2.447101187980101, | |
| "learning_rate": 9.996314582053106e-06, | |
| "loss": 0.2218, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.2582010582010582, | |
| "grad_norm": 2.833727508307957, | |
| "learning_rate": 9.995614150494293e-06, | |
| "loss": 0.2551, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.2624338624338624, | |
| "grad_norm": 2.544105354348358, | |
| "learning_rate": 9.994852848953574e-06, | |
| "loss": 0.2213, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 2.4267488774897927, | |
| "learning_rate": 9.994030686707171e-06, | |
| "loss": 0.1665, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.2708994708994709, | |
| "grad_norm": 1.8496876818825865, | |
| "learning_rate": 9.993147673772869e-06, | |
| "loss": 0.176, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.2751322751322751, | |
| "grad_norm": 2.2079265341473375, | |
| "learning_rate": 9.992203820909906e-06, | |
| "loss": 0.2061, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.27936507936507937, | |
| "grad_norm": 2.7008947539879693, | |
| "learning_rate": 9.991199139618828e-06, | |
| "loss": 0.2402, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.28359788359788357, | |
| "grad_norm": 1.9380719993127324, | |
| "learning_rate": 9.990133642141359e-06, | |
| "loss": 0.2095, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.2878306878306878, | |
| "grad_norm": 2.3742267443315432, | |
| "learning_rate": 9.989007341460251e-06, | |
| "loss": 0.2176, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.2920634920634921, | |
| "grad_norm": 2.1948771063063632, | |
| "learning_rate": 9.987820251299121e-06, | |
| "loss": 0.2145, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.2962962962962963, | |
| "grad_norm": 1.9006443729448308, | |
| "learning_rate": 9.98657238612229e-06, | |
| "loss": 0.2187, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.30052910052910053, | |
| "grad_norm": 2.0165710016106995, | |
| "learning_rate": 9.985263761134602e-06, | |
| "loss": 0.1908, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.3047619047619048, | |
| "grad_norm": 2.05912929127493, | |
| "learning_rate": 9.983894392281237e-06, | |
| "loss": 0.2137, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.308994708994709, | |
| "grad_norm": 2.810426086613887, | |
| "learning_rate": 9.982464296247523e-06, | |
| "loss": 0.2565, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.31322751322751324, | |
| "grad_norm": 2.3766628874747937, | |
| "learning_rate": 9.980973490458728e-06, | |
| "loss": 0.1442, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.31746031746031744, | |
| "grad_norm": 2.4709186657411233, | |
| "learning_rate": 9.979421993079853e-06, | |
| "loss": 0.2542, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.3216931216931217, | |
| "grad_norm": 1.9082809595138144, | |
| "learning_rate": 9.9778098230154e-06, | |
| "loss": 0.19, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.32592592592592595, | |
| "grad_norm": 2.071460019416873, | |
| "learning_rate": 9.976136999909156e-06, | |
| "loss": 0.1663, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.33015873015873015, | |
| "grad_norm": 2.516711677394165, | |
| "learning_rate": 9.974403544143942e-06, | |
| "loss": 0.1794, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.3343915343915344, | |
| "grad_norm": 1.7925945452723815, | |
| "learning_rate": 9.972609476841368e-06, | |
| "loss": 0.1671, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.3386243386243386, | |
| "grad_norm": 1.9441478155371381, | |
| "learning_rate": 9.970754819861577e-06, | |
| "loss": 0.1743, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 2.6254833507520234, | |
| "learning_rate": 9.968839595802982e-06, | |
| "loss": 0.2414, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.3470899470899471, | |
| "grad_norm": 2.316238733652428, | |
| "learning_rate": 9.966863828001982e-06, | |
| "loss": 0.2089, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.3513227513227513, | |
| "grad_norm": 2.0519535959849593, | |
| "learning_rate": 9.964827540532685e-06, | |
| "loss": 0.2097, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.35555555555555557, | |
| "grad_norm": 2.057989705603544, | |
| "learning_rate": 9.962730758206612e-06, | |
| "loss": 0.1938, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.35978835978835977, | |
| "grad_norm": 2.672807955843952, | |
| "learning_rate": 9.960573506572391e-06, | |
| "loss": 0.258, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.364021164021164, | |
| "grad_norm": 2.2091594991845716, | |
| "learning_rate": 9.958355811915452e-06, | |
| "loss": 0.1949, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.3682539682539683, | |
| "grad_norm": 2.179853329030986, | |
| "learning_rate": 9.95607770125771e-06, | |
| "loss": 0.2111, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.3724867724867725, | |
| "grad_norm": 1.925480718448465, | |
| "learning_rate": 9.953739202357219e-06, | |
| "loss": 0.1621, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.37671957671957673, | |
| "grad_norm": 2.5097628215455994, | |
| "learning_rate": 9.951340343707852e-06, | |
| "loss": 0.1916, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 2.2291743713915815, | |
| "learning_rate": 9.948881154538946e-06, | |
| "loss": 0.1901, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.3851851851851852, | |
| "grad_norm": 2.2427956711484147, | |
| "learning_rate": 9.946361664814942e-06, | |
| "loss": 0.1926, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.38941798941798944, | |
| "grad_norm": 2.1674595305785966, | |
| "learning_rate": 9.94378190523503e-06, | |
| "loss": 0.15, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.39365079365079364, | |
| "grad_norm": 2.065344580545721, | |
| "learning_rate": 9.941141907232766e-06, | |
| "loss": 0.151, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.3978835978835979, | |
| "grad_norm": 2.4498733708415332, | |
| "learning_rate": 9.938441702975689e-06, | |
| "loss": 0.1991, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.4021164021164021, | |
| "grad_norm": 2.196562226640055, | |
| "learning_rate": 9.93568132536494e-06, | |
| "loss": 0.1565, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.40634920634920635, | |
| "grad_norm": 2.45021933901518, | |
| "learning_rate": 9.932860808034847e-06, | |
| "loss": 0.1954, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.4105820105820106, | |
| "grad_norm": 1.6870920962679627, | |
| "learning_rate": 9.929980185352525e-06, | |
| "loss": 0.1415, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.4148148148148148, | |
| "grad_norm": 2.0433028628482277, | |
| "learning_rate": 9.927039492417452e-06, | |
| "loss": 0.1752, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.41904761904761906, | |
| "grad_norm": 2.3924574752175576, | |
| "learning_rate": 9.924038765061042e-06, | |
| "loss": 0.169, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.42328042328042326, | |
| "grad_norm": 2.1514028524429563, | |
| "learning_rate": 9.92097803984621e-06, | |
| "loss": 0.2042, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4275132275132275, | |
| "grad_norm": 1.9233810049291866, | |
| "learning_rate": 9.91785735406693e-06, | |
| "loss": 0.1625, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.43174603174603177, | |
| "grad_norm": 2.0269343004100433, | |
| "learning_rate": 9.914676745747772e-06, | |
| "loss": 0.1783, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.43597883597883597, | |
| "grad_norm": 2.9018403399357657, | |
| "learning_rate": 9.911436253643445e-06, | |
| "loss": 0.2005, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.4402116402116402, | |
| "grad_norm": 2.3864889162085694, | |
| "learning_rate": 9.908135917238321e-06, | |
| "loss": 0.1899, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 2.029884804297179, | |
| "learning_rate": 9.904775776745959e-06, | |
| "loss": 0.1646, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.4486772486772487, | |
| "grad_norm": 2.217474285635462, | |
| "learning_rate": 9.901355873108611e-06, | |
| "loss": 0.1975, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.45291005291005293, | |
| "grad_norm": 3.316060847140544, | |
| "learning_rate": 9.89787624799672e-06, | |
| "loss": 0.1757, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 2.256786436988083, | |
| "learning_rate": 9.894336943808426e-06, | |
| "loss": 0.1808, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.4613756613756614, | |
| "grad_norm": 2.3520584460075855, | |
| "learning_rate": 9.890738003669029e-06, | |
| "loss": 0.1832, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.4656084656084656, | |
| "grad_norm": 2.819589162595303, | |
| "learning_rate": 9.887079471430481e-06, | |
| "loss": 0.1937, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.46984126984126984, | |
| "grad_norm": 2.0799214485519713, | |
| "learning_rate": 9.883361391670841e-06, | |
| "loss": 0.1204, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.4740740740740741, | |
| "grad_norm": 2.4098902856637148, | |
| "learning_rate": 9.879583809693737e-06, | |
| "loss": 0.1774, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.4783068783068783, | |
| "grad_norm": 2.4639460942435334, | |
| "learning_rate": 9.875746771527817e-06, | |
| "loss": 0.1954, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.48253968253968255, | |
| "grad_norm": 1.8744056955663004, | |
| "learning_rate": 9.871850323926178e-06, | |
| "loss": 0.186, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.48677248677248675, | |
| "grad_norm": 2.1213169931981226, | |
| "learning_rate": 9.867894514365802e-06, | |
| "loss": 0.1425, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.491005291005291, | |
| "grad_norm": 2.1632110720054962, | |
| "learning_rate": 9.863879391046985e-06, | |
| "loss": 0.1552, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.49523809523809526, | |
| "grad_norm": 2.7544098112396282, | |
| "learning_rate": 9.859805002892733e-06, | |
| "loss": 0.1432, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.49947089947089945, | |
| "grad_norm": 2.100186269105726, | |
| "learning_rate": 9.85567139954818e-06, | |
| "loss": 0.1648, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.5037037037037037, | |
| "grad_norm": 1.9132320908075728, | |
| "learning_rate": 9.851478631379982e-06, | |
| "loss": 0.1645, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.5079365079365079, | |
| "grad_norm": 2.080325184083409, | |
| "learning_rate": 9.847226749475696e-06, | |
| "loss": 0.1421, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5121693121693122, | |
| "grad_norm": 2.1166691740240946, | |
| "learning_rate": 9.842915805643156e-06, | |
| "loss": 0.1739, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.5164021164021164, | |
| "grad_norm": 2.7777924615211402, | |
| "learning_rate": 9.838545852409857e-06, | |
| "loss": 0.2158, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.5206349206349207, | |
| "grad_norm": 2.011825936444935, | |
| "learning_rate": 9.834116943022299e-06, | |
| "loss": 0.1283, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.5248677248677248, | |
| "grad_norm": 2.3088090393013707, | |
| "learning_rate": 9.829629131445342e-06, | |
| "loss": 0.1594, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.5291005291005291, | |
| "grad_norm": 2.5599162277007506, | |
| "learning_rate": 9.825082472361558e-06, | |
| "loss": 0.1602, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 2.8804298377274624, | |
| "learning_rate": 9.82047702117055e-06, | |
| "loss": 0.21, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.5375661375661376, | |
| "grad_norm": 2.322574871028054, | |
| "learning_rate": 9.815812833988292e-06, | |
| "loss": 0.1894, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5417989417989418, | |
| "grad_norm": 2.3497044645708662, | |
| "learning_rate": 9.811089967646427e-06, | |
| "loss": 0.2065, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.546031746031746, | |
| "grad_norm": 2.1840543614377785, | |
| "learning_rate": 9.806308479691595e-06, | |
| "loss": 0.1752, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.5502645502645502, | |
| "grad_norm": 1.9149801234923844, | |
| "learning_rate": 9.801468428384716e-06, | |
| "loss": 0.1449, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5544973544973545, | |
| "grad_norm": 1.9431409264703756, | |
| "learning_rate": 9.796569872700287e-06, | |
| "loss": 0.1562, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.5587301587301587, | |
| "grad_norm": 1.9277634151443268, | |
| "learning_rate": 9.791612872325667e-06, | |
| "loss": 0.1665, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.562962962962963, | |
| "grad_norm": 2.6741227436047703, | |
| "learning_rate": 9.786597487660336e-06, | |
| "loss": 0.195, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.5671957671957671, | |
| "grad_norm": 1.9587886510325367, | |
| "learning_rate": 9.781523779815178e-06, | |
| "loss": 0.1496, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 2.167004574977241, | |
| "learning_rate": 9.776391810611719e-06, | |
| "loss": 0.1873, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.5756613756613757, | |
| "grad_norm": 2.3098964403422717, | |
| "learning_rate": 9.771201642581384e-06, | |
| "loss": 0.1448, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.5798941798941799, | |
| "grad_norm": 2.2442486200909886, | |
| "learning_rate": 9.765953338964736e-06, | |
| "loss": 0.1787, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.5841269841269842, | |
| "grad_norm": 2.213445354502761, | |
| "learning_rate": 9.760646963710694e-06, | |
| "loss": 0.2045, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.5883597883597883, | |
| "grad_norm": 1.7015924599869439, | |
| "learning_rate": 9.755282581475769e-06, | |
| "loss": 0.1105, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.5925925925925926, | |
| "grad_norm": 2.135699337583285, | |
| "learning_rate": 9.749860257623262e-06, | |
| "loss": 0.1696, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5968253968253968, | |
| "grad_norm": 2.022004867298511, | |
| "learning_rate": 9.744380058222483e-06, | |
| "loss": 0.1436, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.6010582010582011, | |
| "grad_norm": 1.938130175850262, | |
| "learning_rate": 9.73884205004793e-06, | |
| "loss": 0.152, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.6052910052910053, | |
| "grad_norm": 2.2793353633377644, | |
| "learning_rate": 9.733246300578482e-06, | |
| "loss": 0.1801, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.6095238095238096, | |
| "grad_norm": 2.3515552282000023, | |
| "learning_rate": 9.727592877996585e-06, | |
| "loss": 0.1709, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.6137566137566137, | |
| "grad_norm": 2.4174135230244693, | |
| "learning_rate": 9.721881851187406e-06, | |
| "loss": 0.1888, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.617989417989418, | |
| "grad_norm": 2.02003900874957, | |
| "learning_rate": 9.716113289738005e-06, | |
| "loss": 0.1549, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.6222222222222222, | |
| "grad_norm": 2.1212067581354637, | |
| "learning_rate": 9.710287263936485e-06, | |
| "loss": 0.1463, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.6264550264550265, | |
| "grad_norm": 2.10847848049695, | |
| "learning_rate": 9.704403844771128e-06, | |
| "loss": 0.1548, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.6306878306878307, | |
| "grad_norm": 2.1977701291513783, | |
| "learning_rate": 9.698463103929542e-06, | |
| "loss": 0.1842, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.6349206349206349, | |
| "grad_norm": 2.0234900314907303, | |
| "learning_rate": 9.69246511379778e-06, | |
| "loss": 0.1564, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6391534391534391, | |
| "grad_norm": 2.221863968702518, | |
| "learning_rate": 9.68640994745946e-06, | |
| "loss": 0.1492, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.6433862433862434, | |
| "grad_norm": 2.2957077394485865, | |
| "learning_rate": 9.680297678694867e-06, | |
| "loss": 0.1681, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.6476190476190476, | |
| "grad_norm": 2.319080436137813, | |
| "learning_rate": 9.674128381980073e-06, | |
| "loss": 0.1584, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.6518518518518519, | |
| "grad_norm": 2.081726848575624, | |
| "learning_rate": 9.667902132486009e-06, | |
| "loss": 0.1605, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.656084656084656, | |
| "grad_norm": 1.696442003069325, | |
| "learning_rate": 9.661619006077562e-06, | |
| "loss": 0.1175, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.6603174603174603, | |
| "grad_norm": 1.9060631377388697, | |
| "learning_rate": 9.655279079312643e-06, | |
| "loss": 0.1327, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.6645502645502646, | |
| "grad_norm": 2.437272536385666, | |
| "learning_rate": 9.648882429441258e-06, | |
| "loss": 0.1673, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.6687830687830688, | |
| "grad_norm": 2.5407710665248406, | |
| "learning_rate": 9.642429134404568e-06, | |
| "loss": 0.1795, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.6730158730158731, | |
| "grad_norm": 1.7280628828064413, | |
| "learning_rate": 9.635919272833938e-06, | |
| "loss": 0.1192, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.6772486772486772, | |
| "grad_norm": 2.0466311163540554, | |
| "learning_rate": 9.629352924049975e-06, | |
| "loss": 0.1641, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.6814814814814815, | |
| "grad_norm": 2.430559199002775, | |
| "learning_rate": 9.622730168061568e-06, | |
| "loss": 0.164, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.6857142857142857, | |
| "grad_norm": 1.8034268409827128, | |
| "learning_rate": 9.616051085564905e-06, | |
| "loss": 0.1151, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.68994708994709, | |
| "grad_norm": 2.076098544214192, | |
| "learning_rate": 9.609315757942504e-06, | |
| "loss": 0.1356, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.6941798941798942, | |
| "grad_norm": 1.8175196422774194, | |
| "learning_rate": 9.602524267262202e-06, | |
| "loss": 0.1286, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.6984126984126984, | |
| "grad_norm": 1.9844294374596974, | |
| "learning_rate": 9.595676696276173e-06, | |
| "loss": 0.1447, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.7026455026455026, | |
| "grad_norm": 2.0850373939499103, | |
| "learning_rate": 9.588773128419907e-06, | |
| "loss": 0.1641, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.7068783068783069, | |
| "grad_norm": 2.1830728529262213, | |
| "learning_rate": 9.581813647811199e-06, | |
| "loss": 0.146, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.7111111111111111, | |
| "grad_norm": 1.9107339137949033, | |
| "learning_rate": 9.574798339249124e-06, | |
| "loss": 0.1424, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.7153439153439154, | |
| "grad_norm": 1.9073629765575915, | |
| "learning_rate": 9.567727288213005e-06, | |
| "loss": 0.1321, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.7195767195767195, | |
| "grad_norm": 1.7910676565872312, | |
| "learning_rate": 9.560600580861366e-06, | |
| "loss": 0.124, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7238095238095238, | |
| "grad_norm": 2.3559878393889444, | |
| "learning_rate": 9.553418304030886e-06, | |
| "loss": 0.1542, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.728042328042328, | |
| "grad_norm": 2.257035851112734, | |
| "learning_rate": 9.546180545235344e-06, | |
| "loss": 0.1811, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.7322751322751323, | |
| "grad_norm": 2.372408574995787, | |
| "learning_rate": 9.538887392664544e-06, | |
| "loss": 0.1415, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.7365079365079366, | |
| "grad_norm": 2.1859794226240643, | |
| "learning_rate": 9.531538935183252e-06, | |
| "loss": 0.164, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.7407407407407407, | |
| "grad_norm": 1.9315910825592963, | |
| "learning_rate": 9.524135262330098e-06, | |
| "loss": 0.1242, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.744973544973545, | |
| "grad_norm": 1.932293530910452, | |
| "learning_rate": 9.516676464316505e-06, | |
| "loss": 0.1223, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.7492063492063492, | |
| "grad_norm": 2.5245512957918037, | |
| "learning_rate": 9.50916263202557e-06, | |
| "loss": 0.1758, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.7534391534391535, | |
| "grad_norm": 2.2836095033018307, | |
| "learning_rate": 9.501593857010968e-06, | |
| "loss": 0.1472, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.7576719576719577, | |
| "grad_norm": 2.0376880346254227, | |
| "learning_rate": 9.493970231495836e-06, | |
| "loss": 0.1489, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 1.9142818784287132, | |
| "learning_rate": 9.486291848371642e-06, | |
| "loss": 0.1242, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.7661375661375661, | |
| "grad_norm": 2.0433594909925823, | |
| "learning_rate": 9.478558801197065e-06, | |
| "loss": 0.1246, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.7703703703703704, | |
| "grad_norm": 1.814914680097962, | |
| "learning_rate": 9.470771184196842e-06, | |
| "loss": 0.1178, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.7746031746031746, | |
| "grad_norm": 2.038031930273932, | |
| "learning_rate": 9.46292909226063e-06, | |
| "loss": 0.1741, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.7788359788359789, | |
| "grad_norm": 2.2491615885983025, | |
| "learning_rate": 9.45503262094184e-06, | |
| "loss": 0.1428, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.783068783068783, | |
| "grad_norm": 1.9769912092154236, | |
| "learning_rate": 9.44708186645649e-06, | |
| "loss": 0.1431, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.7873015873015873, | |
| "grad_norm": 1.908015924501842, | |
| "learning_rate": 9.439076925682006e-06, | |
| "loss": 0.1249, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.7915343915343915, | |
| "grad_norm": 1.8504394720752184, | |
| "learning_rate": 9.431017896156074e-06, | |
| "loss": 0.1205, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.7957671957671958, | |
| "grad_norm": 2.0652458552127384, | |
| "learning_rate": 9.42290487607542e-06, | |
| "loss": 0.1333, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 2.065167228642111, | |
| "learning_rate": 9.414737964294636e-06, | |
| "loss": 0.1407, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.8042328042328042, | |
| "grad_norm": 2.1818559331219234, | |
| "learning_rate": 9.406517260324962e-06, | |
| "loss": 0.127, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.8084656084656084, | |
| "grad_norm": 1.9594250023861293, | |
| "learning_rate": 9.398242864333084e-06, | |
| "loss": 0.1196, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.8126984126984127, | |
| "grad_norm": 2.143484340303363, | |
| "learning_rate": 9.389914877139903e-06, | |
| "loss": 0.1361, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.816931216931217, | |
| "grad_norm": 1.925094740498635, | |
| "learning_rate": 9.381533400219319e-06, | |
| "loss": 0.1089, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.8211640211640212, | |
| "grad_norm": 2.951061076480981, | |
| "learning_rate": 9.37309853569698e-06, | |
| "loss": 0.1371, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.8253968253968254, | |
| "grad_norm": 1.9339219951657884, | |
| "learning_rate": 9.364610386349048e-06, | |
| "loss": 0.126, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8296296296296296, | |
| "grad_norm": 2.0008984944402566, | |
| "learning_rate": 9.356069055600949e-06, | |
| "loss": 0.1194, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.8338624338624339, | |
| "grad_norm": 2.2860380840573713, | |
| "learning_rate": 9.347474647526095e-06, | |
| "loss": 0.1311, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.8380952380952381, | |
| "grad_norm": 1.8033278898675713, | |
| "learning_rate": 9.338827266844643e-06, | |
| "loss": 0.1184, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.8423280423280424, | |
| "grad_norm": 1.9247101184163469, | |
| "learning_rate": 9.330127018922195e-06, | |
| "loss": 0.1195, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.8465608465608465, | |
| "grad_norm": 2.1205250196763927, | |
| "learning_rate": 9.321374009768525e-06, | |
| "loss": 0.1193, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8507936507936508, | |
| "grad_norm": 1.92871242138401, | |
| "learning_rate": 9.312568346036288e-06, | |
| "loss": 0.1127, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.855026455026455, | |
| "grad_norm": 1.936940428750681, | |
| "learning_rate": 9.30371013501972e-06, | |
| "loss": 0.1159, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.8592592592592593, | |
| "grad_norm": 1.885933491210588, | |
| "learning_rate": 9.294799484653323e-06, | |
| "loss": 0.1196, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.8634920634920635, | |
| "grad_norm": 2.082748868345629, | |
| "learning_rate": 9.285836503510562e-06, | |
| "loss": 0.1136, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.8677248677248677, | |
| "grad_norm": 1.7285347264228283, | |
| "learning_rate": 9.276821300802535e-06, | |
| "loss": 0.1076, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8719576719576719, | |
| "grad_norm": 2.1678906514126584, | |
| "learning_rate": 9.267753986376638e-06, | |
| "loss": 0.1309, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.8761904761904762, | |
| "grad_norm": 2.2303167600680838, | |
| "learning_rate": 9.25863467071524e-06, | |
| "loss": 0.1188, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.8804232804232804, | |
| "grad_norm": 1.8524486771122908, | |
| "learning_rate": 9.24946346493432e-06, | |
| "loss": 0.1037, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.8846560846560847, | |
| "grad_norm": 1.9545872165465215, | |
| "learning_rate": 9.24024048078213e-06, | |
| "loss": 0.1239, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 2.2585920119228486, | |
| "learning_rate": 9.230965830637821e-06, | |
| "loss": 0.1426, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.8931216931216931, | |
| "grad_norm": 2.604112442901209, | |
| "learning_rate": 9.221639627510076e-06, | |
| "loss": 0.1691, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.8973544973544973, | |
| "grad_norm": 2.148863724504725, | |
| "learning_rate": 9.21226198503574e-06, | |
| "loss": 0.1239, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.9015873015873016, | |
| "grad_norm": 2.1443004360413283, | |
| "learning_rate": 9.202833017478421e-06, | |
| "loss": 0.1111, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.9058201058201059, | |
| "grad_norm": 2.0065790334640754, | |
| "learning_rate": 9.193352839727122e-06, | |
| "loss": 0.0986, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.91005291005291, | |
| "grad_norm": 1.685651884041455, | |
| "learning_rate": 9.18382156729481e-06, | |
| "loss": 0.0962, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.9142857142857143, | |
| "grad_norm": 1.9759626665833376, | |
| "learning_rate": 9.174239316317034e-06, | |
| "loss": 0.1228, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.9185185185185185, | |
| "grad_norm": 2.0642455639952093, | |
| "learning_rate": 9.164606203550498e-06, | |
| "loss": 0.1408, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.9227513227513228, | |
| "grad_norm": 2.032852061108819, | |
| "learning_rate": 9.154922346371641e-06, | |
| "loss": 0.1152, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.926984126984127, | |
| "grad_norm": 2.354712801581503, | |
| "learning_rate": 9.145187862775208e-06, | |
| "loss": 0.1428, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.9312169312169312, | |
| "grad_norm": 2.5192247867841546, | |
| "learning_rate": 9.13540287137281e-06, | |
| "loss": 0.1405, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9354497354497354, | |
| "grad_norm": 1.72459255330544, | |
| "learning_rate": 9.125567491391476e-06, | |
| "loss": 0.1003, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.9396825396825397, | |
| "grad_norm": 1.8550339147298913, | |
| "learning_rate": 9.115681842672211e-06, | |
| "loss": 0.1154, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.9439153439153439, | |
| "grad_norm": 1.9003240348247867, | |
| "learning_rate": 9.10574604566852e-06, | |
| "loss": 0.1181, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.9481481481481482, | |
| "grad_norm": 2.156481954204094, | |
| "learning_rate": 9.09576022144496e-06, | |
| "loss": 0.1264, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 1.998485739236388, | |
| "learning_rate": 9.085724491675642e-06, | |
| "loss": 0.1341, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.9566137566137566, | |
| "grad_norm": 1.822826676452421, | |
| "learning_rate": 9.07563897864277e-06, | |
| "loss": 0.1399, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.9608465608465608, | |
| "grad_norm": 2.2296945866049365, | |
| "learning_rate": 9.065503805235139e-06, | |
| "loss": 0.113, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.9650793650793651, | |
| "grad_norm": 2.533173971355714, | |
| "learning_rate": 9.055319094946633e-06, | |
| "loss": 0.1544, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.9693121693121693, | |
| "grad_norm": 2.189492349413132, | |
| "learning_rate": 9.045084971874738e-06, | |
| "loss": 0.138, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.9735449735449735, | |
| "grad_norm": 2.0227496012742434, | |
| "learning_rate": 9.03480156071901e-06, | |
| "loss": 0.1201, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.9777777777777777, | |
| "grad_norm": 2.19493710384594, | |
| "learning_rate": 9.02446898677957e-06, | |
| "loss": 0.1501, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.982010582010582, | |
| "grad_norm": 1.6357273412380897, | |
| "learning_rate": 9.014087375955574e-06, | |
| "loss": 0.1024, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.9862433862433863, | |
| "grad_norm": 1.9821532554149048, | |
| "learning_rate": 9.003656854743667e-06, | |
| "loss": 0.1149, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.9904761904761905, | |
| "grad_norm": 1.739947981961382, | |
| "learning_rate": 8.993177550236464e-06, | |
| "loss": 0.1128, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.9947089947089947, | |
| "grad_norm": 1.8038704679693724, | |
| "learning_rate": 8.982649590120982e-06, | |
| "loss": 0.0901, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.9989417989417989, | |
| "grad_norm": 1.755306767490001, | |
| "learning_rate": 8.972073102677091e-06, | |
| "loss": 0.0941, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 3.1593265626408567, | |
| "learning_rate": 8.961448216775955e-06, | |
| "loss": 0.0896, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.0042328042328041, | |
| "grad_norm": 1.6382971000061148, | |
| "learning_rate": 8.950775061878453e-06, | |
| "loss": 0.0839, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.0084656084656085, | |
| "grad_norm": 1.733233185872628, | |
| "learning_rate": 8.94005376803361e-06, | |
| "loss": 0.0942, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.0126984126984127, | |
| "grad_norm": 1.6215967926850725, | |
| "learning_rate": 8.92928446587701e-06, | |
| "loss": 0.0823, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.016931216931217, | |
| "grad_norm": 1.700851779943189, | |
| "learning_rate": 8.9184672866292e-06, | |
| "loss": 0.0707, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.0211640211640212, | |
| "grad_norm": 1.5763379344310904, | |
| "learning_rate": 8.907602362094094e-06, | |
| "loss": 0.0709, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.0253968253968253, | |
| "grad_norm": 2.1897050252472425, | |
| "learning_rate": 8.896689824657371e-06, | |
| "loss": 0.0944, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.0296296296296297, | |
| "grad_norm": 1.7815709759982583, | |
| "learning_rate": 8.885729807284855e-06, | |
| "loss": 0.0782, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.0338624338624338, | |
| "grad_norm": 1.8077323351970311, | |
| "learning_rate": 8.874722443520898e-06, | |
| "loss": 0.0652, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.0380952380952382, | |
| "grad_norm": 2.1870219531658295, | |
| "learning_rate": 8.863667867486756e-06, | |
| "loss": 0.0924, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.0423280423280423, | |
| "grad_norm": 2.023053098789974, | |
| "learning_rate": 8.852566213878947e-06, | |
| "loss": 0.0879, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.0465608465608465, | |
| "grad_norm": 2.0022724327057855, | |
| "learning_rate": 8.841417617967618e-06, | |
| "loss": 0.081, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.0507936507936508, | |
| "grad_norm": 1.8561313251630824, | |
| "learning_rate": 8.83022221559489e-06, | |
| "loss": 0.064, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.055026455026455, | |
| "grad_norm": 2.0472910334735452, | |
| "learning_rate": 8.818980143173212e-06, | |
| "loss": 0.0654, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.0592592592592593, | |
| "grad_norm": 1.847056303737043, | |
| "learning_rate": 8.807691537683685e-06, | |
| "loss": 0.0645, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.0634920634920635, | |
| "grad_norm": 2.3192957709806685, | |
| "learning_rate": 8.796356536674404e-06, | |
| "loss": 0.0595, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.0677248677248676, | |
| "grad_norm": 1.8495895474699158, | |
| "learning_rate": 8.784975278258783e-06, | |
| "loss": 0.0547, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.071957671957672, | |
| "grad_norm": 2.3472935666710306, | |
| "learning_rate": 8.773547901113862e-06, | |
| "loss": 0.0955, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.0761904761904761, | |
| "grad_norm": 1.8466358550678363, | |
| "learning_rate": 8.762074544478622e-06, | |
| "loss": 0.0838, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.0804232804232805, | |
| "grad_norm": 1.8807145283212745, | |
| "learning_rate": 8.750555348152299e-06, | |
| "loss": 0.084, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.0846560846560847, | |
| "grad_norm": 2.0632365769348913, | |
| "learning_rate": 8.73899045249266e-06, | |
| "loss": 0.1131, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.0888888888888888, | |
| "grad_norm": 1.7220071689576308, | |
| "learning_rate": 8.727379998414311e-06, | |
| "loss": 0.0651, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.0931216931216932, | |
| "grad_norm": 1.8875007957585828, | |
| "learning_rate": 8.715724127386971e-06, | |
| "loss": 0.0912, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.0973544973544973, | |
| "grad_norm": 1.7491126079962518, | |
| "learning_rate": 8.70402298143375e-06, | |
| "loss": 0.077, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.1015873015873017, | |
| "grad_norm": 1.9113293742259723, | |
| "learning_rate": 8.692276703129421e-06, | |
| "loss": 0.065, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.1058201058201058, | |
| "grad_norm": 1.972668011890266, | |
| "learning_rate": 8.680485435598674e-06, | |
| "loss": 0.073, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.11005291005291, | |
| "grad_norm": 2.0401162544608704, | |
| "learning_rate": 8.668649322514382e-06, | |
| "loss": 0.0813, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.1142857142857143, | |
| "grad_norm": 1.705144782299304, | |
| "learning_rate": 8.656768508095853e-06, | |
| "loss": 0.0687, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.1185185185185185, | |
| "grad_norm": 1.8673386144289976, | |
| "learning_rate": 8.644843137107058e-06, | |
| "loss": 0.0736, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.1227513227513228, | |
| "grad_norm": 2.182984594576122, | |
| "learning_rate": 8.632873354854881e-06, | |
| "loss": 0.1059, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.126984126984127, | |
| "grad_norm": 2.195083172027126, | |
| "learning_rate": 8.620859307187339e-06, | |
| "loss": 0.0941, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.1312169312169311, | |
| "grad_norm": 1.8505253617934838, | |
| "learning_rate": 8.608801140491811e-06, | |
| "loss": 0.0835, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.1354497354497355, | |
| "grad_norm": 1.9135866347855481, | |
| "learning_rate": 8.596699001693257e-06, | |
| "loss": 0.0774, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.1396825396825396, | |
| "grad_norm": 1.954938336450862, | |
| "learning_rate": 8.584553038252415e-06, | |
| "loss": 0.0855, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.143915343915344, | |
| "grad_norm": 1.6669954564422538, | |
| "learning_rate": 8.572363398164017e-06, | |
| "loss": 0.0758, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.1481481481481481, | |
| "grad_norm": 1.6351480365128717, | |
| "learning_rate": 8.560130229954985e-06, | |
| "loss": 0.0683, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.1523809523809523, | |
| "grad_norm": 1.7579143840699374, | |
| "learning_rate": 8.547853682682605e-06, | |
| "loss": 0.0747, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.1566137566137566, | |
| "grad_norm": 1.5569433244482247, | |
| "learning_rate": 8.535533905932739e-06, | |
| "loss": 0.0671, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.1608465608465608, | |
| "grad_norm": 1.8671513527781665, | |
| "learning_rate": 8.523171049817974e-06, | |
| "loss": 0.0737, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.1650793650793652, | |
| "grad_norm": 1.871079630531322, | |
| "learning_rate": 8.510765264975813e-06, | |
| "loss": 0.067, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.1693121693121693, | |
| "grad_norm": 1.7124717161637384, | |
| "learning_rate": 8.498316702566828e-06, | |
| "loss": 0.0661, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.1735449735449737, | |
| "grad_norm": 1.5069679809087728, | |
| "learning_rate": 8.485825514272824e-06, | |
| "loss": 0.0626, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.1777777777777778, | |
| "grad_norm": 1.637366530387125, | |
| "learning_rate": 8.473291852294986e-06, | |
| "loss": 0.0676, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.182010582010582, | |
| "grad_norm": 1.870549503958005, | |
| "learning_rate": 8.460715869352035e-06, | |
| "loss": 0.0673, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.1862433862433863, | |
| "grad_norm": 2.6075150985041775, | |
| "learning_rate": 8.44809771867835e-06, | |
| "loss": 0.1037, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.1904761904761905, | |
| "grad_norm": 1.8129166707802178, | |
| "learning_rate": 8.435437554022116e-06, | |
| "loss": 0.0658, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.1947089947089946, | |
| "grad_norm": 1.9273840349175055, | |
| "learning_rate": 8.422735529643445e-06, | |
| "loss": 0.0692, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.198941798941799, | |
| "grad_norm": 1.946526317802552, | |
| "learning_rate": 8.409991800312493e-06, | |
| "loss": 0.0872, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.2031746031746031, | |
| "grad_norm": 2.093538504659771, | |
| "learning_rate": 8.397206521307584e-06, | |
| "loss": 0.0575, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.2074074074074075, | |
| "grad_norm": 1.8111794559902639, | |
| "learning_rate": 8.384379848413304e-06, | |
| "loss": 0.0536, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.2116402116402116, | |
| "grad_norm": 1.4485261428861709, | |
| "learning_rate": 8.371511937918616e-06, | |
| "loss": 0.0567, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.215873015873016, | |
| "grad_norm": 2.2184787541807047, | |
| "learning_rate": 8.358602946614952e-06, | |
| "loss": 0.0777, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.2201058201058201, | |
| "grad_norm": 2.019737455684998, | |
| "learning_rate": 8.345653031794292e-06, | |
| "loss": 0.0679, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.2243386243386243, | |
| "grad_norm": 1.6322133861673878, | |
| "learning_rate": 8.332662351247262e-06, | |
| "loss": 0.0473, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.2285714285714286, | |
| "grad_norm": 2.2122138272956797, | |
| "learning_rate": 8.319631063261209e-06, | |
| "loss": 0.0902, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.2328042328042328, | |
| "grad_norm": 1.7651396895281377, | |
| "learning_rate": 8.30655932661826e-06, | |
| "loss": 0.0732, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.237037037037037, | |
| "grad_norm": 1.8538276922333994, | |
| "learning_rate": 8.293447300593402e-06, | |
| "loss": 0.0688, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.2412698412698413, | |
| "grad_norm": 2.15194163484418, | |
| "learning_rate": 8.280295144952537e-06, | |
| "loss": 0.0764, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.2455026455026454, | |
| "grad_norm": 1.7234143791825733, | |
| "learning_rate": 8.267103019950529e-06, | |
| "loss": 0.0538, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.2497354497354498, | |
| "grad_norm": 2.293390918667569, | |
| "learning_rate": 8.253871086329255e-06, | |
| "loss": 0.0769, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.253968253968254, | |
| "grad_norm": 1.8852414195646272, | |
| "learning_rate": 8.240599505315656e-06, | |
| "loss": 0.0584, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.2582010582010583, | |
| "grad_norm": 2.1521764452864973, | |
| "learning_rate": 8.227288438619754e-06, | |
| "loss": 0.0718, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.2624338624338625, | |
| "grad_norm": 1.3679302426411497, | |
| "learning_rate": 8.213938048432697e-06, | |
| "loss": 0.0476, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.2666666666666666, | |
| "grad_norm": 1.8759901982931633, | |
| "learning_rate": 8.200548497424779e-06, | |
| "loss": 0.0588, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.270899470899471, | |
| "grad_norm": 2.1115534695308757, | |
| "learning_rate": 8.18711994874345e-06, | |
| "loss": 0.0474, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.2751322751322751, | |
| "grad_norm": 1.42054603929946, | |
| "learning_rate": 8.173652566011339e-06, | |
| "loss": 0.0491, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.2793650793650793, | |
| "grad_norm": 2.136699369797229, | |
| "learning_rate": 8.160146513324256e-06, | |
| "loss": 0.087, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.2835978835978836, | |
| "grad_norm": 1.7069088981866332, | |
| "learning_rate": 8.146601955249187e-06, | |
| "loss": 0.0535, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.2878306878306878, | |
| "grad_norm": 2.678199387264105, | |
| "learning_rate": 8.133019056822303e-06, | |
| "loss": 0.0858, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.2920634920634921, | |
| "grad_norm": 2.400768922479514, | |
| "learning_rate": 8.119397983546932e-06, | |
| "loss": 0.0838, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.2962962962962963, | |
| "grad_norm": 2.0384750677283283, | |
| "learning_rate": 8.105738901391553e-06, | |
| "loss": 0.0714, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.3005291005291006, | |
| "grad_norm": 1.8955000216446414, | |
| "learning_rate": 8.092041976787772e-06, | |
| "loss": 0.0653, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.3047619047619048, | |
| "grad_norm": 1.8920868109479223, | |
| "learning_rate": 8.078307376628292e-06, | |
| "loss": 0.0693, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.308994708994709, | |
| "grad_norm": 2.0890907162900434, | |
| "learning_rate": 8.064535268264883e-06, | |
| "loss": 0.0814, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.3132275132275133, | |
| "grad_norm": 1.8374660007465378, | |
| "learning_rate": 8.05072581950634e-06, | |
| "loss": 0.0697, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.3174603174603174, | |
| "grad_norm": 1.7534716195041071, | |
| "learning_rate": 8.036879198616434e-06, | |
| "loss": 0.0643, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.3216931216931216, | |
| "grad_norm": 1.9643071053476988, | |
| "learning_rate": 8.022995574311876e-06, | |
| "loss": 0.0808, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.325925925925926, | |
| "grad_norm": 1.7283546374071002, | |
| "learning_rate": 8.009075115760243e-06, | |
| "loss": 0.0772, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.33015873015873, | |
| "grad_norm": 1.5108710880067853, | |
| "learning_rate": 7.99511799257793e-06, | |
| "loss": 0.059, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.3343915343915345, | |
| "grad_norm": 1.4537965690657546, | |
| "learning_rate": 7.981124374828079e-06, | |
| "loss": 0.0594, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.3386243386243386, | |
| "grad_norm": 2.237037563797428, | |
| "learning_rate": 7.967094433018508e-06, | |
| "loss": 0.0768, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.342857142857143, | |
| "grad_norm": 1.6057750143434733, | |
| "learning_rate": 7.953028338099628e-06, | |
| "loss": 0.0696, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.3470899470899471, | |
| "grad_norm": 1.5702160775846645, | |
| "learning_rate": 7.938926261462366e-06, | |
| "loss": 0.065, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.3513227513227513, | |
| "grad_norm": 1.8059545714263667, | |
| "learning_rate": 7.92478837493608e-06, | |
| "loss": 0.0751, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.3555555555555556, | |
| "grad_norm": 1.8812566576792862, | |
| "learning_rate": 7.910614850786448e-06, | |
| "loss": 0.066, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.3597883597883598, | |
| "grad_norm": 1.64104762399391, | |
| "learning_rate": 7.896405861713393e-06, | |
| "loss": 0.0612, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.364021164021164, | |
| "grad_norm": 1.938404168025234, | |
| "learning_rate": 7.882161580848966e-06, | |
| "loss": 0.0716, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.3682539682539683, | |
| "grad_norm": 1.9354768854944182, | |
| "learning_rate": 7.86788218175523e-06, | |
| "loss": 0.0625, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.3724867724867724, | |
| "grad_norm": 1.7263366442747021, | |
| "learning_rate": 7.85356783842216e-06, | |
| "loss": 0.0641, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.3767195767195768, | |
| "grad_norm": 1.5396345057515881, | |
| "learning_rate": 7.839218725265507e-06, | |
| "loss": 0.0553, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.380952380952381, | |
| "grad_norm": 1.8525012205903337, | |
| "learning_rate": 7.82483501712469e-06, | |
| "loss": 0.0766, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.3851851851851853, | |
| "grad_norm": 1.6334096140515466, | |
| "learning_rate": 7.810416889260653e-06, | |
| "loss": 0.0572, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.3894179894179894, | |
| "grad_norm": 1.5922133675439893, | |
| "learning_rate": 7.795964517353734e-06, | |
| "loss": 0.0687, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.3936507936507936, | |
| "grad_norm": 1.2091317327665092, | |
| "learning_rate": 7.781478077501526e-06, | |
| "loss": 0.0424, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.397883597883598, | |
| "grad_norm": 2.132743017629856, | |
| "learning_rate": 7.76695774621672e-06, | |
| "loss": 0.0617, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.402116402116402, | |
| "grad_norm": 2.178343442230559, | |
| "learning_rate": 7.752403700424978e-06, | |
| "loss": 0.0767, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.4063492063492062, | |
| "grad_norm": 2.0956999898184905, | |
| "learning_rate": 7.737816117462752e-06, | |
| "loss": 0.0856, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.4105820105820106, | |
| "grad_norm": 2.328993538503913, | |
| "learning_rate": 7.723195175075136e-06, | |
| "loss": 0.0697, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.4148148148148147, | |
| "grad_norm": 2.004814063553197, | |
| "learning_rate": 7.7085410514137e-06, | |
| "loss": 0.0719, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.4190476190476191, | |
| "grad_norm": 1.560361912982766, | |
| "learning_rate": 7.693853925034316e-06, | |
| "loss": 0.0478, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.4232804232804233, | |
| "grad_norm": 1.6252770277096498, | |
| "learning_rate": 7.679133974894984e-06, | |
| "loss": 0.0593, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.4275132275132276, | |
| "grad_norm": 1.8015191094916037, | |
| "learning_rate": 7.66438138035365e-06, | |
| "loss": 0.0651, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.4317460317460318, | |
| "grad_norm": 2.3510718772860475, | |
| "learning_rate": 7.649596321166024e-06, | |
| "loss": 0.0774, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.435978835978836, | |
| "grad_norm": 2.7202765882488382, | |
| "learning_rate": 7.634778977483389e-06, | |
| "loss": 0.0861, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.4402116402116403, | |
| "grad_norm": 2.336843036814147, | |
| "learning_rate": 7.619929529850397e-06, | |
| "loss": 0.0755, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 1.9373601062542976, | |
| "learning_rate": 7.605048159202884e-06, | |
| "loss": 0.0633, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.4486772486772486, | |
| "grad_norm": 1.6817059556405092, | |
| "learning_rate": 7.590135046865652e-06, | |
| "loss": 0.063, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.452910052910053, | |
| "grad_norm": 2.1941319252714404, | |
| "learning_rate": 7.575190374550272e-06, | |
| "loss": 0.0682, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.457142857142857, | |
| "grad_norm": 1.6374443562647756, | |
| "learning_rate": 7.560214324352858e-06, | |
| "loss": 0.0566, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.4613756613756614, | |
| "grad_norm": 2.007828522647094, | |
| "learning_rate": 7.545207078751858e-06, | |
| "loss": 0.0714, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.4656084656084656, | |
| "grad_norm": 1.4907319319460866, | |
| "learning_rate": 7.530168820605819e-06, | |
| "loss": 0.0528, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.46984126984127, | |
| "grad_norm": 1.7939591139887787, | |
| "learning_rate": 7.515099733151177e-06, | |
| "loss": 0.0539, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.474074074074074, | |
| "grad_norm": 1.4906209466069649, | |
| "learning_rate": 7.500000000000001e-06, | |
| "loss": 0.0605, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.4783068783068782, | |
| "grad_norm": 1.8613522048230187, | |
| "learning_rate": 7.484869805137778e-06, | |
| "loss": 0.0664, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.4825396825396826, | |
| "grad_norm": 1.8229049611749533, | |
| "learning_rate": 7.469709332921155e-06, | |
| "loss": 0.0568, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.4867724867724867, | |
| "grad_norm": 1.4355304492933953, | |
| "learning_rate": 7.454518768075705e-06, | |
| "loss": 0.0512, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.491005291005291, | |
| "grad_norm": 2.1617485048622775, | |
| "learning_rate": 7.4392982956936644e-06, | |
| "loss": 0.0866, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.4952380952380953, | |
| "grad_norm": 1.6436091016901637, | |
| "learning_rate": 7.424048101231687e-06, | |
| "loss": 0.0569, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.4994708994708994, | |
| "grad_norm": 2.117413284430222, | |
| "learning_rate": 7.408768370508577e-06, | |
| "loss": 0.073, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.5037037037037035, | |
| "grad_norm": 2.0982337999329355, | |
| "learning_rate": 7.393459289703035e-06, | |
| "loss": 0.0548, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.507936507936508, | |
| "grad_norm": 1.9819236384785228, | |
| "learning_rate": 7.378121045351378e-06, | |
| "loss": 0.065, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.5121693121693123, | |
| "grad_norm": 1.9155338730187241, | |
| "learning_rate": 7.362753824345271e-06, | |
| "loss": 0.0619, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.5164021164021164, | |
| "grad_norm": 1.834898522988967, | |
| "learning_rate": 7.347357813929455e-06, | |
| "loss": 0.0652, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.5206349206349206, | |
| "grad_norm": 2.2906521765290715, | |
| "learning_rate": 7.3319332016994575e-06, | |
| "loss": 0.0754, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.524867724867725, | |
| "grad_norm": 2.1765523809119127, | |
| "learning_rate": 7.31648017559931e-06, | |
| "loss": 0.0692, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.529100529100529, | |
| "grad_norm": 2.000515811428855, | |
| "learning_rate": 7.300998923919259e-06, | |
| "loss": 0.0611, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.5333333333333332, | |
| "grad_norm": 1.8581129150182212, | |
| "learning_rate": 7.285489635293472e-06, | |
| "loss": 0.0719, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.5375661375661376, | |
| "grad_norm": 1.9376344291681409, | |
| "learning_rate": 7.269952498697734e-06, | |
| "loss": 0.062, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.541798941798942, | |
| "grad_norm": 1.5469896978040778, | |
| "learning_rate": 7.254387703447154e-06, | |
| "loss": 0.0463, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.5460317460317459, | |
| "grad_norm": 1.7399436482112474, | |
| "learning_rate": 7.238795439193849e-06, | |
| "loss": 0.0636, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.5502645502645502, | |
| "grad_norm": 2.0308279574862516, | |
| "learning_rate": 7.223175895924638e-06, | |
| "loss": 0.0814, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.5544973544973546, | |
| "grad_norm": 1.771006156983727, | |
| "learning_rate": 7.207529263958727e-06, | |
| "loss": 0.0707, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.5587301587301587, | |
| "grad_norm": 1.692251815342732, | |
| "learning_rate": 7.191855733945388e-06, | |
| "loss": 0.0502, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.5629629629629629, | |
| "grad_norm": 1.9394355705804434, | |
| "learning_rate": 7.176155496861639e-06, | |
| "loss": 0.0665, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.5671957671957673, | |
| "grad_norm": 1.6884842235455637, | |
| "learning_rate": 7.160428744009913e-06, | |
| "loss": 0.0569, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 1.7508558255498743, | |
| "learning_rate": 7.1446756670157306e-06, | |
| "loss": 0.0629, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.5756613756613755, | |
| "grad_norm": 1.3658017585158069, | |
| "learning_rate": 7.128896457825364e-06, | |
| "loss": 0.0419, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.57989417989418, | |
| "grad_norm": 1.7555663442176712, | |
| "learning_rate": 7.113091308703498e-06, | |
| "loss": 0.0592, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.5841269841269843, | |
| "grad_norm": 1.631900320996386, | |
| "learning_rate": 7.0972604122308865e-06, | |
| "loss": 0.0635, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.5883597883597882, | |
| "grad_norm": 1.7741916871006203, | |
| "learning_rate": 7.081403961302007e-06, | |
| "loss": 0.0633, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.5925925925925926, | |
| "grad_norm": 1.8151890739104601, | |
| "learning_rate": 7.06552214912271e-06, | |
| "loss": 0.0671, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.596825396825397, | |
| "grad_norm": 1.797004370573756, | |
| "learning_rate": 7.049615169207864e-06, | |
| "loss": 0.0588, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.601058201058201, | |
| "grad_norm": 2.3631789146103084, | |
| "learning_rate": 7.033683215379002e-06, | |
| "loss": 0.0752, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.6052910052910052, | |
| "grad_norm": 1.763180877149612, | |
| "learning_rate": 7.0177264817619514e-06, | |
| "loss": 0.0557, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.6095238095238096, | |
| "grad_norm": 1.8995850867394273, | |
| "learning_rate": 7.0017451627844765e-06, | |
| "loss": 0.0717, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.6137566137566137, | |
| "grad_norm": 1.801085177997209, | |
| "learning_rate": 6.985739453173903e-06, | |
| "loss": 0.0592, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.6179894179894179, | |
| "grad_norm": 1.7879752968994786, | |
| "learning_rate": 6.9697095479547564e-06, | |
| "loss": 0.0585, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.6222222222222222, | |
| "grad_norm": 1.6567664297191056, | |
| "learning_rate": 6.953655642446368e-06, | |
| "loss": 0.0457, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.6264550264550266, | |
| "grad_norm": 1.6155271306262087, | |
| "learning_rate": 6.9375779322605154e-06, | |
| "loss": 0.059, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.6306878306878307, | |
| "grad_norm": 1.68695010471739, | |
| "learning_rate": 6.921476613299018e-06, | |
| "loss": 0.0542, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.6349206349206349, | |
| "grad_norm": 1.9421206046284432, | |
| "learning_rate": 6.905351881751372e-06, | |
| "loss": 0.0717, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.6391534391534393, | |
| "grad_norm": 2.448162982815167, | |
| "learning_rate": 6.889203934092337e-06, | |
| "loss": 0.0742, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.6433862433862434, | |
| "grad_norm": 1.9166726296413361, | |
| "learning_rate": 6.873032967079562e-06, | |
| "loss": 0.0581, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.6476190476190475, | |
| "grad_norm": 1.8239195781910584, | |
| "learning_rate": 6.856839177751175e-06, | |
| "loss": 0.0656, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.651851851851852, | |
| "grad_norm": 1.3729431544806852, | |
| "learning_rate": 6.840622763423391e-06, | |
| "loss": 0.0402, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.656084656084656, | |
| "grad_norm": 1.7457900027219597, | |
| "learning_rate": 6.824383921688098e-06, | |
| "loss": 0.0581, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.6603174603174602, | |
| "grad_norm": 1.4041185285984514, | |
| "learning_rate": 6.808122850410461e-06, | |
| "loss": 0.0445, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.6645502645502646, | |
| "grad_norm": 2.05451457672099, | |
| "learning_rate": 6.7918397477265e-06, | |
| "loss": 0.071, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.668783068783069, | |
| "grad_norm": 1.6512575262339615, | |
| "learning_rate": 6.775534812040686e-06, | |
| "loss": 0.0553, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.673015873015873, | |
| "grad_norm": 1.8115387096252427, | |
| "learning_rate": 6.759208242023509e-06, | |
| "loss": 0.0576, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.6772486772486772, | |
| "grad_norm": 1.2702269626974747, | |
| "learning_rate": 6.7428602366090764e-06, | |
| "loss": 0.0381, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.6814814814814816, | |
| "grad_norm": 1.5144029396774048, | |
| "learning_rate": 6.7264909949926735e-06, | |
| "loss": 0.0501, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.6857142857142857, | |
| "grad_norm": 1.4102482360717825, | |
| "learning_rate": 6.710100716628345e-06, | |
| "loss": 0.0409, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.6899470899470899, | |
| "grad_norm": 2.1685643870197486, | |
| "learning_rate": 6.693689601226458e-06, | |
| "loss": 0.0737, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.6941798941798942, | |
| "grad_norm": 1.900913941941258, | |
| "learning_rate": 6.677257848751276e-06, | |
| "loss": 0.0688, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.6984126984126984, | |
| "grad_norm": 1.6471464997248573, | |
| "learning_rate": 6.6608056594185166e-06, | |
| "loss": 0.0491, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.7026455026455025, | |
| "grad_norm": 1.7754346406800103, | |
| "learning_rate": 6.644333233692917e-06, | |
| "loss": 0.0559, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.7068783068783069, | |
| "grad_norm": 2.201736043430736, | |
| "learning_rate": 6.627840772285784e-06, | |
| "loss": 0.0768, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.7111111111111112, | |
| "grad_norm": 1.728174746623607, | |
| "learning_rate": 6.611328476152557e-06, | |
| "loss": 0.0544, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.7153439153439154, | |
| "grad_norm": 1.653359901954905, | |
| "learning_rate": 6.594796546490351e-06, | |
| "loss": 0.0516, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.7195767195767195, | |
| "grad_norm": 2.0915084346199113, | |
| "learning_rate": 6.578245184735513e-06, | |
| "loss": 0.0677, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.723809523809524, | |
| "grad_norm": 1.3338655011947615, | |
| "learning_rate": 6.561674592561164e-06, | |
| "loss": 0.0452, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.728042328042328, | |
| "grad_norm": 1.916548372288022, | |
| "learning_rate": 6.545084971874738e-06, | |
| "loss": 0.0511, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.7322751322751322, | |
| "grad_norm": 1.4723543528268832, | |
| "learning_rate": 6.5284765248155295e-06, | |
| "loss": 0.0427, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.7365079365079366, | |
| "grad_norm": 1.7767150430559346, | |
| "learning_rate": 6.5118494537522235e-06, | |
| "loss": 0.0549, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.7407407407407407, | |
| "grad_norm": 2.073520060997652, | |
| "learning_rate": 6.495203961280434e-06, | |
| "loss": 0.0705, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.7449735449735448, | |
| "grad_norm": 1.9164477972239198, | |
| "learning_rate": 6.4785402502202345e-06, | |
| "loss": 0.0467, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.7492063492063492, | |
| "grad_norm": 1.4479847528255574, | |
| "learning_rate": 6.461858523613684e-06, | |
| "loss": 0.0411, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.7534391534391536, | |
| "grad_norm": 1.6478763296120762, | |
| "learning_rate": 6.445158984722358e-06, | |
| "loss": 0.0506, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.7576719576719577, | |
| "grad_norm": 1.7693910067270657, | |
| "learning_rate": 6.428441837024868e-06, | |
| "loss": 0.0453, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.7619047619047619, | |
| "grad_norm": 1.3800910764927643, | |
| "learning_rate": 6.411707284214384e-06, | |
| "loss": 0.0351, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.7661375661375662, | |
| "grad_norm": 1.7577530047799417, | |
| "learning_rate": 6.3949555301961474e-06, | |
| "loss": 0.0661, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.7703703703703704, | |
| "grad_norm": 1.8222505148681252, | |
| "learning_rate": 6.378186779084996e-06, | |
| "loss": 0.0628, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.7746031746031745, | |
| "grad_norm": 2.0358038447360047, | |
| "learning_rate": 6.361401235202872e-06, | |
| "loss": 0.0507, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.7788359788359789, | |
| "grad_norm": 2.1606830531676495, | |
| "learning_rate": 6.344599103076329e-06, | |
| "loss": 0.071, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.783068783068783, | |
| "grad_norm": 1.579905383421796, | |
| "learning_rate": 6.327780587434045e-06, | |
| "loss": 0.0464, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.7873015873015872, | |
| "grad_norm": 2.361618973258393, | |
| "learning_rate": 6.310945893204324e-06, | |
| "loss": 0.0559, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.7915343915343915, | |
| "grad_norm": 2.4256436411132274, | |
| "learning_rate": 6.294095225512604e-06, | |
| "loss": 0.0703, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.795767195767196, | |
| "grad_norm": 2.0483972204305405, | |
| "learning_rate": 6.277228789678953e-06, | |
| "loss": 0.076, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.6486424050280866, | |
| "learning_rate": 6.26034679121557e-06, | |
| "loss": 0.0354, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.8042328042328042, | |
| "grad_norm": 1.7057849786251837, | |
| "learning_rate": 6.243449435824276e-06, | |
| "loss": 0.0459, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.8084656084656086, | |
| "grad_norm": 1.4964739470323853, | |
| "learning_rate": 6.2265369293940135e-06, | |
| "loss": 0.0468, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.8126984126984127, | |
| "grad_norm": 1.5712916952431084, | |
| "learning_rate": 6.209609477998339e-06, | |
| "loss": 0.0511, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.8169312169312168, | |
| "grad_norm": 1.6075446649270715, | |
| "learning_rate": 6.192667287892905e-06, | |
| "loss": 0.0488, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.8211640211640212, | |
| "grad_norm": 1.766038600491867, | |
| "learning_rate": 6.17571056551295e-06, | |
| "loss": 0.0549, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.8253968253968254, | |
| "grad_norm": 1.438358120751698, | |
| "learning_rate": 6.158739517470786e-06, | |
| "loss": 0.042, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.8296296296296295, | |
| "grad_norm": 1.855618360148089, | |
| "learning_rate": 6.141754350553279e-06, | |
| "loss": 0.0636, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.8338624338624339, | |
| "grad_norm": 1.6267674194949144, | |
| "learning_rate": 6.124755271719326e-06, | |
| "loss": 0.0565, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.8380952380952382, | |
| "grad_norm": 1.957873438669131, | |
| "learning_rate": 6.107742488097338e-06, | |
| "loss": 0.0589, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.8423280423280424, | |
| "grad_norm": 1.8169340288404146, | |
| "learning_rate": 6.090716206982714e-06, | |
| "loss": 0.0619, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.8465608465608465, | |
| "grad_norm": 1.9473267362173434, | |
| "learning_rate": 6.073676635835317e-06, | |
| "loss": 0.054, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.8507936507936509, | |
| "grad_norm": 1.6984158293128067, | |
| "learning_rate": 6.056623982276945e-06, | |
| "loss": 0.043, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.855026455026455, | |
| "grad_norm": 1.7978129108415464, | |
| "learning_rate": 6.039558454088796e-06, | |
| "loss": 0.0536, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.8592592592592592, | |
| "grad_norm": 1.4388535107743592, | |
| "learning_rate": 6.022480259208951e-06, | |
| "loss": 0.0461, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.8634920634920635, | |
| "grad_norm": 1.9385002527687403, | |
| "learning_rate": 6.005389605729824e-06, | |
| "loss": 0.0606, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.8677248677248677, | |
| "grad_norm": 1.715710356385189, | |
| "learning_rate": 5.988286701895631e-06, | |
| "loss": 0.0505, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.8719576719576718, | |
| "grad_norm": 1.9893593627181096, | |
| "learning_rate": 5.97117175609986e-06, | |
| "loss": 0.0491, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.8761904761904762, | |
| "grad_norm": 1.669051511290126, | |
| "learning_rate": 5.954044976882725e-06, | |
| "loss": 0.0443, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.8804232804232806, | |
| "grad_norm": 1.8334555873159546, | |
| "learning_rate": 5.936906572928625e-06, | |
| "loss": 0.0595, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.8846560846560847, | |
| "grad_norm": 1.482799215009145, | |
| "learning_rate": 5.919756753063601e-06, | |
| "loss": 0.0419, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 2.158817135873301, | |
| "learning_rate": 5.902595726252801e-06, | |
| "loss": 0.0523, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.8931216931216932, | |
| "grad_norm": 1.4512278026518133, | |
| "learning_rate": 5.885423701597918e-06, | |
| "loss": 0.0341, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.8973544973544973, | |
| "grad_norm": 2.055857253134802, | |
| "learning_rate": 5.8682408883346535e-06, | |
| "loss": 0.0678, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.9015873015873015, | |
| "grad_norm": 1.4630518070623408, | |
| "learning_rate": 5.851047495830163e-06, | |
| "loss": 0.0428, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.9058201058201059, | |
| "grad_norm": 1.5001384600677683, | |
| "learning_rate": 5.8338437335805124e-06, | |
| "loss": 0.0487, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.91005291005291, | |
| "grad_norm": 1.5957812640910574, | |
| "learning_rate": 5.816629811208112e-06, | |
| "loss": 0.0454, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.9142857142857141, | |
| "grad_norm": 1.7392417324751044, | |
| "learning_rate": 5.799405938459175e-06, | |
| "loss": 0.0545, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.9185185185185185, | |
| "grad_norm": 1.2059754368705733, | |
| "learning_rate": 5.782172325201155e-06, | |
| "loss": 0.0332, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.9227513227513229, | |
| "grad_norm": 2.1573344543845696, | |
| "learning_rate": 5.764929181420191e-06, | |
| "loss": 0.0504, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.926984126984127, | |
| "grad_norm": 1.7501207563945467, | |
| "learning_rate": 5.747676717218549e-06, | |
| "loss": 0.0523, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.9312169312169312, | |
| "grad_norm": 1.4778971571819108, | |
| "learning_rate": 5.730415142812059e-06, | |
| "loss": 0.0374, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.9354497354497355, | |
| "grad_norm": 1.9397580291264296, | |
| "learning_rate": 5.7131446685275595e-06, | |
| "loss": 0.0483, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.9396825396825397, | |
| "grad_norm": 1.5774150951682633, | |
| "learning_rate": 5.695865504800328e-06, | |
| "loss": 0.0427, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.9439153439153438, | |
| "grad_norm": 1.5205801074165397, | |
| "learning_rate": 5.678577862171523e-06, | |
| "loss": 0.0403, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.9481481481481482, | |
| "grad_norm": 1.9814309441969933, | |
| "learning_rate": 5.661281951285613e-06, | |
| "loss": 0.0546, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 1.9523809523809523, | |
| "grad_norm": 1.8199362216465413, | |
| "learning_rate": 5.643977982887815e-06, | |
| "loss": 0.0556, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.9566137566137565, | |
| "grad_norm": 2.403211392183965, | |
| "learning_rate": 5.626666167821522e-06, | |
| "loss": 0.063, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 1.9608465608465608, | |
| "grad_norm": 1.3278174443006239, | |
| "learning_rate": 5.609346717025738e-06, | |
| "loss": 0.0412, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 1.9650793650793652, | |
| "grad_norm": 1.3097142822228083, | |
| "learning_rate": 5.592019841532507e-06, | |
| "loss": 0.038, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.9693121693121693, | |
| "grad_norm": 1.3209428741356222, | |
| "learning_rate": 5.5746857524643335e-06, | |
| "loss": 0.0321, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.9735449735449735, | |
| "grad_norm": 1.4633993153900189, | |
| "learning_rate": 5.557344661031628e-06, | |
| "loss": 0.0439, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 1.9777777777777779, | |
| "grad_norm": 1.6791239992400115, | |
| "learning_rate": 5.539996778530114e-06, | |
| "loss": 0.0496, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.982010582010582, | |
| "grad_norm": 1.6259211954562258, | |
| "learning_rate": 5.522642316338268e-06, | |
| "loss": 0.0361, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 1.9862433862433861, | |
| "grad_norm": 1.6806163306838742, | |
| "learning_rate": 5.505281485914732e-06, | |
| "loss": 0.0419, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.9904761904761905, | |
| "grad_norm": 1.9595273597942116, | |
| "learning_rate": 5.487914498795748e-06, | |
| "loss": 0.0555, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.9947089947089947, | |
| "grad_norm": 1.8422091219424643, | |
| "learning_rate": 5.470541566592573e-06, | |
| "loss": 0.0466, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 1.9989417989417988, | |
| "grad_norm": 1.7500818137119094, | |
| "learning_rate": 5.453162900988902e-06, | |
| "loss": 0.0438, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.761502971744513, | |
| "learning_rate": 5.435778713738292e-06, | |
| "loss": 0.0405, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 2.0042328042328044, | |
| "grad_norm": 1.425482793293008, | |
| "learning_rate": 5.41838921666158e-06, | |
| "loss": 0.0281, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 2.0084656084656083, | |
| "grad_norm": 2.068969474891088, | |
| "learning_rate": 5.400994621644294e-06, | |
| "loss": 0.0328, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 2.0126984126984127, | |
| "grad_norm": 1.3694895890619094, | |
| "learning_rate": 5.383595140634093e-06, | |
| "loss": 0.0267, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 2.016931216931217, | |
| "grad_norm": 0.9597151794646276, | |
| "learning_rate": 5.366190985638159e-06, | |
| "loss": 0.0239, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 2.0211640211640214, | |
| "grad_norm": 0.9612439901820802, | |
| "learning_rate": 5.348782368720627e-06, | |
| "loss": 0.0195, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 2.0253968253968253, | |
| "grad_norm": 1.5229336713881063, | |
| "learning_rate": 5.3313695020000026e-06, | |
| "loss": 0.0286, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.0296296296296297, | |
| "grad_norm": 1.6211741677941989, | |
| "learning_rate": 5.3139525976465675e-06, | |
| "loss": 0.0297, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 2.033862433862434, | |
| "grad_norm": 1.4938054925255841, | |
| "learning_rate": 5.296531867879809e-06, | |
| "loss": 0.0246, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 2.038095238095238, | |
| "grad_norm": 1.4029085405633699, | |
| "learning_rate": 5.27910752496582e-06, | |
| "loss": 0.0225, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 2.0423280423280423, | |
| "grad_norm": 1.3430049638024157, | |
| "learning_rate": 5.2616797812147205e-06, | |
| "loss": 0.0208, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 2.0465608465608467, | |
| "grad_norm": 1.2721185303171878, | |
| "learning_rate": 5.244248848978067e-06, | |
| "loss": 0.0232, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 2.0507936507936506, | |
| "grad_norm": 1.463578245334908, | |
| "learning_rate": 5.226814940646268e-06, | |
| "loss": 0.029, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 2.055026455026455, | |
| "grad_norm": 1.702815967014725, | |
| "learning_rate": 5.209378268645998e-06, | |
| "loss": 0.0323, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 2.0592592592592593, | |
| "grad_norm": 1.26455994835701, | |
| "learning_rate": 5.1919390454376e-06, | |
| "loss": 0.0223, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 2.0634920634920633, | |
| "grad_norm": 1.110889327620989, | |
| "learning_rate": 5.174497483512506e-06, | |
| "loss": 0.0265, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 2.0677248677248676, | |
| "grad_norm": 1.2207931374567949, | |
| "learning_rate": 5.157053795390642e-06, | |
| "loss": 0.0226, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.071957671957672, | |
| "grad_norm": 1.0427216738524305, | |
| "learning_rate": 5.139608193617846e-06, | |
| "loss": 0.0175, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 2.0761904761904764, | |
| "grad_norm": 1.277813572552022, | |
| "learning_rate": 5.1221608907632665e-06, | |
| "loss": 0.0207, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 2.0804232804232803, | |
| "grad_norm": 1.4478523585989624, | |
| "learning_rate": 5.1047120994167855e-06, | |
| "loss": 0.0205, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 2.0846560846560847, | |
| "grad_norm": 0.9255571718220782, | |
| "learning_rate": 5.087262032186418e-06, | |
| "loss": 0.0176, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 2.088888888888889, | |
| "grad_norm": 1.321686704356327, | |
| "learning_rate": 5.069810901695727e-06, | |
| "loss": 0.0214, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 2.093121693121693, | |
| "grad_norm": 1.7093390536661062, | |
| "learning_rate": 5.05235892058123e-06, | |
| "loss": 0.0251, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 2.0973544973544973, | |
| "grad_norm": 1.8903454646946267, | |
| "learning_rate": 5.034906301489808e-06, | |
| "loss": 0.0328, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 2.1015873015873017, | |
| "grad_norm": 1.622033237548866, | |
| "learning_rate": 5.0174532570761194e-06, | |
| "loss": 0.0222, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 2.105820105820106, | |
| "grad_norm": 1.3898704763132579, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0213, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 2.11005291005291, | |
| "grad_norm": 1.7802781541406807, | |
| "learning_rate": 4.982546742923883e-06, | |
| "loss": 0.0351, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.1142857142857143, | |
| "grad_norm": 1.3745676765685821, | |
| "learning_rate": 4.965093698510192e-06, | |
| "loss": 0.0229, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 2.1185185185185187, | |
| "grad_norm": 1.2480925927636604, | |
| "learning_rate": 4.9476410794187726e-06, | |
| "loss": 0.0224, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 2.1227513227513226, | |
| "grad_norm": 1.328383776404079, | |
| "learning_rate": 4.9301890983042744e-06, | |
| "loss": 0.021, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 2.126984126984127, | |
| "grad_norm": 1.2744683969863004, | |
| "learning_rate": 4.9127379678135825e-06, | |
| "loss": 0.0195, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 2.1312169312169313, | |
| "grad_norm": 1.5077037111075677, | |
| "learning_rate": 4.895287900583216e-06, | |
| "loss": 0.0203, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 2.1354497354497353, | |
| "grad_norm": 1.505975774545219, | |
| "learning_rate": 4.877839109236735e-06, | |
| "loss": 0.0272, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 2.1396825396825396, | |
| "grad_norm": 1.923885296333145, | |
| "learning_rate": 4.860391806382157e-06, | |
| "loss": 0.0328, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 2.143915343915344, | |
| "grad_norm": 1.1003948235042609, | |
| "learning_rate": 4.842946204609359e-06, | |
| "loss": 0.0205, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 2.148148148148148, | |
| "grad_norm": 1.5711105962165373, | |
| "learning_rate": 4.825502516487497e-06, | |
| "loss": 0.0239, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 2.1523809523809523, | |
| "grad_norm": 2.723763362350314, | |
| "learning_rate": 4.8080609545624004e-06, | |
| "loss": 0.0277, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.1566137566137566, | |
| "grad_norm": 1.1294689243975755, | |
| "learning_rate": 4.7906217313540035e-06, | |
| "loss": 0.025, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 2.160846560846561, | |
| "grad_norm": 1.2353799701389105, | |
| "learning_rate": 4.7731850593537316e-06, | |
| "loss": 0.0225, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 2.165079365079365, | |
| "grad_norm": 0.9875825016071724, | |
| "learning_rate": 4.755751151021934e-06, | |
| "loss": 0.0155, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 2.1693121693121693, | |
| "grad_norm": 1.058383517339813, | |
| "learning_rate": 4.738320218785281e-06, | |
| "loss": 0.0227, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 2.1735449735449737, | |
| "grad_norm": 1.3182471200484542, | |
| "learning_rate": 4.720892475034181e-06, | |
| "loss": 0.0205, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 2.1777777777777776, | |
| "grad_norm": 1.091377381304472, | |
| "learning_rate": 4.703468132120193e-06, | |
| "loss": 0.0248, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 2.182010582010582, | |
| "grad_norm": 1.505329948005569, | |
| "learning_rate": 4.686047402353433e-06, | |
| "loss": 0.0291, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 2.1862433862433863, | |
| "grad_norm": 1.0420378868483025, | |
| "learning_rate": 4.668630498000001e-06, | |
| "loss": 0.0178, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 2.1904761904761907, | |
| "grad_norm": 1.0770607200654685, | |
| "learning_rate": 4.651217631279374e-06, | |
| "loss": 0.0183, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 2.1947089947089946, | |
| "grad_norm": 1.0753499999979097, | |
| "learning_rate": 4.6338090143618435e-06, | |
| "loss": 0.0237, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.198941798941799, | |
| "grad_norm": 1.2102511948406212, | |
| "learning_rate": 4.6164048593659076e-06, | |
| "loss": 0.0212, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 2.2031746031746033, | |
| "grad_norm": 1.6612931544402387, | |
| "learning_rate": 4.5990053783557066e-06, | |
| "loss": 0.0306, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 2.2074074074074073, | |
| "grad_norm": 1.434922916344193, | |
| "learning_rate": 4.581610783338424e-06, | |
| "loss": 0.0173, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 2.2116402116402116, | |
| "grad_norm": 1.318046293179565, | |
| "learning_rate": 4.564221286261709e-06, | |
| "loss": 0.0188, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 2.215873015873016, | |
| "grad_norm": 1.5523824099307213, | |
| "learning_rate": 4.546837099011101e-06, | |
| "loss": 0.0279, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 2.22010582010582, | |
| "grad_norm": 1.265493250306095, | |
| "learning_rate": 4.529458433407429e-06, | |
| "loss": 0.0264, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 2.2243386243386243, | |
| "grad_norm": 1.4757784411924582, | |
| "learning_rate": 4.512085501204254e-06, | |
| "loss": 0.0264, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 2.2285714285714286, | |
| "grad_norm": 1.1688272093442453, | |
| "learning_rate": 4.494718514085269e-06, | |
| "loss": 0.0189, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 2.2328042328042326, | |
| "grad_norm": 1.4981131427762684, | |
| "learning_rate": 4.477357683661734e-06, | |
| "loss": 0.0238, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 2.237037037037037, | |
| "grad_norm": 1.060141826721873, | |
| "learning_rate": 4.460003221469886e-06, | |
| "loss": 0.0202, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.2412698412698413, | |
| "grad_norm": 1.9904078872978332, | |
| "learning_rate": 4.442655338968373e-06, | |
| "loss": 0.0201, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 2.2455026455026457, | |
| "grad_norm": 1.1693403834565992, | |
| "learning_rate": 4.425314247535668e-06, | |
| "loss": 0.021, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 2.2497354497354496, | |
| "grad_norm": 5.074805136944312, | |
| "learning_rate": 4.4079801584674955e-06, | |
| "loss": 0.0222, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 2.253968253968254, | |
| "grad_norm": 1.079029935941859, | |
| "learning_rate": 4.390653282974264e-06, | |
| "loss": 0.0199, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 2.2582010582010583, | |
| "grad_norm": 2.38714350665424, | |
| "learning_rate": 4.373333832178478e-06, | |
| "loss": 0.0298, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 2.2624338624338622, | |
| "grad_norm": 1.2813798644078875, | |
| "learning_rate": 4.356022017112187e-06, | |
| "loss": 0.0201, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 2.2666666666666666, | |
| "grad_norm": 1.305628889106575, | |
| "learning_rate": 4.3387180487143875e-06, | |
| "loss": 0.0173, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 2.270899470899471, | |
| "grad_norm": 1.094988751447929, | |
| "learning_rate": 4.321422137828479e-06, | |
| "loss": 0.0155, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 2.2751322751322753, | |
| "grad_norm": 1.348309678538079, | |
| "learning_rate": 4.304134495199675e-06, | |
| "loss": 0.0209, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 2.2793650793650793, | |
| "grad_norm": 1.5868771818843859, | |
| "learning_rate": 4.286855331472442e-06, | |
| "loss": 0.0282, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.2835978835978836, | |
| "grad_norm": 1.2350590787759976, | |
| "learning_rate": 4.269584857187942e-06, | |
| "loss": 0.0212, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 2.287830687830688, | |
| "grad_norm": 1.4470288037087002, | |
| "learning_rate": 4.2523232827814534e-06, | |
| "loss": 0.0192, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 2.292063492063492, | |
| "grad_norm": 1.4099588785263357, | |
| "learning_rate": 4.23507081857981e-06, | |
| "loss": 0.0184, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 2.2962962962962963, | |
| "grad_norm": 1.4782113067056102, | |
| "learning_rate": 4.217827674798845e-06, | |
| "loss": 0.023, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 2.3005291005291006, | |
| "grad_norm": 1.9086034541901193, | |
| "learning_rate": 4.200594061540827e-06, | |
| "loss": 0.0275, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 2.3047619047619046, | |
| "grad_norm": 1.2032912201710146, | |
| "learning_rate": 4.183370188791891e-06, | |
| "loss": 0.0196, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 2.308994708994709, | |
| "grad_norm": 1.3308907007952828, | |
| "learning_rate": 4.166156266419489e-06, | |
| "loss": 0.022, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 2.3132275132275133, | |
| "grad_norm": 1.0984488540294757, | |
| "learning_rate": 4.148952504169839e-06, | |
| "loss": 0.0228, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 2.317460317460317, | |
| "grad_norm": 0.7471478820780099, | |
| "learning_rate": 4.131759111665349e-06, | |
| "loss": 0.0111, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 2.3216931216931216, | |
| "grad_norm": 1.0717935625690418, | |
| "learning_rate": 4.114576298402085e-06, | |
| "loss": 0.0161, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.325925925925926, | |
| "grad_norm": 1.218749961666418, | |
| "learning_rate": 4.0974042737472005e-06, | |
| "loss": 0.0186, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 2.3301587301587303, | |
| "grad_norm": 2.8887759534542723, | |
| "learning_rate": 4.0802432469364e-06, | |
| "loss": 0.0254, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 2.3343915343915342, | |
| "grad_norm": 1.930475086228816, | |
| "learning_rate": 4.063093427071376e-06, | |
| "loss": 0.023, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 2.3386243386243386, | |
| "grad_norm": 1.8908317964947536, | |
| "learning_rate": 4.045955023117276e-06, | |
| "loss": 0.0285, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 2.342857142857143, | |
| "grad_norm": 1.1934657197447822, | |
| "learning_rate": 4.028828243900141e-06, | |
| "loss": 0.0201, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 2.3470899470899473, | |
| "grad_norm": 1.4865568918577254, | |
| "learning_rate": 4.0117132981043695e-06, | |
| "loss": 0.0273, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 2.3513227513227513, | |
| "grad_norm": 0.9529787490185494, | |
| "learning_rate": 3.994610394270178e-06, | |
| "loss": 0.0137, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 2.3555555555555556, | |
| "grad_norm": 1.4123895372263633, | |
| "learning_rate": 3.977519740791049e-06, | |
| "loss": 0.0247, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 2.35978835978836, | |
| "grad_norm": 1.4967579484639353, | |
| "learning_rate": 3.960441545911205e-06, | |
| "loss": 0.0223, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 2.364021164021164, | |
| "grad_norm": 1.3782252518557059, | |
| "learning_rate": 3.943376017723058e-06, | |
| "loss": 0.0203, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.3682539682539683, | |
| "grad_norm": 1.011290054301172, | |
| "learning_rate": 3.926323364164684e-06, | |
| "loss": 0.0174, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 2.3724867724867726, | |
| "grad_norm": 0.9372736736401631, | |
| "learning_rate": 3.909283793017289e-06, | |
| "loss": 0.0127, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 2.3767195767195766, | |
| "grad_norm": 1.0720245224295357, | |
| "learning_rate": 3.892257511902664e-06, | |
| "loss": 0.0185, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 2.380952380952381, | |
| "grad_norm": 1.3820051395972568, | |
| "learning_rate": 3.875244728280676e-06, | |
| "loss": 0.0245, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 2.3851851851851853, | |
| "grad_norm": 1.54167399561864, | |
| "learning_rate": 3.8582456494467214e-06, | |
| "loss": 0.0229, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 2.389417989417989, | |
| "grad_norm": 1.3749494144666705, | |
| "learning_rate": 3.841260482529215e-06, | |
| "loss": 0.0209, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 2.3936507936507936, | |
| "grad_norm": 0.9837959983275872, | |
| "learning_rate": 3.82428943448705e-06, | |
| "loss": 0.0153, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 2.397883597883598, | |
| "grad_norm": 1.2997925691857108, | |
| "learning_rate": 3.8073327121070968e-06, | |
| "loss": 0.0168, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 2.402116402116402, | |
| "grad_norm": 0.99630339528548, | |
| "learning_rate": 3.790390522001662e-06, | |
| "loss": 0.0162, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 2.4063492063492062, | |
| "grad_norm": 1.2303194376503306, | |
| "learning_rate": 3.7734630706059873e-06, | |
| "loss": 0.0185, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.4105820105820106, | |
| "grad_norm": 1.7621485315307714, | |
| "learning_rate": 3.756550564175727e-06, | |
| "loss": 0.0255, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 2.414814814814815, | |
| "grad_norm": 1.0569009320866272, | |
| "learning_rate": 3.7396532087844318e-06, | |
| "loss": 0.0143, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 2.419047619047619, | |
| "grad_norm": 1.7821677956105721, | |
| "learning_rate": 3.7227712103210485e-06, | |
| "loss": 0.0249, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 2.4232804232804233, | |
| "grad_norm": 1.3573086881674947, | |
| "learning_rate": 3.705904774487396e-06, | |
| "loss": 0.0204, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 2.4275132275132276, | |
| "grad_norm": 1.1718662977450818, | |
| "learning_rate": 3.6890541067956775e-06, | |
| "loss": 0.0146, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 2.431746031746032, | |
| "grad_norm": 1.614044036491686, | |
| "learning_rate": 3.672219412565956e-06, | |
| "loss": 0.0192, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 2.435978835978836, | |
| "grad_norm": 1.5234312497660505, | |
| "learning_rate": 3.655400896923672e-06, | |
| "loss": 0.024, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 2.4402116402116403, | |
| "grad_norm": 1.2229255498253058, | |
| "learning_rate": 3.6385987647971287e-06, | |
| "loss": 0.0213, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 1.188233949021249, | |
| "learning_rate": 3.6218132209150047e-06, | |
| "loss": 0.0158, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 2.4486772486772486, | |
| "grad_norm": 1.2163503629423913, | |
| "learning_rate": 3.6050444698038547e-06, | |
| "loss": 0.0233, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.452910052910053, | |
| "grad_norm": 1.181425300400814, | |
| "learning_rate": 3.5882927157856175e-06, | |
| "loss": 0.0196, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 2.4571428571428573, | |
| "grad_norm": 1.7922575768503977, | |
| "learning_rate": 3.571558162975133e-06, | |
| "loss": 0.0218, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 2.461375661375661, | |
| "grad_norm": 5.6028053486201514, | |
| "learning_rate": 3.5548410152776414e-06, | |
| "loss": 0.0175, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 2.4656084656084656, | |
| "grad_norm": 1.4059375192080907, | |
| "learning_rate": 3.538141476386317e-06, | |
| "loss": 0.0217, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 2.46984126984127, | |
| "grad_norm": 1.2473008607933311, | |
| "learning_rate": 3.521459749779769e-06, | |
| "loss": 0.0211, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 2.474074074074074, | |
| "grad_norm": 1.0232237746517738, | |
| "learning_rate": 3.5047960387195673e-06, | |
| "loss": 0.0164, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 2.4783068783068782, | |
| "grad_norm": 1.1551458380571011, | |
| "learning_rate": 3.488150546247778e-06, | |
| "loss": 0.0177, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 2.4825396825396826, | |
| "grad_norm": 1.241989730027279, | |
| "learning_rate": 3.471523475184472e-06, | |
| "loss": 0.0213, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 2.4867724867724865, | |
| "grad_norm": 1.3270342015490493, | |
| "learning_rate": 3.4549150281252635e-06, | |
| "loss": 0.0227, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 2.491005291005291, | |
| "grad_norm": 1.3678587583649702, | |
| "learning_rate": 3.4383254074388373e-06, | |
| "loss": 0.0228, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.4952380952380953, | |
| "grad_norm": 2.2689660584590783, | |
| "learning_rate": 3.4217548152644887e-06, | |
| "loss": 0.0175, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 2.4994708994708996, | |
| "grad_norm": 1.5596005660525438, | |
| "learning_rate": 3.40520345350965e-06, | |
| "loss": 0.0199, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 2.5037037037037035, | |
| "grad_norm": 1.2720201006007885, | |
| "learning_rate": 3.3886715238474454e-06, | |
| "loss": 0.0177, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 2.507936507936508, | |
| "grad_norm": 1.093460165926427, | |
| "learning_rate": 3.372159227714218e-06, | |
| "loss": 0.0139, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 2.5121693121693123, | |
| "grad_norm": 1.6179479955437774, | |
| "learning_rate": 3.355666766307084e-06, | |
| "loss": 0.0232, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 2.5164021164021166, | |
| "grad_norm": 1.2102869187922056, | |
| "learning_rate": 3.339194340581485e-06, | |
| "loss": 0.0187, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 2.5206349206349206, | |
| "grad_norm": 1.1103393510900712, | |
| "learning_rate": 3.322742151248726e-06, | |
| "loss": 0.0143, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 2.524867724867725, | |
| "grad_norm": 1.2647278804921822, | |
| "learning_rate": 3.3063103987735433e-06, | |
| "loss": 0.0186, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 2.5291005291005293, | |
| "grad_norm": 1.1276120725215881, | |
| "learning_rate": 3.289899283371657e-06, | |
| "loss": 0.0148, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 2.533333333333333, | |
| "grad_norm": 1.0196598923468276, | |
| "learning_rate": 3.273509005007327e-06, | |
| "loss": 0.0166, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.5375661375661376, | |
| "grad_norm": 0.9388923647646054, | |
| "learning_rate": 3.2571397633909252e-06, | |
| "loss": 0.0147, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 2.541798941798942, | |
| "grad_norm": 1.09462020411353, | |
| "learning_rate": 3.2407917579764914e-06, | |
| "loss": 0.0152, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 2.546031746031746, | |
| "grad_norm": 1.066286773297906, | |
| "learning_rate": 3.224465187959316e-06, | |
| "loss": 0.0169, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 2.5502645502645502, | |
| "grad_norm": 1.5390006694829865, | |
| "learning_rate": 3.2081602522734987e-06, | |
| "loss": 0.0198, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 2.5544973544973546, | |
| "grad_norm": 1.4007819028143056, | |
| "learning_rate": 3.1918771495895395e-06, | |
| "loss": 0.0203, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 2.5587301587301585, | |
| "grad_norm": 0.9872842079869668, | |
| "learning_rate": 3.1756160783119015e-06, | |
| "loss": 0.014, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 2.562962962962963, | |
| "grad_norm": 1.0921886747672007, | |
| "learning_rate": 3.1593772365766107e-06, | |
| "loss": 0.0162, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 2.5671957671957673, | |
| "grad_norm": 1.1585886648374661, | |
| "learning_rate": 3.1431608222488276e-06, | |
| "loss": 0.0163, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 2.571428571428571, | |
| "grad_norm": 1.248592252763677, | |
| "learning_rate": 3.12696703292044e-06, | |
| "loss": 0.0199, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 2.5756613756613755, | |
| "grad_norm": 1.4112394419564622, | |
| "learning_rate": 3.110796065907665e-06, | |
| "loss": 0.022, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.57989417989418, | |
| "grad_norm": 1.165336623248628, | |
| "learning_rate": 3.09464811824863e-06, | |
| "loss": 0.0168, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 2.5841269841269843, | |
| "grad_norm": 1.4448704250044067, | |
| "learning_rate": 3.078523386700982e-06, | |
| "loss": 0.0157, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 2.588359788359788, | |
| "grad_norm": 1.3676792063312764, | |
| "learning_rate": 3.0624220677394854e-06, | |
| "loss": 0.0199, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 2.5925925925925926, | |
| "grad_norm": 1.129628210066895, | |
| "learning_rate": 3.0463443575536324e-06, | |
| "loss": 0.0178, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 2.596825396825397, | |
| "grad_norm": 1.0316857304867937, | |
| "learning_rate": 3.030290452045245e-06, | |
| "loss": 0.0199, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 2.6010582010582013, | |
| "grad_norm": 1.3226258154560515, | |
| "learning_rate": 3.0142605468260976e-06, | |
| "loss": 0.0152, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 2.605291005291005, | |
| "grad_norm": 1.5852740515104826, | |
| "learning_rate": 2.9982548372155264e-06, | |
| "loss": 0.0271, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 2.6095238095238096, | |
| "grad_norm": 1.1196079859503962, | |
| "learning_rate": 2.98227351823805e-06, | |
| "loss": 0.0138, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 2.613756613756614, | |
| "grad_norm": 1.8626892669889739, | |
| "learning_rate": 2.966316784621e-06, | |
| "loss": 0.0264, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 2.617989417989418, | |
| "grad_norm": 1.383225574585185, | |
| "learning_rate": 2.9503848307921363e-06, | |
| "loss": 0.0205, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.6222222222222222, | |
| "grad_norm": 2.733985735902233, | |
| "learning_rate": 2.934477850877292e-06, | |
| "loss": 0.0246, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 2.6264550264550266, | |
| "grad_norm": 1.1381292666769829, | |
| "learning_rate": 2.918596038697995e-06, | |
| "loss": 0.0188, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 2.6306878306878305, | |
| "grad_norm": 1.0370355502501467, | |
| "learning_rate": 2.9027395877691143e-06, | |
| "loss": 0.0162, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 2.634920634920635, | |
| "grad_norm": 1.439361804374129, | |
| "learning_rate": 2.886908691296504e-06, | |
| "loss": 0.0197, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 2.6391534391534393, | |
| "grad_norm": 1.0102400740735453, | |
| "learning_rate": 2.871103542174637e-06, | |
| "loss": 0.0175, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 2.643386243386243, | |
| "grad_norm": 1.5198907140103581, | |
| "learning_rate": 2.8553243329842715e-06, | |
| "loss": 0.0196, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 2.6476190476190475, | |
| "grad_norm": 1.0531509330878106, | |
| "learning_rate": 2.839571255990088e-06, | |
| "loss": 0.0141, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 2.651851851851852, | |
| "grad_norm": 1.0357940819910703, | |
| "learning_rate": 2.8238445031383634e-06, | |
| "loss": 0.0173, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 2.656084656084656, | |
| "grad_norm": 1.1307445427957337, | |
| "learning_rate": 2.8081442660546126e-06, | |
| "loss": 0.0184, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 2.66031746031746, | |
| "grad_norm": 1.3306823534784538, | |
| "learning_rate": 2.7924707360412743e-06, | |
| "loss": 0.0147, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.6645502645502646, | |
| "grad_norm": 1.5271179132418502, | |
| "learning_rate": 2.776824104075364e-06, | |
| "loss": 0.021, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 2.668783068783069, | |
| "grad_norm": 1.497782212123214, | |
| "learning_rate": 2.761204560806152e-06, | |
| "loss": 0.0186, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 2.6730158730158733, | |
| "grad_norm": 1.3368279908950362, | |
| "learning_rate": 2.7456122965528475e-06, | |
| "loss": 0.0206, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 2.677248677248677, | |
| "grad_norm": 1.2484325713697815, | |
| "learning_rate": 2.7300475013022666e-06, | |
| "loss": 0.0173, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 2.6814814814814816, | |
| "grad_norm": 0.865644253076397, | |
| "learning_rate": 2.714510364706531e-06, | |
| "loss": 0.0128, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 2.685714285714286, | |
| "grad_norm": 1.0152662338026661, | |
| "learning_rate": 2.699001076080742e-06, | |
| "loss": 0.0152, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 2.68994708994709, | |
| "grad_norm": 0.9844152479186873, | |
| "learning_rate": 2.683519824400693e-06, | |
| "loss": 0.0156, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 2.6941798941798942, | |
| "grad_norm": 0.7334924826589737, | |
| "learning_rate": 2.6680667983005446e-06, | |
| "loss": 0.0125, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 2.6984126984126986, | |
| "grad_norm": 1.1114018703107458, | |
| "learning_rate": 2.6526421860705474e-06, | |
| "loss": 0.018, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 2.7026455026455025, | |
| "grad_norm": 1.7960430432669203, | |
| "learning_rate": 2.637246175654731e-06, | |
| "loss": 0.0177, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.706878306878307, | |
| "grad_norm": 0.8414473229281895, | |
| "learning_rate": 2.6218789546486235e-06, | |
| "loss": 0.0127, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 2.7111111111111112, | |
| "grad_norm": 1.4403979439268044, | |
| "learning_rate": 2.6065407102969664e-06, | |
| "loss": 0.021, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 2.715343915343915, | |
| "grad_norm": 1.4586265785977015, | |
| "learning_rate": 2.5912316294914232e-06, | |
| "loss": 0.0216, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 2.7195767195767195, | |
| "grad_norm": 1.6315741402908153, | |
| "learning_rate": 2.5759518987683154e-06, | |
| "loss": 0.0162, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 2.723809523809524, | |
| "grad_norm": 1.34535240869925, | |
| "learning_rate": 2.560701704306336e-06, | |
| "loss": 0.019, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 2.728042328042328, | |
| "grad_norm": 0.9316829843215475, | |
| "learning_rate": 2.545481231924296e-06, | |
| "loss": 0.0172, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 2.732275132275132, | |
| "grad_norm": 1.4011841310159103, | |
| "learning_rate": 2.5302906670788463e-06, | |
| "loss": 0.0168, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 2.7365079365079366, | |
| "grad_norm": 0.6747870852771698, | |
| "learning_rate": 2.5151301948622235e-06, | |
| "loss": 0.0136, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 2.7407407407407405, | |
| "grad_norm": 1.057610980636594, | |
| "learning_rate": 2.5000000000000015e-06, | |
| "loss": 0.0131, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 2.744973544973545, | |
| "grad_norm": 1.6746201989469378, | |
| "learning_rate": 2.484900266848825e-06, | |
| "loss": 0.0205, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.749206349206349, | |
| "grad_norm": 0.9494040703209882, | |
| "learning_rate": 2.469831179394182e-06, | |
| "loss": 0.0135, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 2.7534391534391536, | |
| "grad_norm": 1.5003754041430553, | |
| "learning_rate": 2.4547929212481436e-06, | |
| "loss": 0.0158, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 2.757671957671958, | |
| "grad_norm": 0.8934711380462033, | |
| "learning_rate": 2.4397856756471435e-06, | |
| "loss": 0.0154, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 2.761904761904762, | |
| "grad_norm": 1.6609542321924584, | |
| "learning_rate": 2.424809625449729e-06, | |
| "loss": 0.0181, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 2.7661375661375662, | |
| "grad_norm": 1.6824205917017476, | |
| "learning_rate": 2.40986495313435e-06, | |
| "loss": 0.0169, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 2.7703703703703706, | |
| "grad_norm": 0.9183880056629511, | |
| "learning_rate": 2.39495184079712e-06, | |
| "loss": 0.0134, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 2.7746031746031745, | |
| "grad_norm": 1.0245037009884315, | |
| "learning_rate": 2.380070470149605e-06, | |
| "loss": 0.0139, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 2.778835978835979, | |
| "grad_norm": 1.3421089341613759, | |
| "learning_rate": 2.3652210225166122e-06, | |
| "loss": 0.0142, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 2.7830687830687832, | |
| "grad_norm": 1.3809956989606829, | |
| "learning_rate": 2.3504036788339763e-06, | |
| "loss": 0.0178, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 2.787301587301587, | |
| "grad_norm": 1.1273731561378648, | |
| "learning_rate": 2.3356186196463497e-06, | |
| "loss": 0.0185, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.7915343915343915, | |
| "grad_norm": 1.0143190004115585, | |
| "learning_rate": 2.320866025105016e-06, | |
| "loss": 0.0124, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 2.795767195767196, | |
| "grad_norm": 1.3534973057449449, | |
| "learning_rate": 2.3061460749656844e-06, | |
| "loss": 0.0208, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.0435939785814092, | |
| "learning_rate": 2.2914589485863015e-06, | |
| "loss": 0.0163, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 2.804232804232804, | |
| "grad_norm": 0.9203370702295048, | |
| "learning_rate": 2.2768048249248648e-06, | |
| "loss": 0.0146, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 2.8084656084656086, | |
| "grad_norm": 1.098704858652637, | |
| "learning_rate": 2.2621838825372496e-06, | |
| "loss": 0.0123, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 2.8126984126984125, | |
| "grad_norm": 1.191480174922082, | |
| "learning_rate": 2.2475962995750224e-06, | |
| "loss": 0.0144, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 2.816931216931217, | |
| "grad_norm": 1.276715625287459, | |
| "learning_rate": 2.23304225378328e-06, | |
| "loss": 0.0188, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 2.821164021164021, | |
| "grad_norm": 1.131649710749666, | |
| "learning_rate": 2.218521922498476e-06, | |
| "loss": 0.0167, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 2.825396825396825, | |
| "grad_norm": 0.8016802676955552, | |
| "learning_rate": 2.204035482646267e-06, | |
| "loss": 0.0131, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 2.8296296296296295, | |
| "grad_norm": 0.6893558290783893, | |
| "learning_rate": 2.1895831107393485e-06, | |
| "loss": 0.0108, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.833862433862434, | |
| "grad_norm": 1.234838405201532, | |
| "learning_rate": 2.175164982875311e-06, | |
| "loss": 0.017, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 2.8380952380952382, | |
| "grad_norm": 1.2011736692296922, | |
| "learning_rate": 2.1607812747344955e-06, | |
| "loss": 0.0157, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 2.8423280423280426, | |
| "grad_norm": 1.1466267503085876, | |
| "learning_rate": 2.146432161577842e-06, | |
| "loss": 0.0157, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 2.8465608465608465, | |
| "grad_norm": 0.969199506256315, | |
| "learning_rate": 2.132117818244771e-06, | |
| "loss": 0.0152, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 2.850793650793651, | |
| "grad_norm": 1.3534092293875413, | |
| "learning_rate": 2.1178384191510344e-06, | |
| "loss": 0.0162, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 2.8550264550264552, | |
| "grad_norm": 1.2829883064220085, | |
| "learning_rate": 2.103594138286607e-06, | |
| "loss": 0.0195, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 2.859259259259259, | |
| "grad_norm": 1.1271167538927263, | |
| "learning_rate": 2.0893851492135536e-06, | |
| "loss": 0.0172, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 2.8634920634920635, | |
| "grad_norm": 1.343559581567661, | |
| "learning_rate": 2.075211625063923e-06, | |
| "loss": 0.0163, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 2.867724867724868, | |
| "grad_norm": 1.1918468332382046, | |
| "learning_rate": 2.061073738537635e-06, | |
| "loss": 0.0164, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 2.871957671957672, | |
| "grad_norm": 1.4193106995910565, | |
| "learning_rate": 2.046971661900373e-06, | |
| "loss": 0.017, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.876190476190476, | |
| "grad_norm": 0.7357092999278971, | |
| "learning_rate": 2.0329055669814936e-06, | |
| "loss": 0.0123, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 2.8804232804232806, | |
| "grad_norm": 1.4745946537108765, | |
| "learning_rate": 2.0188756251719204e-06, | |
| "loss": 0.0153, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 2.8846560846560845, | |
| "grad_norm": 1.1857296628720868, | |
| "learning_rate": 2.0048820074220716e-06, | |
| "loss": 0.0188, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 1.4131026911127977, | |
| "learning_rate": 1.990924884239758e-06, | |
| "loss": 0.0173, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 2.893121693121693, | |
| "grad_norm": 1.0848820434397748, | |
| "learning_rate": 1.977004425688126e-06, | |
| "loss": 0.0177, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 2.897354497354497, | |
| "grad_norm": 1.0664286982794255, | |
| "learning_rate": 1.9631208013835677e-06, | |
| "loss": 0.0124, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 2.9015873015873015, | |
| "grad_norm": 1.0870334070827417, | |
| "learning_rate": 1.9492741804936623e-06, | |
| "loss": 0.0134, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 2.905820105820106, | |
| "grad_norm": 0.9405135966991898, | |
| "learning_rate": 1.9354647317351187e-06, | |
| "loss": 0.0138, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 2.91005291005291, | |
| "grad_norm": 1.1557524847446539, | |
| "learning_rate": 1.9216926233717087e-06, | |
| "loss": 0.0154, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 2.914285714285714, | |
| "grad_norm": 1.145639729440071, | |
| "learning_rate": 1.90795802321223e-06, | |
| "loss": 0.0163, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.9185185185185185, | |
| "grad_norm": 1.3118126671264705, | |
| "learning_rate": 1.8942610986084487e-06, | |
| "loss": 0.0148, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 2.922751322751323, | |
| "grad_norm": 1.0498615239365812, | |
| "learning_rate": 1.8806020164530702e-06, | |
| "loss": 0.0135, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 2.9269841269841272, | |
| "grad_norm": 1.0097762792486051, | |
| "learning_rate": 1.8669809431776991e-06, | |
| "loss": 0.0134, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 2.931216931216931, | |
| "grad_norm": 1.0798461928002918, | |
| "learning_rate": 1.8533980447508138e-06, | |
| "loss": 0.0138, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 2.9354497354497355, | |
| "grad_norm": 0.7591375304655225, | |
| "learning_rate": 1.8398534866757455e-06, | |
| "loss": 0.0085, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 2.93968253968254, | |
| "grad_norm": 1.1784009394326262, | |
| "learning_rate": 1.8263474339886628e-06, | |
| "loss": 0.0132, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 2.943915343915344, | |
| "grad_norm": 1.220548304552571, | |
| "learning_rate": 1.8128800512565514e-06, | |
| "loss": 0.0223, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 2.948148148148148, | |
| "grad_norm": 1.1132676461069289, | |
| "learning_rate": 1.799451502575222e-06, | |
| "loss": 0.0144, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 2.9523809523809526, | |
| "grad_norm": 0.7581893865812536, | |
| "learning_rate": 1.7860619515673034e-06, | |
| "loss": 0.011, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 2.9566137566137565, | |
| "grad_norm": 0.9716146953090274, | |
| "learning_rate": 1.7727115613802465e-06, | |
| "loss": 0.0096, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.960846560846561, | |
| "grad_norm": 1.674856558431357, | |
| "learning_rate": 1.7594004946843458e-06, | |
| "loss": 0.0128, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 2.965079365079365, | |
| "grad_norm": 1.1843497789446338, | |
| "learning_rate": 1.746128913670746e-06, | |
| "loss": 0.0144, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 2.969312169312169, | |
| "grad_norm": 1.7482050222848124, | |
| "learning_rate": 1.7328969800494727e-06, | |
| "loss": 0.0191, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 2.9735449735449735, | |
| "grad_norm": 1.3527434122410733, | |
| "learning_rate": 1.7197048550474643e-06, | |
| "loss": 0.0199, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 2.977777777777778, | |
| "grad_norm": 1.04160773464941, | |
| "learning_rate": 1.7065526994065973e-06, | |
| "loss": 0.0187, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 2.982010582010582, | |
| "grad_norm": 1.262543944063061, | |
| "learning_rate": 1.6934406733817417e-06, | |
| "loss": 0.019, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 2.986243386243386, | |
| "grad_norm": 1.1849446771124943, | |
| "learning_rate": 1.680368936738792e-06, | |
| "loss": 0.0162, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 2.9904761904761905, | |
| "grad_norm": 1.0664910484388517, | |
| "learning_rate": 1.6673376487527382e-06, | |
| "loss": 0.0161, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 2.9947089947089944, | |
| "grad_norm": 2.0774774510855742, | |
| "learning_rate": 1.6543469682057105e-06, | |
| "loss": 0.0194, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 2.998941798941799, | |
| "grad_norm": 1.0920006522906185, | |
| "learning_rate": 1.6413970533850498e-06, | |
| "loss": 0.0117, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.4997123713326503, | |
| "learning_rate": 1.6284880620813847e-06, | |
| "loss": 0.0124, | |
| "step": 711 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 948, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 116192413286400.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |