{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.077544426494345, "eval_steps": 200, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016155088852988692, "grad_norm": 8.991457683520215, "learning_rate": 2.5000000000000004e-07, "loss": 1.3989, "step": 1 }, { "epoch": 0.0032310177705977385, "grad_norm": 9.204385674681644, "learning_rate": 5.000000000000001e-07, "loss": 1.3523, "step": 2 }, { "epoch": 0.004846526655896607, "grad_norm": 7.649629969590384, "learning_rate": 7.5e-07, "loss": 1.2474, "step": 3 }, { "epoch": 0.006462035541195477, "grad_norm": 8.300175421428294, "learning_rate": 1.0000000000000002e-06, "loss": 1.3623, "step": 4 }, { "epoch": 0.008077544426494346, "grad_norm": 8.357058330198413, "learning_rate": 1.25e-06, "loss": 1.2821, "step": 5 }, { "epoch": 0.009693053311793215, "grad_norm": 9.592288796584752, "learning_rate": 1.5e-06, "loss": 1.4056, "step": 6 }, { "epoch": 0.011308562197092083, "grad_norm": 6.646500592700167, "learning_rate": 1.75e-06, "loss": 1.4215, "step": 7 }, { "epoch": 0.012924071082390954, "grad_norm": 4.296752048506672, "learning_rate": 2.0000000000000003e-06, "loss": 1.1683, "step": 8 }, { "epoch": 0.014539579967689823, "grad_norm": 4.238249860681597, "learning_rate": 2.25e-06, "loss": 1.2246, "step": 9 }, { "epoch": 0.01615508885298869, "grad_norm": 4.92545024221912, "learning_rate": 2.5e-06, "loss": 1.1843, "step": 10 }, { "epoch": 0.017770597738287562, "grad_norm": 5.161311383755215, "learning_rate": 2.7500000000000004e-06, "loss": 1.1798, "step": 11 }, { "epoch": 0.01938610662358643, "grad_norm": 4.603791127196939, "learning_rate": 3e-06, "loss": 1.3274, "step": 12 }, { "epoch": 0.0210016155088853, "grad_norm": 4.381864176600256, "learning_rate": 3.2500000000000002e-06, "loss": 1.226, "step": 13 }, { "epoch": 0.022617124394184167, "grad_norm": 4.209014228524383, "learning_rate": 3.5e-06, "loss": 1.1465, "step": 14 }, { "epoch": 0.024232633279483037, "grad_norm": 4.017926617402423, "learning_rate": 3.7500000000000005e-06, "loss": 1.2793, "step": 15 }, { "epoch": 0.025848142164781908, "grad_norm": 5.111631496558089, "learning_rate": 4.000000000000001e-06, "loss": 1.2628, "step": 16 }, { "epoch": 0.027463651050080775, "grad_norm": 3.9002728881786624, "learning_rate": 4.25e-06, "loss": 1.1487, "step": 17 }, { "epoch": 0.029079159935379646, "grad_norm": 3.5339539327079588, "learning_rate": 4.5e-06, "loss": 1.1686, "step": 18 }, { "epoch": 0.030694668820678513, "grad_norm": 3.947144404628675, "learning_rate": 4.75e-06, "loss": 1.3025, "step": 19 }, { "epoch": 0.03231017770597738, "grad_norm": 3.703700043458666, "learning_rate": 5e-06, "loss": 1.2251, "step": 20 }, { "epoch": 0.033925686591276254, "grad_norm": 4.198179059371649, "learning_rate": 4.999999675929559e-06, "loss": 1.3255, "step": 21 }, { "epoch": 0.035541195476575124, "grad_norm": 3.163168058473692, "learning_rate": 4.9999987037183174e-06, "loss": 1.0785, "step": 22 }, { "epoch": 0.03715670436187399, "grad_norm": 2.8648063455508566, "learning_rate": 4.99999708336653e-06, "loss": 1.1775, "step": 23 }, { "epoch": 0.03877221324717286, "grad_norm": 3.5105533001325573, "learning_rate": 4.9999948148746145e-06, "loss": 1.2144, "step": 24 }, { "epoch": 0.04038772213247173, "grad_norm": 3.7029918366651677, "learning_rate": 4.999991898243161e-06, "loss": 1.1417, "step": 25 }, { "epoch": 0.0420032310177706, "grad_norm": 3.640799740762253, "learning_rate": 4.999988333472923e-06, "loss": 1.3541, "step": 26 }, { "epoch": 0.04361873990306947, "grad_norm": 3.2839724407831774, "learning_rate": 4.999984120564828e-06, "loss": 1.2172, "step": 27 }, { "epoch": 0.045234248788368334, "grad_norm": 3.531685363882572, "learning_rate": 4.999979259519967e-06, "loss": 1.2847, "step": 28 }, { "epoch": 0.046849757673667204, "grad_norm": 3.8809228862735425, "learning_rate": 4.999973750339599e-06, "loss": 1.2584, "step": 29 }, { "epoch": 0.048465266558966075, "grad_norm": 3.7704134060166727, "learning_rate": 4.9999675930251536e-06, "loss": 1.2992, "step": 30 }, { "epoch": 0.050080775444264945, "grad_norm": 3.2221307074015133, "learning_rate": 4.999960787578226e-06, "loss": 1.1594, "step": 31 }, { "epoch": 0.051696284329563816, "grad_norm": 3.382471840921329, "learning_rate": 4.999953334000581e-06, "loss": 1.2145, "step": 32 }, { "epoch": 0.05331179321486268, "grad_norm": 3.3068398343360883, "learning_rate": 4.9999452322941525e-06, "loss": 1.228, "step": 33 }, { "epoch": 0.05492730210016155, "grad_norm": 3.0060297378694383, "learning_rate": 4.9999364824610385e-06, "loss": 1.1716, "step": 34 }, { "epoch": 0.05654281098546042, "grad_norm": 3.3966566898477093, "learning_rate": 4.999927084503508e-06, "loss": 1.0654, "step": 35 }, { "epoch": 0.05815831987075929, "grad_norm": 3.2194250975667305, "learning_rate": 4.9999170384240005e-06, "loss": 1.2565, "step": 36 }, { "epoch": 0.05977382875605816, "grad_norm": 3.3153932803083266, "learning_rate": 4.999906344225116e-06, "loss": 1.1063, "step": 37 }, { "epoch": 0.061389337641357025, "grad_norm": 3.3499236733633673, "learning_rate": 4.99989500190963e-06, "loss": 1.2659, "step": 38 }, { "epoch": 0.0630048465266559, "grad_norm": 3.234738446805166, "learning_rate": 4.999883011480482e-06, "loss": 1.1734, "step": 39 }, { "epoch": 0.06462035541195477, "grad_norm": 3.5179304376170295, "learning_rate": 4.999870372940781e-06, "loss": 1.2332, "step": 40 }, { "epoch": 0.06623586429725363, "grad_norm": 3.06853760039248, "learning_rate": 4.9998570862938024e-06, "loss": 1.1642, "step": 41 }, { "epoch": 0.06785137318255251, "grad_norm": 3.4379531522622093, "learning_rate": 4.999843151542993e-06, "loss": 1.2686, "step": 42 }, { "epoch": 0.06946688206785137, "grad_norm": 3.3885409913064093, "learning_rate": 4.999828568691964e-06, "loss": 1.2405, "step": 43 }, { "epoch": 0.07108239095315025, "grad_norm": 3.1097004625500047, "learning_rate": 4.999813337744497e-06, "loss": 1.1018, "step": 44 }, { "epoch": 0.07269789983844911, "grad_norm": 3.160519178761703, "learning_rate": 4.999797458704539e-06, "loss": 1.1194, "step": 45 }, { "epoch": 0.07431340872374798, "grad_norm": 3.355088745414034, "learning_rate": 4.999780931576208e-06, "loss": 1.241, "step": 46 }, { "epoch": 0.07592891760904685, "grad_norm": 3.2747713761995714, "learning_rate": 4.999763756363789e-06, "loss": 1.3358, "step": 47 }, { "epoch": 0.07754442649434572, "grad_norm": 3.134129685373582, "learning_rate": 4.999745933071735e-06, "loss": 1.1335, "step": 48 }, { "epoch": 0.0791599353796446, "grad_norm": 3.8025619501609023, "learning_rate": 4.9997274617046655e-06, "loss": 1.3815, "step": 49 }, { "epoch": 0.08077544426494346, "grad_norm": 3.728244263912563, "learning_rate": 4.999708342267371e-06, "loss": 1.1374, "step": 50 }, { "epoch": 0.08239095315024232, "grad_norm": 3.4780279091393607, "learning_rate": 4.999688574764806e-06, "loss": 1.2733, "step": 51 }, { "epoch": 0.0840064620355412, "grad_norm": 3.282918069450121, "learning_rate": 4.999668159202097e-06, "loss": 1.1686, "step": 52 }, { "epoch": 0.08562197092084006, "grad_norm": 3.462316425682957, "learning_rate": 4.999647095584537e-06, "loss": 1.2725, "step": 53 }, { "epoch": 0.08723747980613894, "grad_norm": 3.443275025038192, "learning_rate": 4.999625383917586e-06, "loss": 1.2097, "step": 54 }, { "epoch": 0.0888529886914378, "grad_norm": 3.5124854041689586, "learning_rate": 4.999603024206875e-06, "loss": 1.3324, "step": 55 }, { "epoch": 0.09046849757673667, "grad_norm": 3.5009158562843807, "learning_rate": 4.999580016458197e-06, "loss": 1.1963, "step": 56 }, { "epoch": 0.09208400646203554, "grad_norm": 3.1586869439286134, "learning_rate": 4.999556360677521e-06, "loss": 1.2261, "step": 57 }, { "epoch": 0.09369951534733441, "grad_norm": 3.5796597378669173, "learning_rate": 4.999532056870977e-06, "loss": 1.2447, "step": 58 }, { "epoch": 0.09531502423263329, "grad_norm": 3.8906866826206974, "learning_rate": 4.999507105044867e-06, "loss": 1.2858, "step": 59 }, { "epoch": 0.09693053311793215, "grad_norm": 3.340173864943382, "learning_rate": 4.999481505205661e-06, "loss": 1.1821, "step": 60 }, { "epoch": 0.09854604200323101, "grad_norm": 3.3899136719001586, "learning_rate": 4.999455257359994e-06, "loss": 1.163, "step": 61 }, { "epoch": 0.10016155088852989, "grad_norm": 3.209304008262493, "learning_rate": 4.999428361514672e-06, "loss": 1.1262, "step": 62 }, { "epoch": 0.10177705977382875, "grad_norm": 2.9867411305750093, "learning_rate": 4.999400817676667e-06, "loss": 1.0979, "step": 63 }, { "epoch": 0.10339256865912763, "grad_norm": 3.2851645596147705, "learning_rate": 4.9993726258531215e-06, "loss": 1.0836, "step": 64 }, { "epoch": 0.1050080775444265, "grad_norm": 3.2548873930326327, "learning_rate": 4.999343786051344e-06, "loss": 1.2336, "step": 65 }, { "epoch": 0.10662358642972536, "grad_norm": 3.3536674672196485, "learning_rate": 4.99931429827881e-06, "loss": 1.2817, "step": 66 }, { "epoch": 0.10823909531502424, "grad_norm": 3.4372358210004172, "learning_rate": 4.999284162543165e-06, "loss": 1.2581, "step": 67 }, { "epoch": 0.1098546042003231, "grad_norm": 3.262881881683617, "learning_rate": 4.9992533788522225e-06, "loss": 1.1544, "step": 68 }, { "epoch": 0.11147011308562198, "grad_norm": 3.0847567674302363, "learning_rate": 4.999221947213963e-06, "loss": 1.094, "step": 69 }, { "epoch": 0.11308562197092084, "grad_norm": 2.7216210936311582, "learning_rate": 4.999189867636535e-06, "loss": 1.0469, "step": 70 }, { "epoch": 0.1147011308562197, "grad_norm": 3.09529885645119, "learning_rate": 4.999157140128257e-06, "loss": 1.1592, "step": 71 }, { "epoch": 0.11631663974151858, "grad_norm": 3.126390522200968, "learning_rate": 4.999123764697611e-06, "loss": 1.1024, "step": 72 }, { "epoch": 0.11793214862681745, "grad_norm": 3.251312736875641, "learning_rate": 4.999089741353253e-06, "loss": 1.1352, "step": 73 }, { "epoch": 0.11954765751211632, "grad_norm": 3.363303925897396, "learning_rate": 4.999055070104001e-06, "loss": 1.127, "step": 74 }, { "epoch": 0.12116316639741519, "grad_norm": 2.937766911094209, "learning_rate": 4.9990197509588445e-06, "loss": 1.0719, "step": 75 }, { "epoch": 0.12277867528271405, "grad_norm": 3.3488933207483296, "learning_rate": 4.998983783926941e-06, "loss": 1.2621, "step": 76 }, { "epoch": 0.12439418416801293, "grad_norm": 3.6800804067194073, "learning_rate": 4.998947169017615e-06, "loss": 1.2946, "step": 77 }, { "epoch": 0.1260096930533118, "grad_norm": 2.958385447923565, "learning_rate": 4.998909906240359e-06, "loss": 0.993, "step": 78 }, { "epoch": 0.12762520193861066, "grad_norm": 2.952904489814771, "learning_rate": 4.998871995604832e-06, "loss": 1.0303, "step": 79 }, { "epoch": 0.12924071082390953, "grad_norm": 3.2152111993433317, "learning_rate": 4.998833437120866e-06, "loss": 1.1503, "step": 80 }, { "epoch": 0.1308562197092084, "grad_norm": 3.3977985530026453, "learning_rate": 4.998794230798455e-06, "loss": 1.2498, "step": 81 }, { "epoch": 0.13247172859450726, "grad_norm": 3.1013673186776827, "learning_rate": 4.998754376647764e-06, "loss": 1.1837, "step": 82 }, { "epoch": 0.13408723747980614, "grad_norm": 3.338916144155502, "learning_rate": 4.998713874679125e-06, "loss": 1.2064, "step": 83 }, { "epoch": 0.13570274636510501, "grad_norm": 3.1711787740028967, "learning_rate": 4.9986727249030394e-06, "loss": 1.1568, "step": 84 }, { "epoch": 0.13731825525040386, "grad_norm": 2.722621067186377, "learning_rate": 4.998630927330176e-06, "loss": 1.0076, "step": 85 }, { "epoch": 0.13893376413570274, "grad_norm": 3.0556086108357365, "learning_rate": 4.99858848197137e-06, "loss": 1.1219, "step": 86 }, { "epoch": 0.14054927302100162, "grad_norm": 3.0012894465957567, "learning_rate": 4.9985453888376245e-06, "loss": 1.1528, "step": 87 }, { "epoch": 0.1421647819063005, "grad_norm": 2.9982404771930313, "learning_rate": 4.998501647940114e-06, "loss": 1.1388, "step": 88 }, { "epoch": 0.14378029079159935, "grad_norm": 3.372467847244374, "learning_rate": 4.998457259290176e-06, "loss": 1.0991, "step": 89 }, { "epoch": 0.14539579967689822, "grad_norm": 3.7166035464210547, "learning_rate": 4.998412222899321e-06, "loss": 1.3245, "step": 90 }, { "epoch": 0.1470113085621971, "grad_norm": 2.981647322160533, "learning_rate": 4.998366538779224e-06, "loss": 1.2244, "step": 91 }, { "epoch": 0.14862681744749595, "grad_norm": 3.437508850277753, "learning_rate": 4.99832020694173e-06, "loss": 1.2074, "step": 92 }, { "epoch": 0.15024232633279483, "grad_norm": 4.0847488805012935, "learning_rate": 4.998273227398849e-06, "loss": 1.1014, "step": 93 }, { "epoch": 0.1518578352180937, "grad_norm": 3.001986614884883, "learning_rate": 4.998225600162762e-06, "loss": 1.0919, "step": 94 }, { "epoch": 0.15347334410339256, "grad_norm": 3.386224795276982, "learning_rate": 4.998177325245815e-06, "loss": 1.1298, "step": 95 }, { "epoch": 0.15508885298869143, "grad_norm": 3.0791723406788383, "learning_rate": 4.9981284026605256e-06, "loss": 1.1584, "step": 96 }, { "epoch": 0.1567043618739903, "grad_norm": 3.090651474734741, "learning_rate": 4.998078832419577e-06, "loss": 1.2772, "step": 97 }, { "epoch": 0.1583198707592892, "grad_norm": 3.859630103097824, "learning_rate": 4.9980286145358196e-06, "loss": 1.2764, "step": 98 }, { "epoch": 0.15993537964458804, "grad_norm": 3.4149648420738448, "learning_rate": 4.997977749022273e-06, "loss": 1.2075, "step": 99 }, { "epoch": 0.16155088852988692, "grad_norm": 3.4164338004249837, "learning_rate": 4.997926235892124e-06, "loss": 1.1282, "step": 100 }, { "epoch": 0.1631663974151858, "grad_norm": 3.1020571638753256, "learning_rate": 4.99787407515873e-06, "loss": 1.0726, "step": 101 }, { "epoch": 0.16478190630048464, "grad_norm": 2.7667074464453347, "learning_rate": 4.99782126683561e-06, "loss": 1.0309, "step": 102 }, { "epoch": 0.16639741518578352, "grad_norm": 3.5202448614338864, "learning_rate": 4.997767810936459e-06, "loss": 1.149, "step": 103 }, { "epoch": 0.1680129240710824, "grad_norm": 3.3905139225649212, "learning_rate": 4.997713707475133e-06, "loss": 1.22, "step": 104 }, { "epoch": 0.16962843295638125, "grad_norm": 3.213661393839447, "learning_rate": 4.99765895646566e-06, "loss": 1.1373, "step": 105 }, { "epoch": 0.17124394184168013, "grad_norm": 3.105181953794725, "learning_rate": 4.9976035579222345e-06, "loss": 1.1048, "step": 106 }, { "epoch": 0.172859450726979, "grad_norm": 3.625488644070283, "learning_rate": 4.9975475118592185e-06, "loss": 1.2268, "step": 107 }, { "epoch": 0.17447495961227788, "grad_norm": 3.8226922814129174, "learning_rate": 4.997490818291142e-06, "loss": 1.2201, "step": 108 }, { "epoch": 0.17609046849757673, "grad_norm": 3.120174810994621, "learning_rate": 4.997433477232704e-06, "loss": 1.0162, "step": 109 }, { "epoch": 0.1777059773828756, "grad_norm": 3.1756750698712533, "learning_rate": 4.997375488698769e-06, "loss": 1.1728, "step": 110 }, { "epoch": 0.17932148626817448, "grad_norm": 17.758927356775924, "learning_rate": 4.997316852704373e-06, "loss": 0.943, "step": 111 }, { "epoch": 0.18093699515347333, "grad_norm": 3.477927353868724, "learning_rate": 4.9972575692647166e-06, "loss": 1.1703, "step": 112 }, { "epoch": 0.1825525040387722, "grad_norm": 3.0839370068333882, "learning_rate": 4.99719763839517e-06, "loss": 1.1175, "step": 113 }, { "epoch": 0.1841680129240711, "grad_norm": 2.8737540882433494, "learning_rate": 4.99713706011127e-06, "loss": 1.0665, "step": 114 }, { "epoch": 0.18578352180936994, "grad_norm": 3.6700734641631962, "learning_rate": 4.997075834428722e-06, "loss": 1.1556, "step": 115 }, { "epoch": 0.18739903069466882, "grad_norm": 3.1110855290150283, "learning_rate": 4.997013961363398e-06, "loss": 1.2316, "step": 116 }, { "epoch": 0.1890145395799677, "grad_norm": 3.3661829976733904, "learning_rate": 4.996951440931342e-06, "loss": 1.2921, "step": 117 }, { "epoch": 0.19063004846526657, "grad_norm": 3.191038575067256, "learning_rate": 4.996888273148761e-06, "loss": 1.2411, "step": 118 }, { "epoch": 0.19224555735056542, "grad_norm": 3.345722499412386, "learning_rate": 4.996824458032031e-06, "loss": 1.1575, "step": 119 }, { "epoch": 0.1938610662358643, "grad_norm": 3.1793743839168296, "learning_rate": 4.996759995597698e-06, "loss": 1.1168, "step": 120 }, { "epoch": 0.19547657512116318, "grad_norm": 2.85685840599248, "learning_rate": 4.996694885862472e-06, "loss": 1.0588, "step": 121 }, { "epoch": 0.19709208400646203, "grad_norm": 3.3807264006719806, "learning_rate": 4.996629128843235e-06, "loss": 1.1915, "step": 122 }, { "epoch": 0.1987075928917609, "grad_norm": 3.425360286832518, "learning_rate": 4.996562724557036e-06, "loss": 1.1223, "step": 123 }, { "epoch": 0.20032310177705978, "grad_norm": 3.169624088836746, "learning_rate": 4.996495673021088e-06, "loss": 1.0641, "step": 124 }, { "epoch": 0.20193861066235863, "grad_norm": 3.5568374572837516, "learning_rate": 4.996427974252776e-06, "loss": 1.3247, "step": 125 }, { "epoch": 0.2035541195476575, "grad_norm": 3.6944571666371884, "learning_rate": 4.996359628269651e-06, "loss": 1.4471, "step": 126 }, { "epoch": 0.20516962843295639, "grad_norm": 2.7889178663293017, "learning_rate": 4.996290635089432e-06, "loss": 1.1061, "step": 127 }, { "epoch": 0.20678513731825526, "grad_norm": 3.022927421114448, "learning_rate": 4.996220994730008e-06, "loss": 1.1014, "step": 128 }, { "epoch": 0.2084006462035541, "grad_norm": 3.015061386500223, "learning_rate": 4.99615070720943e-06, "loss": 1.0823, "step": 129 }, { "epoch": 0.210016155088853, "grad_norm": 3.2316736257974275, "learning_rate": 4.996079772545923e-06, "loss": 1.1374, "step": 130 }, { "epoch": 0.21163166397415187, "grad_norm": 3.359861084585751, "learning_rate": 4.996008190757876e-06, "loss": 1.1943, "step": 131 }, { "epoch": 0.21324717285945072, "grad_norm": 2.984545832265968, "learning_rate": 4.995935961863849e-06, "loss": 1.0748, "step": 132 }, { "epoch": 0.2148626817447496, "grad_norm": 3.1995208307067426, "learning_rate": 4.995863085882566e-06, "loss": 1.0767, "step": 133 }, { "epoch": 0.21647819063004847, "grad_norm": 3.4990035025497654, "learning_rate": 4.99578956283292e-06, "loss": 1.1795, "step": 134 }, { "epoch": 0.21809369951534732, "grad_norm": 3.5751496734528336, "learning_rate": 4.9957153927339755e-06, "loss": 1.2557, "step": 135 }, { "epoch": 0.2197092084006462, "grad_norm": 2.9273168589758036, "learning_rate": 4.995640575604957e-06, "loss": 1.1243, "step": 136 }, { "epoch": 0.22132471728594508, "grad_norm": 2.7529893831293606, "learning_rate": 4.995565111465265e-06, "loss": 1.1369, "step": 137 }, { "epoch": 0.22294022617124395, "grad_norm": 3.3430927491662543, "learning_rate": 4.9954890003344634e-06, "loss": 1.1215, "step": 138 }, { "epoch": 0.2245557350565428, "grad_norm": 3.281650594100157, "learning_rate": 4.9954122422322825e-06, "loss": 1.0981, "step": 139 }, { "epoch": 0.22617124394184168, "grad_norm": 2.924522045519414, "learning_rate": 4.995334837178625e-06, "loss": 1.1411, "step": 140 }, { "epoch": 0.22778675282714056, "grad_norm": 2.9217030165898064, "learning_rate": 4.995256785193556e-06, "loss": 1.1063, "step": 141 }, { "epoch": 0.2294022617124394, "grad_norm": 3.0501234340492873, "learning_rate": 4.995178086297313e-06, "loss": 1.1347, "step": 142 }, { "epoch": 0.2310177705977383, "grad_norm": 3.204099177317036, "learning_rate": 4.995098740510299e-06, "loss": 1.0339, "step": 143 }, { "epoch": 0.23263327948303716, "grad_norm": 3.1007279543331934, "learning_rate": 4.995018747853084e-06, "loss": 1.0156, "step": 144 }, { "epoch": 0.23424878836833601, "grad_norm": 3.2102826662307815, "learning_rate": 4.994938108346407e-06, "loss": 1.0491, "step": 145 }, { "epoch": 0.2358642972536349, "grad_norm": 2.9374882061905163, "learning_rate": 4.994856822011175e-06, "loss": 1.0203, "step": 146 }, { "epoch": 0.23747980613893377, "grad_norm": 2.883575952940483, "learning_rate": 4.99477488886846e-06, "loss": 0.9553, "step": 147 }, { "epoch": 0.23909531502423265, "grad_norm": 3.123751785348791, "learning_rate": 4.994692308939506e-06, "loss": 1.0275, "step": 148 }, { "epoch": 0.2407108239095315, "grad_norm": 3.1410132375693314, "learning_rate": 4.994609082245721e-06, "loss": 1.0363, "step": 149 }, { "epoch": 0.24232633279483037, "grad_norm": 3.1695546965990378, "learning_rate": 4.994525208808683e-06, "loss": 1.1513, "step": 150 }, { "epoch": 0.24394184168012925, "grad_norm": 3.1324867947816935, "learning_rate": 4.994440688650135e-06, "loss": 1.1944, "step": 151 }, { "epoch": 0.2455573505654281, "grad_norm": 2.8023614632341807, "learning_rate": 4.994355521791992e-06, "loss": 0.9432, "step": 152 }, { "epoch": 0.24717285945072698, "grad_norm": 2.9711825901734463, "learning_rate": 4.994269708256332e-06, "loss": 0.9492, "step": 153 }, { "epoch": 0.24878836833602586, "grad_norm": 3.402348871931905, "learning_rate": 4.9941832480654026e-06, "loss": 1.1105, "step": 154 }, { "epoch": 0.25040387722132473, "grad_norm": 2.985175941699146, "learning_rate": 4.994096141241621e-06, "loss": 1.074, "step": 155 }, { "epoch": 0.2520193861066236, "grad_norm": 3.4943585763225204, "learning_rate": 4.994008387807569e-06, "loss": 1.2137, "step": 156 }, { "epoch": 0.25363489499192243, "grad_norm": 3.2068300912405214, "learning_rate": 4.993919987785996e-06, "loss": 1.1136, "step": 157 }, { "epoch": 0.2552504038772213, "grad_norm": 2.9950572603478998, "learning_rate": 4.9938309411998225e-06, "loss": 1.2826, "step": 158 }, { "epoch": 0.2568659127625202, "grad_norm": 3.3211158889202865, "learning_rate": 4.993741248072134e-06, "loss": 1.2898, "step": 159 }, { "epoch": 0.25848142164781907, "grad_norm": 3.2503327707715357, "learning_rate": 4.993650908426182e-06, "loss": 1.1371, "step": 160 }, { "epoch": 0.26009693053311794, "grad_norm": 3.055988305600261, "learning_rate": 4.99355992228539e-06, "loss": 1.1072, "step": 161 }, { "epoch": 0.2617124394184168, "grad_norm": 2.9561579194023238, "learning_rate": 4.9934682896733465e-06, "loss": 1.1007, "step": 162 }, { "epoch": 0.2633279483037157, "grad_norm": 3.194028257411168, "learning_rate": 4.993376010613806e-06, "loss": 1.0394, "step": 163 }, { "epoch": 0.2649434571890145, "grad_norm": 3.452528854972788, "learning_rate": 4.993283085130694e-06, "loss": 1.1107, "step": 164 }, { "epoch": 0.2665589660743134, "grad_norm": 3.2294156222349546, "learning_rate": 4.993189513248101e-06, "loss": 1.2772, "step": 165 }, { "epoch": 0.2681744749596123, "grad_norm": 3.151950837826212, "learning_rate": 4.993095294990288e-06, "loss": 1.0895, "step": 166 }, { "epoch": 0.26978998384491115, "grad_norm": 3.027470176689102, "learning_rate": 4.99300043038168e-06, "loss": 1.1443, "step": 167 }, { "epoch": 0.27140549273021003, "grad_norm": 2.9784561229721773, "learning_rate": 4.9929049194468715e-06, "loss": 1.1463, "step": 168 }, { "epoch": 0.2730210016155089, "grad_norm": 3.0455203498369605, "learning_rate": 4.992808762210624e-06, "loss": 1.0222, "step": 169 }, { "epoch": 0.27463651050080773, "grad_norm": 3.323278764778758, "learning_rate": 4.992711958697868e-06, "loss": 1.1321, "step": 170 }, { "epoch": 0.2762520193861066, "grad_norm": 3.389150501822644, "learning_rate": 4.9926145089337e-06, "loss": 1.1439, "step": 171 }, { "epoch": 0.2778675282714055, "grad_norm": 4.304303305564607, "learning_rate": 4.992516412943384e-06, "loss": 1.2796, "step": 172 }, { "epoch": 0.27948303715670436, "grad_norm": 3.2617291667588053, "learning_rate": 4.992417670752351e-06, "loss": 1.1177, "step": 173 }, { "epoch": 0.28109854604200324, "grad_norm": 3.5453349755335397, "learning_rate": 4.9923182823862035e-06, "loss": 1.2143, "step": 174 }, { "epoch": 0.2827140549273021, "grad_norm": 3.1902514068991774, "learning_rate": 4.992218247870706e-06, "loss": 1.1094, "step": 175 }, { "epoch": 0.284329563812601, "grad_norm": 2.6502621177757613, "learning_rate": 4.992117567231795e-06, "loss": 1.0183, "step": 176 }, { "epoch": 0.2859450726978998, "grad_norm": 3.085746042332267, "learning_rate": 4.9920162404955705e-06, "loss": 1.0126, "step": 177 }, { "epoch": 0.2875605815831987, "grad_norm": 3.023157872079741, "learning_rate": 4.991914267688302e-06, "loss": 1.1269, "step": 178 }, { "epoch": 0.28917609046849757, "grad_norm": 3.166807338703211, "learning_rate": 4.99181164883643e-06, "loss": 1.1083, "step": 179 }, { "epoch": 0.29079159935379645, "grad_norm": 3.306645008676022, "learning_rate": 4.991708383966556e-06, "loss": 1.0899, "step": 180 }, { "epoch": 0.2924071082390953, "grad_norm": 3.126584118983982, "learning_rate": 4.9916044731054524e-06, "loss": 1.1525, "step": 181 }, { "epoch": 0.2940226171243942, "grad_norm": 3.3897941382718204, "learning_rate": 4.99149991628006e-06, "loss": 1.0911, "step": 182 }, { "epoch": 0.2956381260096931, "grad_norm": 2.926145529698814, "learning_rate": 4.991394713517485e-06, "loss": 1.0193, "step": 183 }, { "epoch": 0.2972536348949919, "grad_norm": 3.0771173750180156, "learning_rate": 4.991288864845002e-06, "loss": 1.0551, "step": 184 }, { "epoch": 0.2988691437802908, "grad_norm": 2.8887869683971736, "learning_rate": 4.991182370290053e-06, "loss": 1.1082, "step": 185 }, { "epoch": 0.30048465266558966, "grad_norm": 3.3436473359838015, "learning_rate": 4.991075229880247e-06, "loss": 1.1807, "step": 186 }, { "epoch": 0.30210016155088854, "grad_norm": 3.735187833178655, "learning_rate": 4.990967443643361e-06, "loss": 1.2685, "step": 187 }, { "epoch": 0.3037156704361874, "grad_norm": 4.008637526173641, "learning_rate": 4.990859011607338e-06, "loss": 1.1402, "step": 188 }, { "epoch": 0.3053311793214863, "grad_norm": 3.3951126298112997, "learning_rate": 4.990749933800294e-06, "loss": 1.2365, "step": 189 }, { "epoch": 0.3069466882067851, "grad_norm": 3.313224828737308, "learning_rate": 4.990640210250503e-06, "loss": 1.1064, "step": 190 }, { "epoch": 0.308562197092084, "grad_norm": 2.6879912259394416, "learning_rate": 4.9905298409864145e-06, "loss": 1.131, "step": 191 }, { "epoch": 0.31017770597738287, "grad_norm": 3.282040007895537, "learning_rate": 4.990418826036642e-06, "loss": 1.1639, "step": 192 }, { "epoch": 0.31179321486268174, "grad_norm": 3.5174595365457706, "learning_rate": 4.9903071654299666e-06, "loss": 1.2711, "step": 193 }, { "epoch": 0.3134087237479806, "grad_norm": 2.5763388872911452, "learning_rate": 4.990194859195336e-06, "loss": 0.9667, "step": 194 }, { "epoch": 0.3150242326332795, "grad_norm": 2.794620898200171, "learning_rate": 4.990081907361867e-06, "loss": 1.0119, "step": 195 }, { "epoch": 0.3166397415185784, "grad_norm": 3.32010673048294, "learning_rate": 4.9899683099588445e-06, "loss": 1.1213, "step": 196 }, { "epoch": 0.3182552504038772, "grad_norm": 2.721581958327973, "learning_rate": 4.989854067015717e-06, "loss": 1.0181, "step": 197 }, { "epoch": 0.3198707592891761, "grad_norm": 2.7665686720839022, "learning_rate": 4.989739178562105e-06, "loss": 0.9642, "step": 198 }, { "epoch": 0.32148626817447495, "grad_norm": 3.195804074881364, "learning_rate": 4.989623644627792e-06, "loss": 1.0124, "step": 199 }, { "epoch": 0.32310177705977383, "grad_norm": 3.726656804841075, "learning_rate": 4.989507465242732e-06, "loss": 1.2267, "step": 200 }, { "epoch": 0.32310177705977383, "eval_loss": 1.1317776441574097, "eval_runtime": 2.3767, "eval_samples_per_second": 126.225, "eval_steps_per_second": 2.945, "step": 200 }, { "epoch": 0.3247172859450727, "grad_norm": 2.8479163016958364, "learning_rate": 4.989390640437046e-06, "loss": 1.0163, "step": 201 }, { "epoch": 0.3263327948303716, "grad_norm": 2.8488757005790144, "learning_rate": 4.9892731702410195e-06, "loss": 0.9996, "step": 202 }, { "epoch": 0.32794830371567046, "grad_norm": 2.8666077513141075, "learning_rate": 4.9891550546851084e-06, "loss": 1.1124, "step": 203 }, { "epoch": 0.3295638126009693, "grad_norm": 3.1899510038526593, "learning_rate": 4.989036293799936e-06, "loss": 1.0879, "step": 204 }, { "epoch": 0.33117932148626816, "grad_norm": 3.1349035061838215, "learning_rate": 4.988916887616291e-06, "loss": 1.1483, "step": 205 }, { "epoch": 0.33279483037156704, "grad_norm": 2.923893251497, "learning_rate": 4.988796836165129e-06, "loss": 1.0678, "step": 206 }, { "epoch": 0.3344103392568659, "grad_norm": 2.9277408066773143, "learning_rate": 4.988676139477576e-06, "loss": 0.9886, "step": 207 }, { "epoch": 0.3360258481421648, "grad_norm": 2.3782832137285634, "learning_rate": 4.988554797584922e-06, "loss": 0.994, "step": 208 }, { "epoch": 0.3376413570274637, "grad_norm": 2.9210173283880327, "learning_rate": 4.988432810518626e-06, "loss": 1.0194, "step": 209 }, { "epoch": 0.3392568659127625, "grad_norm": 3.1740012138590368, "learning_rate": 4.988310178310315e-06, "loss": 1.1115, "step": 210 }, { "epoch": 0.3408723747980614, "grad_norm": 3.2489601905198375, "learning_rate": 4.98818690099178e-06, "loss": 1.0188, "step": 211 }, { "epoch": 0.34248788368336025, "grad_norm": 2.8250494110835236, "learning_rate": 4.988062978594984e-06, "loss": 1.0623, "step": 212 }, { "epoch": 0.3441033925686591, "grad_norm": 2.9060146357902457, "learning_rate": 4.987938411152053e-06, "loss": 1.0698, "step": 213 }, { "epoch": 0.345718901453958, "grad_norm": 3.0378204243349427, "learning_rate": 4.9878131986952825e-06, "loss": 1.0974, "step": 214 }, { "epoch": 0.3473344103392569, "grad_norm": 2.8713443599860264, "learning_rate": 4.987687341257135e-06, "loss": 0.8745, "step": 215 }, { "epoch": 0.34894991922455576, "grad_norm": 3.323253789041301, "learning_rate": 4.987560838870238e-06, "loss": 1.1173, "step": 216 }, { "epoch": 0.3505654281098546, "grad_norm": 3.5739209793531512, "learning_rate": 4.9874336915673895e-06, "loss": 1.0282, "step": 217 }, { "epoch": 0.35218093699515346, "grad_norm": 2.6751915429670254, "learning_rate": 4.987305899381554e-06, "loss": 1.092, "step": 218 }, { "epoch": 0.35379644588045234, "grad_norm": 3.7651869185846767, "learning_rate": 4.9871774623458606e-06, "loss": 1.094, "step": 219 }, { "epoch": 0.3554119547657512, "grad_norm": 2.7243647126432284, "learning_rate": 4.987048380493609e-06, "loss": 1.0281, "step": 220 }, { "epoch": 0.3570274636510501, "grad_norm": 3.1651227370407264, "learning_rate": 4.986918653858264e-06, "loss": 1.045, "step": 221 }, { "epoch": 0.35864297253634897, "grad_norm": 4.968776979957011, "learning_rate": 4.986788282473457e-06, "loss": 1.2405, "step": 222 }, { "epoch": 0.3602584814216478, "grad_norm": 2.8764948149376917, "learning_rate": 4.986657266372988e-06, "loss": 1.1605, "step": 223 }, { "epoch": 0.36187399030694667, "grad_norm": 3.278853153387758, "learning_rate": 4.986525605590825e-06, "loss": 1.1747, "step": 224 }, { "epoch": 0.36348949919224555, "grad_norm": 3.4636963847879065, "learning_rate": 4.9863933001611015e-06, "loss": 1.1153, "step": 225 }, { "epoch": 0.3651050080775444, "grad_norm": 3.3631127239609278, "learning_rate": 4.9862603501181174e-06, "loss": 1.104, "step": 226 }, { "epoch": 0.3667205169628433, "grad_norm": 4.3367202050554114, "learning_rate": 4.986126755496342e-06, "loss": 1.12, "step": 227 }, { "epoch": 0.3683360258481422, "grad_norm": 3.4356976481346124, "learning_rate": 4.98599251633041e-06, "loss": 1.0999, "step": 228 }, { "epoch": 0.36995153473344106, "grad_norm": 3.038325209887424, "learning_rate": 4.9858576326551245e-06, "loss": 0.9788, "step": 229 }, { "epoch": 0.3715670436187399, "grad_norm": 3.1977576622335255, "learning_rate": 4.9857221045054535e-06, "loss": 1.1442, "step": 230 }, { "epoch": 0.37318255250403876, "grad_norm": 3.054854279924882, "learning_rate": 4.9855859319165355e-06, "loss": 1.0412, "step": 231 }, { "epoch": 0.37479806138933763, "grad_norm": 3.3489120064907176, "learning_rate": 4.985449114923672e-06, "loss": 1.1984, "step": 232 }, { "epoch": 0.3764135702746365, "grad_norm": 3.194524141612492, "learning_rate": 4.985311653562334e-06, "loss": 1.0369, "step": 233 }, { "epoch": 0.3780290791599354, "grad_norm": 3.151564583577931, "learning_rate": 4.985173547868161e-06, "loss": 1.149, "step": 234 }, { "epoch": 0.37964458804523427, "grad_norm": 2.8204463371106137, "learning_rate": 4.985034797876957e-06, "loss": 0.943, "step": 235 }, { "epoch": 0.38126009693053314, "grad_norm": 3.4350703293789207, "learning_rate": 4.9848954036246934e-06, "loss": 1.1432, "step": 236 }, { "epoch": 0.38287560581583197, "grad_norm": 3.0432021857413396, "learning_rate": 4.984755365147509e-06, "loss": 1.0017, "step": 237 }, { "epoch": 0.38449111470113084, "grad_norm": 3.056488190052054, "learning_rate": 4.9846146824817095e-06, "loss": 1.0467, "step": 238 }, { "epoch": 0.3861066235864297, "grad_norm": 2.8831979559298695, "learning_rate": 4.984473355663768e-06, "loss": 1.0706, "step": 239 }, { "epoch": 0.3877221324717286, "grad_norm": 2.8887477582069856, "learning_rate": 4.984331384730325e-06, "loss": 1.0831, "step": 240 }, { "epoch": 0.3893376413570275, "grad_norm": 3.0913687441723434, "learning_rate": 4.984188769718186e-06, "loss": 1.1143, "step": 241 }, { "epoch": 0.39095315024232635, "grad_norm": 3.1619341768577454, "learning_rate": 4.984045510664326e-06, "loss": 1.1371, "step": 242 }, { "epoch": 0.3925686591276252, "grad_norm": 3.9406956370272432, "learning_rate": 4.983901607605886e-06, "loss": 1.2354, "step": 243 }, { "epoch": 0.39418416801292405, "grad_norm": 3.642326774259985, "learning_rate": 4.983757060580173e-06, "loss": 1.2839, "step": 244 }, { "epoch": 0.39579967689822293, "grad_norm": 2.863255458419629, "learning_rate": 4.983611869624662e-06, "loss": 1.0131, "step": 245 }, { "epoch": 0.3974151857835218, "grad_norm": 2.868204623949919, "learning_rate": 4.983466034776994e-06, "loss": 1.0678, "step": 246 }, { "epoch": 0.3990306946688207, "grad_norm": 2.9253218786509647, "learning_rate": 4.983319556074979e-06, "loss": 1.0148, "step": 247 }, { "epoch": 0.40064620355411956, "grad_norm": 2.922492445927329, "learning_rate": 4.9831724335565915e-06, "loss": 1.0984, "step": 248 }, { "epoch": 0.40226171243941844, "grad_norm": 2.7239176737805733, "learning_rate": 4.983024667259975e-06, "loss": 0.9928, "step": 249 }, { "epoch": 0.40387722132471726, "grad_norm": 3.113571731709937, "learning_rate": 4.982876257223437e-06, "loss": 1.0936, "step": 250 }, { "epoch": 0.40549273021001614, "grad_norm": 3.3154109832067356, "learning_rate": 4.982727203485456e-06, "loss": 1.1879, "step": 251 }, { "epoch": 0.407108239095315, "grad_norm": 3.4786539752100327, "learning_rate": 4.9825775060846735e-06, "loss": 1.104, "step": 252 }, { "epoch": 0.4087237479806139, "grad_norm": 2.696550158608926, "learning_rate": 4.9824271650599e-06, "loss": 0.9949, "step": 253 }, { "epoch": 0.41033925686591277, "grad_norm": 3.5545775208931447, "learning_rate": 4.982276180450113e-06, "loss": 1.1524, "step": 254 }, { "epoch": 0.41195476575121165, "grad_norm": 2.980687824588055, "learning_rate": 4.982124552294454e-06, "loss": 0.9061, "step": 255 }, { "epoch": 0.4135702746365105, "grad_norm": 2.943325078528236, "learning_rate": 4.9819722806322365e-06, "loss": 1.0896, "step": 256 }, { "epoch": 0.41518578352180935, "grad_norm": 2.7202114728294218, "learning_rate": 4.981819365502936e-06, "loss": 0.9696, "step": 257 }, { "epoch": 0.4168012924071082, "grad_norm": 3.472920258516267, "learning_rate": 4.9816658069461976e-06, "loss": 1.0875, "step": 258 }, { "epoch": 0.4184168012924071, "grad_norm": 3.9692682782950923, "learning_rate": 4.981511605001832e-06, "loss": 1.1469, "step": 259 }, { "epoch": 0.420032310177706, "grad_norm": 3.404488474375711, "learning_rate": 4.981356759709817e-06, "loss": 1.1896, "step": 260 }, { "epoch": 0.42164781906300486, "grad_norm": 3.2709688806580988, "learning_rate": 4.981201271110297e-06, "loss": 1.0698, "step": 261 }, { "epoch": 0.42326332794830374, "grad_norm": 3.058176550224811, "learning_rate": 4.981045139243584e-06, "loss": 1.1466, "step": 262 }, { "epoch": 0.42487883683360256, "grad_norm": 2.983413006893194, "learning_rate": 4.980888364150156e-06, "loss": 1.0463, "step": 263 }, { "epoch": 0.42649434571890144, "grad_norm": 2.904992697999748, "learning_rate": 4.980730945870656e-06, "loss": 1.0426, "step": 264 }, { "epoch": 0.4281098546042003, "grad_norm": 3.184964705068802, "learning_rate": 4.9805728844459e-06, "loss": 1.0508, "step": 265 }, { "epoch": 0.4297253634894992, "grad_norm": 3.383195846220801, "learning_rate": 4.980414179916863e-06, "loss": 1.1138, "step": 266 }, { "epoch": 0.43134087237479807, "grad_norm": 3.221978819215262, "learning_rate": 4.98025483232469e-06, "loss": 1.0465, "step": 267 }, { "epoch": 0.43295638126009695, "grad_norm": 2.913830981107215, "learning_rate": 4.980094841710696e-06, "loss": 0.9662, "step": 268 }, { "epoch": 0.4345718901453958, "grad_norm": 3.055288893131654, "learning_rate": 4.979934208116355e-06, "loss": 1.0965, "step": 269 }, { "epoch": 0.43618739903069464, "grad_norm": 3.232863298433014, "learning_rate": 4.979772931583316e-06, "loss": 1.113, "step": 270 }, { "epoch": 0.4378029079159935, "grad_norm": 3.7701760916767255, "learning_rate": 4.97961101215339e-06, "loss": 1.1845, "step": 271 }, { "epoch": 0.4394184168012924, "grad_norm": 3.0711507586606728, "learning_rate": 4.979448449868556e-06, "loss": 1.0753, "step": 272 }, { "epoch": 0.4410339256865913, "grad_norm": 3.0645535352268745, "learning_rate": 4.979285244770958e-06, "loss": 1.1602, "step": 273 }, { "epoch": 0.44264943457189015, "grad_norm": 3.236421538857654, "learning_rate": 4.979121396902909e-06, "loss": 1.0933, "step": 274 }, { "epoch": 0.44426494345718903, "grad_norm": 3.0586290639472486, "learning_rate": 4.978956906306886e-06, "loss": 1.0216, "step": 275 }, { "epoch": 0.4458804523424879, "grad_norm": 3.425935832150902, "learning_rate": 4.9787917730255366e-06, "loss": 1.1833, "step": 276 }, { "epoch": 0.44749596122778673, "grad_norm": 3.071557711154897, "learning_rate": 4.978625997101671e-06, "loss": 1.1378, "step": 277 }, { "epoch": 0.4491114701130856, "grad_norm": 3.5601597989762506, "learning_rate": 4.978459578578269e-06, "loss": 1.1794, "step": 278 }, { "epoch": 0.4507269789983845, "grad_norm": 3.313072644496898, "learning_rate": 4.978292517498475e-06, "loss": 1.0517, "step": 279 }, { "epoch": 0.45234248788368336, "grad_norm": 3.169795603919028, "learning_rate": 4.978124813905599e-06, "loss": 1.0471, "step": 280 }, { "epoch": 0.45395799676898224, "grad_norm": 3.4688071198971344, "learning_rate": 4.977956467843122e-06, "loss": 1.0361, "step": 281 }, { "epoch": 0.4555735056542811, "grad_norm": 3.237625505113707, "learning_rate": 4.9777874793546874e-06, "loss": 1.06, "step": 282 }, { "epoch": 0.45718901453957994, "grad_norm": 2.933061263202983, "learning_rate": 4.977617848484107e-06, "loss": 1.1235, "step": 283 }, { "epoch": 0.4588045234248788, "grad_norm": 2.945517148756499, "learning_rate": 4.977447575275358e-06, "loss": 0.9997, "step": 284 }, { "epoch": 0.4604200323101777, "grad_norm": 3.218475183181675, "learning_rate": 4.9772766597725854e-06, "loss": 1.0836, "step": 285 }, { "epoch": 0.4620355411954766, "grad_norm": 2.786121479276717, "learning_rate": 4.977105102020099e-06, "loss": 1.0388, "step": 286 }, { "epoch": 0.46365105008077545, "grad_norm": 2.9786178570721846, "learning_rate": 4.976932902062378e-06, "loss": 1.0354, "step": 287 }, { "epoch": 0.46526655896607433, "grad_norm": 3.394463490798878, "learning_rate": 4.976760059944064e-06, "loss": 1.0647, "step": 288 }, { "epoch": 0.4668820678513732, "grad_norm": 2.787617622884442, "learning_rate": 4.97658657570997e-06, "loss": 0.9768, "step": 289 }, { "epoch": 0.46849757673667203, "grad_norm": 3.1266701647501596, "learning_rate": 4.976412449405072e-06, "loss": 1.0843, "step": 290 }, { "epoch": 0.4701130856219709, "grad_norm": 3.156578709943947, "learning_rate": 4.976237681074514e-06, "loss": 1.0802, "step": 291 }, { "epoch": 0.4717285945072698, "grad_norm": 2.931076853909443, "learning_rate": 4.976062270763603e-06, "loss": 1.0794, "step": 292 }, { "epoch": 0.47334410339256866, "grad_norm": 3.048635879965205, "learning_rate": 4.975886218517818e-06, "loss": 1.1432, "step": 293 }, { "epoch": 0.47495961227786754, "grad_norm": 3.7326434522258256, "learning_rate": 4.975709524382801e-06, "loss": 1.1498, "step": 294 }, { "epoch": 0.4765751211631664, "grad_norm": 3.004596254627448, "learning_rate": 4.97553218840436e-06, "loss": 1.0407, "step": 295 }, { "epoch": 0.4781906300484653, "grad_norm": 2.8933038593910094, "learning_rate": 4.975354210628473e-06, "loss": 1.0216, "step": 296 }, { "epoch": 0.4798061389337641, "grad_norm": 3.2571938542971006, "learning_rate": 4.975175591101279e-06, "loss": 1.1346, "step": 297 }, { "epoch": 0.481421647819063, "grad_norm": 3.6405909114144004, "learning_rate": 4.9749963298690875e-06, "loss": 1.1091, "step": 298 }, { "epoch": 0.48303715670436187, "grad_norm": 3.0319207186882564, "learning_rate": 4.974816426978374e-06, "loss": 1.2118, "step": 299 }, { "epoch": 0.48465266558966075, "grad_norm": 3.0406741265187334, "learning_rate": 4.974635882475778e-06, "loss": 0.9879, "step": 300 }, { "epoch": 0.4862681744749596, "grad_norm": 3.3575308611016754, "learning_rate": 4.974454696408108e-06, "loss": 1.0301, "step": 301 }, { "epoch": 0.4878836833602585, "grad_norm": 3.18197213549727, "learning_rate": 4.974272868822335e-06, "loss": 1.1336, "step": 302 }, { "epoch": 0.4894991922455573, "grad_norm": 2.786538182604595, "learning_rate": 4.974090399765603e-06, "loss": 1.1205, "step": 303 }, { "epoch": 0.4911147011308562, "grad_norm": 3.459096390847017, "learning_rate": 4.973907289285216e-06, "loss": 1.0535, "step": 304 }, { "epoch": 0.4927302100161551, "grad_norm": 2.974348985114133, "learning_rate": 4.973723537428647e-06, "loss": 0.92, "step": 305 }, { "epoch": 0.49434571890145396, "grad_norm": 3.2799613752414922, "learning_rate": 4.9735391442435334e-06, "loss": 1.0874, "step": 306 }, { "epoch": 0.49596122778675283, "grad_norm": 2.86687694775561, "learning_rate": 4.973354109777681e-06, "loss": 1.1097, "step": 307 }, { "epoch": 0.4975767366720517, "grad_norm": 3.5717257695010614, "learning_rate": 4.9731684340790635e-06, "loss": 1.0663, "step": 308 }, { "epoch": 0.4991922455573506, "grad_norm": 3.443662555065265, "learning_rate": 4.972982117195816e-06, "loss": 1.277, "step": 309 }, { "epoch": 0.5008077544426495, "grad_norm": 3.0030085318542024, "learning_rate": 4.972795159176244e-06, "loss": 1.1608, "step": 310 }, { "epoch": 0.5024232633279483, "grad_norm": 2.931562892762512, "learning_rate": 4.9726075600688154e-06, "loss": 1.1851, "step": 311 }, { "epoch": 0.5040387722132472, "grad_norm": 3.2323154936112988, "learning_rate": 4.972419319922168e-06, "loss": 1.0714, "step": 312 }, { "epoch": 0.505654281098546, "grad_norm": 2.6078515789212497, "learning_rate": 4.972230438785104e-06, "loss": 0.8866, "step": 313 }, { "epoch": 0.5072697899838449, "grad_norm": 3.092078480100944, "learning_rate": 4.972040916706591e-06, "loss": 1.0595, "step": 314 }, { "epoch": 0.5088852988691438, "grad_norm": 2.754557826140931, "learning_rate": 4.971850753735766e-06, "loss": 1.0332, "step": 315 }, { "epoch": 0.5105008077544426, "grad_norm": 2.787496752494238, "learning_rate": 4.971659949921928e-06, "loss": 0.99, "step": 316 }, { "epoch": 0.5121163166397416, "grad_norm": 3.2204937307497032, "learning_rate": 4.971468505314545e-06, "loss": 1.0202, "step": 317 }, { "epoch": 0.5137318255250404, "grad_norm": 3.16119076128899, "learning_rate": 4.97127641996325e-06, "loss": 1.1486, "step": 318 }, { "epoch": 0.5153473344103393, "grad_norm": 2.9464993042711733, "learning_rate": 4.971083693917843e-06, "loss": 1.1484, "step": 319 }, { "epoch": 0.5169628432956381, "grad_norm": 3.287797549682093, "learning_rate": 4.970890327228288e-06, "loss": 1.0882, "step": 320 }, { "epoch": 0.518578352180937, "grad_norm": 3.492733161498165, "learning_rate": 4.9706963199447174e-06, "loss": 1.1872, "step": 321 }, { "epoch": 0.5201938610662359, "grad_norm": 3.1842991870636927, "learning_rate": 4.9705016721174295e-06, "loss": 1.1484, "step": 322 }, { "epoch": 0.5218093699515347, "grad_norm": 2.9328853799572943, "learning_rate": 4.9703063837968865e-06, "loss": 1.0487, "step": 323 }, { "epoch": 0.5234248788368336, "grad_norm": 3.01774791853161, "learning_rate": 4.97011045503372e-06, "loss": 1.1024, "step": 324 }, { "epoch": 0.5250403877221325, "grad_norm": 2.910546782051953, "learning_rate": 4.969913885878723e-06, "loss": 1.1162, "step": 325 }, { "epoch": 0.5266558966074314, "grad_norm": 3.1633916051667303, "learning_rate": 4.9697166763828606e-06, "loss": 1.1809, "step": 326 }, { "epoch": 0.5282714054927302, "grad_norm": 2.9588462854184963, "learning_rate": 4.969518826597258e-06, "loss": 0.9509, "step": 327 }, { "epoch": 0.529886914378029, "grad_norm": 3.205175160891202, "learning_rate": 4.969320336573211e-06, "loss": 1.1394, "step": 328 }, { "epoch": 0.531502423263328, "grad_norm": 2.90335701468703, "learning_rate": 4.969121206362177e-06, "loss": 1.0817, "step": 329 }, { "epoch": 0.5331179321486268, "grad_norm": 3.11072083081614, "learning_rate": 4.968921436015785e-06, "loss": 1.0809, "step": 330 }, { "epoch": 0.5347334410339257, "grad_norm": 2.972360375781754, "learning_rate": 4.968721025585824e-06, "loss": 1.0469, "step": 331 }, { "epoch": 0.5363489499192245, "grad_norm": 2.7366642273418185, "learning_rate": 4.968519975124252e-06, "loss": 1.0863, "step": 332 }, { "epoch": 0.5379644588045234, "grad_norm": 2.559418479784477, "learning_rate": 4.968318284683195e-06, "loss": 0.9166, "step": 333 }, { "epoch": 0.5395799676898223, "grad_norm": 2.8265709654930515, "learning_rate": 4.968115954314939e-06, "loss": 1.0537, "step": 334 }, { "epoch": 0.5411954765751211, "grad_norm": 3.4889652653184804, "learning_rate": 4.967912984071941e-06, "loss": 1.1491, "step": 335 }, { "epoch": 0.5428109854604201, "grad_norm": 3.2540230829521253, "learning_rate": 4.967709374006824e-06, "loss": 1.1966, "step": 336 }, { "epoch": 0.5444264943457189, "grad_norm": 3.143813987443097, "learning_rate": 4.967505124172373e-06, "loss": 1.0955, "step": 337 }, { "epoch": 0.5460420032310178, "grad_norm": 3.1275826516699277, "learning_rate": 4.967300234621541e-06, "loss": 1.0774, "step": 338 }, { "epoch": 0.5476575121163166, "grad_norm": 3.1784660569008323, "learning_rate": 4.967094705407449e-06, "loss": 1.1078, "step": 339 }, { "epoch": 0.5492730210016155, "grad_norm": 3.1933260600908095, "learning_rate": 4.966888536583379e-06, "loss": 1.2559, "step": 340 }, { "epoch": 0.5508885298869144, "grad_norm": 2.9804499459810776, "learning_rate": 4.966681728202785e-06, "loss": 1.1579, "step": 341 }, { "epoch": 0.5525040387722132, "grad_norm": 3.10310417112114, "learning_rate": 4.96647428031928e-06, "loss": 1.119, "step": 342 }, { "epoch": 0.5541195476575121, "grad_norm": 2.743315774481981, "learning_rate": 4.966266192986648e-06, "loss": 1.1039, "step": 343 }, { "epoch": 0.555735056542811, "grad_norm": 3.4374457096246074, "learning_rate": 4.966057466258837e-06, "loss": 1.1114, "step": 344 }, { "epoch": 0.5573505654281099, "grad_norm": 3.095786015620268, "learning_rate": 4.965848100189959e-06, "loss": 1.0991, "step": 345 }, { "epoch": 0.5589660743134087, "grad_norm": 3.3122457122200335, "learning_rate": 4.965638094834296e-06, "loss": 1.1587, "step": 346 }, { "epoch": 0.5605815831987075, "grad_norm": 2.8196827222578875, "learning_rate": 4.965427450246291e-06, "loss": 0.9715, "step": 347 }, { "epoch": 0.5621970920840065, "grad_norm": 2.7417185652563805, "learning_rate": 4.965216166480556e-06, "loss": 1.0653, "step": 348 }, { "epoch": 0.5638126009693053, "grad_norm": 3.0950638242012842, "learning_rate": 4.965004243591869e-06, "loss": 1.1241, "step": 349 }, { "epoch": 0.5654281098546042, "grad_norm": 3.0497467453363436, "learning_rate": 4.96479168163517e-06, "loss": 1.0633, "step": 350 }, { "epoch": 0.567043618739903, "grad_norm": 3.249364670729519, "learning_rate": 4.964578480665568e-06, "loss": 1.0288, "step": 351 }, { "epoch": 0.568659127625202, "grad_norm": 3.2052739952284646, "learning_rate": 4.9643646407383365e-06, "loss": 1.0548, "step": 352 }, { "epoch": 0.5702746365105008, "grad_norm": 3.2584436769747094, "learning_rate": 4.964150161908915e-06, "loss": 1.0522, "step": 353 }, { "epoch": 0.5718901453957996, "grad_norm": 3.0848942404317006, "learning_rate": 4.963935044232909e-06, "loss": 1.1043, "step": 354 }, { "epoch": 0.5735056542810986, "grad_norm": 3.3387642272770446, "learning_rate": 4.9637192877660886e-06, "loss": 1.0323, "step": 355 }, { "epoch": 0.5751211631663974, "grad_norm": 2.9912218374736947, "learning_rate": 4.96350289256439e-06, "loss": 0.914, "step": 356 }, { "epoch": 0.5767366720516963, "grad_norm": 2.9503277384852344, "learning_rate": 4.963285858683915e-06, "loss": 1.1585, "step": 357 }, { "epoch": 0.5783521809369951, "grad_norm": 3.1801957634225437, "learning_rate": 4.963068186180932e-06, "loss": 1.0961, "step": 358 }, { "epoch": 0.5799676898222941, "grad_norm": 16.57435172998196, "learning_rate": 4.962849875111873e-06, "loss": 1.1235, "step": 359 }, { "epoch": 0.5815831987075929, "grad_norm": 3.2476651714027645, "learning_rate": 4.962630925533335e-06, "loss": 1.069, "step": 360 }, { "epoch": 0.5831987075928917, "grad_norm": 3.09676067154371, "learning_rate": 4.962411337502086e-06, "loss": 1.0273, "step": 361 }, { "epoch": 0.5848142164781907, "grad_norm": 3.000862501490868, "learning_rate": 4.9621911110750515e-06, "loss": 1.0879, "step": 362 }, { "epoch": 0.5864297253634895, "grad_norm": 3.087399392823132, "learning_rate": 4.961970246309329e-06, "loss": 1.1207, "step": 363 }, { "epoch": 0.5880452342487884, "grad_norm": 2.751393896501964, "learning_rate": 4.961748743262179e-06, "loss": 1.0522, "step": 364 }, { "epoch": 0.5896607431340872, "grad_norm": 2.8012080380832987, "learning_rate": 4.961526601991027e-06, "loss": 0.9896, "step": 365 }, { "epoch": 0.5912762520193862, "grad_norm": 2.7758554724805946, "learning_rate": 4.961303822553464e-06, "loss": 0.9063, "step": 366 }, { "epoch": 0.592891760904685, "grad_norm": 3.044030911824307, "learning_rate": 4.961080405007248e-06, "loss": 1.0952, "step": 367 }, { "epoch": 0.5945072697899838, "grad_norm": 3.1712965394734645, "learning_rate": 4.960856349410301e-06, "loss": 1.0729, "step": 368 }, { "epoch": 0.5961227786752827, "grad_norm": 2.720498799775322, "learning_rate": 4.960631655820711e-06, "loss": 1.0293, "step": 369 }, { "epoch": 0.5977382875605816, "grad_norm": 3.2459619561532174, "learning_rate": 4.960406324296731e-06, "loss": 1.0468, "step": 370 }, { "epoch": 0.5993537964458805, "grad_norm": 3.959277898298409, "learning_rate": 4.96018035489678e-06, "loss": 1.1879, "step": 371 }, { "epoch": 0.6009693053311793, "grad_norm": 3.2520219511606596, "learning_rate": 4.9599537476794415e-06, "loss": 1.0168, "step": 372 }, { "epoch": 0.6025848142164781, "grad_norm": 2.866669856081564, "learning_rate": 4.959726502703465e-06, "loss": 1.0184, "step": 373 }, { "epoch": 0.6042003231017771, "grad_norm": 3.313387420802499, "learning_rate": 4.959498620027766e-06, "loss": 1.0966, "step": 374 }, { "epoch": 0.6058158319870759, "grad_norm": 3.242272621520551, "learning_rate": 4.959270099711423e-06, "loss": 1.0129, "step": 375 }, { "epoch": 0.6074313408723748, "grad_norm": 2.5001053824477095, "learning_rate": 4.959040941813682e-06, "loss": 0.9176, "step": 376 }, { "epoch": 0.6090468497576736, "grad_norm": 2.9227559643627212, "learning_rate": 4.9588111463939544e-06, "loss": 0.9353, "step": 377 }, { "epoch": 0.6106623586429726, "grad_norm": 2.609433646070067, "learning_rate": 4.9585807135118155e-06, "loss": 0.9156, "step": 378 }, { "epoch": 0.6122778675282714, "grad_norm": 2.876521099437984, "learning_rate": 4.958349643227008e-06, "loss": 1.0617, "step": 379 }, { "epoch": 0.6138933764135702, "grad_norm": 3.192869018703093, "learning_rate": 4.958117935599435e-06, "loss": 1.0929, "step": 380 }, { "epoch": 0.6155088852988692, "grad_norm": 3.2808530601635257, "learning_rate": 4.95788559068917e-06, "loss": 1.0667, "step": 381 }, { "epoch": 0.617124394184168, "grad_norm": 3.133283059990581, "learning_rate": 4.95765260855645e-06, "loss": 0.9373, "step": 382 }, { "epoch": 0.6187399030694669, "grad_norm": 2.994704126787007, "learning_rate": 4.957418989261678e-06, "loss": 0.9812, "step": 383 }, { "epoch": 0.6203554119547657, "grad_norm": 2.6928281932427183, "learning_rate": 4.95718473286542e-06, "loss": 0.9697, "step": 384 }, { "epoch": 0.6219709208400647, "grad_norm": 2.699097261653262, "learning_rate": 4.956949839428408e-06, "loss": 0.9733, "step": 385 }, { "epoch": 0.6235864297253635, "grad_norm": 3.379601747329385, "learning_rate": 4.956714309011541e-06, "loss": 1.1029, "step": 386 }, { "epoch": 0.6252019386106623, "grad_norm": 3.356079024017897, "learning_rate": 4.956478141675881e-06, "loss": 1.1496, "step": 387 }, { "epoch": 0.6268174474959612, "grad_norm": 3.336287510310863, "learning_rate": 4.956241337482657e-06, "loss": 1.0365, "step": 388 }, { "epoch": 0.6284329563812601, "grad_norm": 3.0486342908345367, "learning_rate": 4.95600389649326e-06, "loss": 1.0132, "step": 389 }, { "epoch": 0.630048465266559, "grad_norm": 3.6110093783862083, "learning_rate": 4.9557658187692494e-06, "loss": 1.1168, "step": 390 }, { "epoch": 0.6316639741518578, "grad_norm": 3.2870646652541518, "learning_rate": 4.9555271043723475e-06, "loss": 1.0853, "step": 391 }, { "epoch": 0.6332794830371568, "grad_norm": 3.0269862681272164, "learning_rate": 4.955287753364444e-06, "loss": 1.0649, "step": 392 }, { "epoch": 0.6348949919224556, "grad_norm": 3.100906248499616, "learning_rate": 4.955047765807591e-06, "loss": 1.1423, "step": 393 }, { "epoch": 0.6365105008077544, "grad_norm": 3.4334657835026743, "learning_rate": 4.954807141764007e-06, "loss": 1.0517, "step": 394 }, { "epoch": 0.6381260096930533, "grad_norm": 3.7464233628416483, "learning_rate": 4.9545658812960755e-06, "loss": 1.0543, "step": 395 }, { "epoch": 0.6397415185783522, "grad_norm": 2.9443358895082663, "learning_rate": 4.954323984466344e-06, "loss": 0.9645, "step": 396 }, { "epoch": 0.6413570274636511, "grad_norm": 2.9124101262858377, "learning_rate": 4.954081451337528e-06, "loss": 0.9554, "step": 397 }, { "epoch": 0.6429725363489499, "grad_norm": 3.0339188058564845, "learning_rate": 4.953838281972502e-06, "loss": 1.1316, "step": 398 }, { "epoch": 0.6445880452342488, "grad_norm": 2.939559009455548, "learning_rate": 4.953594476434314e-06, "loss": 1.0825, "step": 399 }, { "epoch": 0.6462035541195477, "grad_norm": 3.274940204962926, "learning_rate": 4.953350034786168e-06, "loss": 1.1297, "step": 400 }, { "epoch": 0.6462035541195477, "eval_loss": 1.0828090906143188, "eval_runtime": 2.3497, "eval_samples_per_second": 127.673, "eval_steps_per_second": 2.979, "step": 400 }, { "epoch": 0.6478190630048465, "grad_norm": 3.6093842755342878, "learning_rate": 4.953104957091439e-06, "loss": 1.1861, "step": 401 }, { "epoch": 0.6494345718901454, "grad_norm": 3.2602258704009297, "learning_rate": 4.952859243413665e-06, "loss": 1.151, "step": 402 }, { "epoch": 0.6510500807754442, "grad_norm": 2.9713253559034674, "learning_rate": 4.952612893816547e-06, "loss": 1.0072, "step": 403 }, { "epoch": 0.6526655896607432, "grad_norm": 2.8762025527001063, "learning_rate": 4.9523659083639555e-06, "loss": 1.0965, "step": 404 }, { "epoch": 0.654281098546042, "grad_norm": 2.860038556322983, "learning_rate": 4.952118287119922e-06, "loss": 1.0738, "step": 405 }, { "epoch": 0.6558966074313409, "grad_norm": 2.8707767150996264, "learning_rate": 4.951870030148642e-06, "loss": 1.0585, "step": 406 }, { "epoch": 0.6575121163166397, "grad_norm": 3.0510810706282188, "learning_rate": 4.95162113751448e-06, "loss": 1.0642, "step": 407 }, { "epoch": 0.6591276252019386, "grad_norm": 3.2232793497851735, "learning_rate": 4.951371609281962e-06, "loss": 1.0755, "step": 408 }, { "epoch": 0.6607431340872375, "grad_norm": 3.2828050195897336, "learning_rate": 4.951121445515781e-06, "loss": 1.049, "step": 409 }, { "epoch": 0.6623586429725363, "grad_norm": 3.238339136640809, "learning_rate": 4.950870646280791e-06, "loss": 1.0648, "step": 410 }, { "epoch": 0.6639741518578353, "grad_norm": 2.7299614661920324, "learning_rate": 4.950619211642016e-06, "loss": 1.0017, "step": 411 }, { "epoch": 0.6655896607431341, "grad_norm": 3.053762075381443, "learning_rate": 4.950367141664641e-06, "loss": 1.068, "step": 412 }, { "epoch": 0.6672051696284329, "grad_norm": 2.77944046745139, "learning_rate": 4.950114436414017e-06, "loss": 0.9584, "step": 413 }, { "epoch": 0.6688206785137318, "grad_norm": 2.9518621288196654, "learning_rate": 4.949861095955657e-06, "loss": 0.9566, "step": 414 }, { "epoch": 0.6704361873990307, "grad_norm": 3.1445616035181243, "learning_rate": 4.949607120355245e-06, "loss": 1.0513, "step": 415 }, { "epoch": 0.6720516962843296, "grad_norm": 3.3970661510459372, "learning_rate": 4.949352509678623e-06, "loss": 1.0876, "step": 416 }, { "epoch": 0.6736672051696284, "grad_norm": 2.7796419946723057, "learning_rate": 4.949097263991802e-06, "loss": 0.9364, "step": 417 }, { "epoch": 0.6752827140549273, "grad_norm": 2.724639189828954, "learning_rate": 4.948841383360955e-06, "loss": 0.9299, "step": 418 }, { "epoch": 0.6768982229402262, "grad_norm": 3.8736415396493245, "learning_rate": 4.9485848678524216e-06, "loss": 0.9976, "step": 419 }, { "epoch": 0.678513731825525, "grad_norm": 3.2902964782860673, "learning_rate": 4.948327717532705e-06, "loss": 1.1121, "step": 420 }, { "epoch": 0.6801292407108239, "grad_norm": 3.0251257831333036, "learning_rate": 4.948069932468473e-06, "loss": 0.9918, "step": 421 }, { "epoch": 0.6817447495961227, "grad_norm": 3.403209393646779, "learning_rate": 4.947811512726556e-06, "loss": 1.0825, "step": 422 }, { "epoch": 0.6833602584814217, "grad_norm": 2.7977961413395285, "learning_rate": 4.947552458373953e-06, "loss": 0.9422, "step": 423 }, { "epoch": 0.6849757673667205, "grad_norm": 3.7641524490773937, "learning_rate": 4.947292769477826e-06, "loss": 1.1867, "step": 424 }, { "epoch": 0.6865912762520194, "grad_norm": 2.9454238014915295, "learning_rate": 4.9470324461055e-06, "loss": 0.9448, "step": 425 }, { "epoch": 0.6882067851373183, "grad_norm": 2.9654943846317456, "learning_rate": 4.946771488324466e-06, "loss": 0.9914, "step": 426 }, { "epoch": 0.6898222940226171, "grad_norm": 3.131330128495054, "learning_rate": 4.946509896202377e-06, "loss": 1.05, "step": 427 }, { "epoch": 0.691437802907916, "grad_norm": 3.108058987766131, "learning_rate": 4.946247669807056e-06, "loss": 1.0534, "step": 428 }, { "epoch": 0.6930533117932148, "grad_norm": 3.241054949555967, "learning_rate": 4.945984809206483e-06, "loss": 1.0344, "step": 429 }, { "epoch": 0.6946688206785138, "grad_norm": 2.882818963369049, "learning_rate": 4.94572131446881e-06, "loss": 0.9245, "step": 430 }, { "epoch": 0.6962843295638126, "grad_norm": 3.4055445912763753, "learning_rate": 4.945457185662347e-06, "loss": 1.1696, "step": 431 }, { "epoch": 0.6978998384491115, "grad_norm": 2.8081083668744395, "learning_rate": 4.945192422855571e-06, "loss": 1.087, "step": 432 }, { "epoch": 0.6995153473344103, "grad_norm": 2.8120301188462182, "learning_rate": 4.9449270261171255e-06, "loss": 0.9597, "step": 433 }, { "epoch": 0.7011308562197092, "grad_norm": 3.3193741714080387, "learning_rate": 4.9446609955158155e-06, "loss": 1.1526, "step": 434 }, { "epoch": 0.7027463651050081, "grad_norm": 3.355865863364781, "learning_rate": 4.94439433112061e-06, "loss": 1.0909, "step": 435 }, { "epoch": 0.7043618739903069, "grad_norm": 3.6094843655074045, "learning_rate": 4.9441270330006445e-06, "loss": 1.1595, "step": 436 }, { "epoch": 0.7059773828756059, "grad_norm": 2.978257095805782, "learning_rate": 4.943859101225217e-06, "loss": 1.0997, "step": 437 }, { "epoch": 0.7075928917609047, "grad_norm": 3.0266288484877504, "learning_rate": 4.943590535863791e-06, "loss": 1.0948, "step": 438 }, { "epoch": 0.7092084006462036, "grad_norm": 3.420517823969551, "learning_rate": 4.943321336985995e-06, "loss": 1.2403, "step": 439 }, { "epoch": 0.7108239095315024, "grad_norm": 2.973554381763161, "learning_rate": 4.943051504661618e-06, "loss": 1.2624, "step": 440 }, { "epoch": 0.7124394184168013, "grad_norm": 3.8229525965779794, "learning_rate": 4.942781038960617e-06, "loss": 1.0952, "step": 441 }, { "epoch": 0.7140549273021002, "grad_norm": 3.042428969605469, "learning_rate": 4.942509939953111e-06, "loss": 1.0213, "step": 442 }, { "epoch": 0.715670436187399, "grad_norm": 3.7683259895449353, "learning_rate": 4.942238207709388e-06, "loss": 1.1535, "step": 443 }, { "epoch": 0.7172859450726979, "grad_norm": 3.379290078961192, "learning_rate": 4.941965842299891e-06, "loss": 1.0581, "step": 444 }, { "epoch": 0.7189014539579968, "grad_norm": 3.200232439807798, "learning_rate": 4.941692843795235e-06, "loss": 1.0929, "step": 445 }, { "epoch": 0.7205169628432956, "grad_norm": 3.230897489045735, "learning_rate": 4.9414192122661975e-06, "loss": 1.0405, "step": 446 }, { "epoch": 0.7221324717285945, "grad_norm": 3.4378741081805515, "learning_rate": 4.941144947783718e-06, "loss": 1.1735, "step": 447 }, { "epoch": 0.7237479806138933, "grad_norm": 2.8218202879785133, "learning_rate": 4.940870050418901e-06, "loss": 0.996, "step": 448 }, { "epoch": 0.7253634894991923, "grad_norm": 3.2434380604940776, "learning_rate": 4.940594520243016e-06, "loss": 1.0388, "step": 449 }, { "epoch": 0.7269789983844911, "grad_norm": 2.866236441343085, "learning_rate": 4.9403183573274955e-06, "loss": 1.1027, "step": 450 }, { "epoch": 0.72859450726979, "grad_norm": 3.3008710353128934, "learning_rate": 4.940041561743937e-06, "loss": 1.0821, "step": 451 }, { "epoch": 0.7302100161550888, "grad_norm": 3.181859983086901, "learning_rate": 4.939764133564101e-06, "loss": 1.1481, "step": 452 }, { "epoch": 0.7318255250403877, "grad_norm": 3.327990368975156, "learning_rate": 4.939486072859914e-06, "loss": 1.1523, "step": 453 }, { "epoch": 0.7334410339256866, "grad_norm": 2.8547188827537124, "learning_rate": 4.939207379703463e-06, "loss": 1.0217, "step": 454 }, { "epoch": 0.7350565428109854, "grad_norm": 3.456136768493984, "learning_rate": 4.938928054167002e-06, "loss": 1.0507, "step": 455 }, { "epoch": 0.7366720516962844, "grad_norm": 3.1217478932655647, "learning_rate": 4.938648096322948e-06, "loss": 1.1116, "step": 456 }, { "epoch": 0.7382875605815832, "grad_norm": 2.9942803351727125, "learning_rate": 4.938367506243881e-06, "loss": 1.0398, "step": 457 }, { "epoch": 0.7399030694668821, "grad_norm": 2.8629077678674895, "learning_rate": 4.938086284002546e-06, "loss": 1.0301, "step": 458 }, { "epoch": 0.7415185783521809, "grad_norm": 2.9815809405871088, "learning_rate": 4.937804429671853e-06, "loss": 1.0766, "step": 459 }, { "epoch": 0.7431340872374798, "grad_norm": 3.0239675208382972, "learning_rate": 4.937521943324873e-06, "loss": 1.0826, "step": 460 }, { "epoch": 0.7447495961227787, "grad_norm": 3.0372448844695086, "learning_rate": 4.937238825034843e-06, "loss": 1.0316, "step": 461 }, { "epoch": 0.7463651050080775, "grad_norm": 3.254979468102765, "learning_rate": 4.9369550748751635e-06, "loss": 1.0632, "step": 462 }, { "epoch": 0.7479806138933764, "grad_norm": 2.6504231866789056, "learning_rate": 4.936670692919397e-06, "loss": 0.9998, "step": 463 }, { "epoch": 0.7495961227786753, "grad_norm": 3.001464525030403, "learning_rate": 4.936385679241273e-06, "loss": 1.0343, "step": 464 }, { "epoch": 0.7512116316639742, "grad_norm": 3.0837271875735794, "learning_rate": 4.936100033914683e-06, "loss": 0.9355, "step": 465 }, { "epoch": 0.752827140549273, "grad_norm": 3.2381745003462585, "learning_rate": 4.935813757013682e-06, "loss": 1.0945, "step": 466 }, { "epoch": 0.7544426494345718, "grad_norm": 2.995606917574073, "learning_rate": 4.935526848612489e-06, "loss": 1.0744, "step": 467 }, { "epoch": 0.7560581583198708, "grad_norm": 2.6444184355901736, "learning_rate": 4.935239308785486e-06, "loss": 1.014, "step": 468 }, { "epoch": 0.7576736672051696, "grad_norm": 3.449795308345059, "learning_rate": 4.934951137607222e-06, "loss": 1.0234, "step": 469 }, { "epoch": 0.7592891760904685, "grad_norm": 3.5847905545140026, "learning_rate": 4.934662335152405e-06, "loss": 1.074, "step": 470 }, { "epoch": 0.7609046849757674, "grad_norm": 2.9980112341743674, "learning_rate": 4.934372901495909e-06, "loss": 1.0157, "step": 471 }, { "epoch": 0.7625201938610663, "grad_norm": 2.942898911005164, "learning_rate": 4.934082836712772e-06, "loss": 1.0266, "step": 472 }, { "epoch": 0.7641357027463651, "grad_norm": 2.746971313557926, "learning_rate": 4.933792140878196e-06, "loss": 0.9742, "step": 473 }, { "epoch": 0.7657512116316639, "grad_norm": 3.0116593736269333, "learning_rate": 4.933500814067544e-06, "loss": 1.0798, "step": 474 }, { "epoch": 0.7673667205169629, "grad_norm": 2.871699256346617, "learning_rate": 4.9332088563563455e-06, "loss": 0.9794, "step": 475 }, { "epoch": 0.7689822294022617, "grad_norm": 3.5047300286009704, "learning_rate": 4.932916267820293e-06, "loss": 1.081, "step": 476 }, { "epoch": 0.7705977382875606, "grad_norm": 3.075360975668953, "learning_rate": 4.932623048535239e-06, "loss": 1.0673, "step": 477 }, { "epoch": 0.7722132471728594, "grad_norm": 3.113492755064327, "learning_rate": 4.932329198577206e-06, "loss": 1.0797, "step": 478 }, { "epoch": 0.7738287560581584, "grad_norm": 2.9978710527466106, "learning_rate": 4.932034718022375e-06, "loss": 0.9904, "step": 479 }, { "epoch": 0.7754442649434572, "grad_norm": 3.0320874126239725, "learning_rate": 4.931739606947092e-06, "loss": 0.9847, "step": 480 }, { "epoch": 0.777059773828756, "grad_norm": 3.2971837085536064, "learning_rate": 4.931443865427865e-06, "loss": 1.0797, "step": 481 }, { "epoch": 0.778675282714055, "grad_norm": 2.9451445539635195, "learning_rate": 4.931147493541369e-06, "loss": 1.0531, "step": 482 }, { "epoch": 0.7802907915993538, "grad_norm": 2.811058704905609, "learning_rate": 4.93085049136444e-06, "loss": 1.0565, "step": 483 }, { "epoch": 0.7819063004846527, "grad_norm": 3.4824860137583795, "learning_rate": 4.930552858974077e-06, "loss": 1.1149, "step": 484 }, { "epoch": 0.7835218093699515, "grad_norm": 2.8785747262116006, "learning_rate": 4.930254596447443e-06, "loss": 0.9582, "step": 485 }, { "epoch": 0.7851373182552503, "grad_norm": 3.803488062317944, "learning_rate": 4.929955703861864e-06, "loss": 1.2802, "step": 486 }, { "epoch": 0.7867528271405493, "grad_norm": 2.9137124869222477, "learning_rate": 4.929656181294832e-06, "loss": 1.0472, "step": 487 }, { "epoch": 0.7883683360258481, "grad_norm": 3.217271322361005, "learning_rate": 4.929356028823998e-06, "loss": 1.002, "step": 488 }, { "epoch": 0.789983844911147, "grad_norm": 3.0267365396658623, "learning_rate": 4.929055246527179e-06, "loss": 1.0714, "step": 489 }, { "epoch": 0.7915993537964459, "grad_norm": 3.224744720021822, "learning_rate": 4.928753834482355e-06, "loss": 1.0715, "step": 490 }, { "epoch": 0.7932148626817448, "grad_norm": 2.8000466379055577, "learning_rate": 4.928451792767669e-06, "loss": 0.9063, "step": 491 }, { "epoch": 0.7948303715670436, "grad_norm": 3.104736969328559, "learning_rate": 4.928149121461427e-06, "loss": 1.0083, "step": 492 }, { "epoch": 0.7964458804523424, "grad_norm": 3.2768125620032347, "learning_rate": 4.927845820642099e-06, "loss": 1.1332, "step": 493 }, { "epoch": 0.7980613893376414, "grad_norm": 3.082290547346106, "learning_rate": 4.927541890388316e-06, "loss": 1.0579, "step": 494 }, { "epoch": 0.7996768982229402, "grad_norm": 3.3128504056578727, "learning_rate": 4.927237330778875e-06, "loss": 1.0261, "step": 495 }, { "epoch": 0.8012924071082391, "grad_norm": 2.987477624886449, "learning_rate": 4.926932141892736e-06, "loss": 0.943, "step": 496 }, { "epoch": 0.802907915993538, "grad_norm": 3.202475508017644, "learning_rate": 4.926626323809021e-06, "loss": 1.1939, "step": 497 }, { "epoch": 0.8045234248788369, "grad_norm": 2.6539600468536646, "learning_rate": 4.926319876607013e-06, "loss": 1.0019, "step": 498 }, { "epoch": 0.8061389337641357, "grad_norm": 2.6860184359258774, "learning_rate": 4.926012800366162e-06, "loss": 0.914, "step": 499 }, { "epoch": 0.8077544426494345, "grad_norm": 2.731654633778715, "learning_rate": 4.92570509516608e-06, "loss": 0.8736, "step": 500 }, { "epoch": 0.8093699515347335, "grad_norm": 2.608630095799193, "learning_rate": 4.92539676108654e-06, "loss": 0.9125, "step": 501 }, { "epoch": 0.8109854604200323, "grad_norm": 3.07398905987087, "learning_rate": 4.92508779820748e-06, "loss": 1.0598, "step": 502 }, { "epoch": 0.8126009693053312, "grad_norm": 2.7549487293781283, "learning_rate": 4.924778206609003e-06, "loss": 1.0572, "step": 503 }, { "epoch": 0.81421647819063, "grad_norm": 3.182789763305118, "learning_rate": 4.9244679863713695e-06, "loss": 1.0259, "step": 504 }, { "epoch": 0.815831987075929, "grad_norm": 3.6467254304844623, "learning_rate": 4.9241571375750075e-06, "loss": 1.0808, "step": 505 }, { "epoch": 0.8174474959612278, "grad_norm": 2.79355714494376, "learning_rate": 4.9238456603005055e-06, "loss": 0.8909, "step": 506 }, { "epoch": 0.8190630048465266, "grad_norm": 3.080312471897344, "learning_rate": 4.923533554628617e-06, "loss": 1.0591, "step": 507 }, { "epoch": 0.8206785137318255, "grad_norm": 2.814780128287342, "learning_rate": 4.923220820640257e-06, "loss": 0.9523, "step": 508 }, { "epoch": 0.8222940226171244, "grad_norm": 3.5458310995102145, "learning_rate": 4.922907458416504e-06, "loss": 1.0918, "step": 509 }, { "epoch": 0.8239095315024233, "grad_norm": 3.745338479756832, "learning_rate": 4.922593468038599e-06, "loss": 1.2414, "step": 510 }, { "epoch": 0.8255250403877221, "grad_norm": 3.2657813958341007, "learning_rate": 4.922278849587946e-06, "loss": 1.0368, "step": 511 }, { "epoch": 0.827140549273021, "grad_norm": 2.9556609451690035, "learning_rate": 4.921963603146111e-06, "loss": 1.0797, "step": 512 }, { "epoch": 0.8287560581583199, "grad_norm": 2.6542258801521483, "learning_rate": 4.921647728794826e-06, "loss": 0.9309, "step": 513 }, { "epoch": 0.8303715670436187, "grad_norm": 2.683386766838752, "learning_rate": 4.92133122661598e-06, "loss": 0.9629, "step": 514 }, { "epoch": 0.8319870759289176, "grad_norm": 3.1932473225790083, "learning_rate": 4.921014096691632e-06, "loss": 1.0736, "step": 515 }, { "epoch": 0.8336025848142165, "grad_norm": 3.2105550686193203, "learning_rate": 4.9206963391039976e-06, "loss": 1.0872, "step": 516 }, { "epoch": 0.8352180936995154, "grad_norm": 3.1456047750269023, "learning_rate": 4.920377953935457e-06, "loss": 1.0534, "step": 517 }, { "epoch": 0.8368336025848142, "grad_norm": 2.8889656935749364, "learning_rate": 4.920058941268555e-06, "loss": 0.9299, "step": 518 }, { "epoch": 0.8384491114701131, "grad_norm": 2.7584650826971457, "learning_rate": 4.919739301185996e-06, "loss": 1.0174, "step": 519 }, { "epoch": 0.840064620355412, "grad_norm": 2.9133160521592245, "learning_rate": 4.9194190337706515e-06, "loss": 1.0427, "step": 520 }, { "epoch": 0.8416801292407108, "grad_norm": 3.0027310986962537, "learning_rate": 4.919098139105551e-06, "loss": 1.0395, "step": 521 }, { "epoch": 0.8432956381260097, "grad_norm": 2.926602293858395, "learning_rate": 4.9187766172738885e-06, "loss": 1.0793, "step": 522 }, { "epoch": 0.8449111470113085, "grad_norm": 3.207098992051195, "learning_rate": 4.91845446835902e-06, "loss": 1.07, "step": 523 }, { "epoch": 0.8465266558966075, "grad_norm": 2.7298191353141945, "learning_rate": 4.918131692444466e-06, "loss": 0.9795, "step": 524 }, { "epoch": 0.8481421647819063, "grad_norm": 2.926845649522626, "learning_rate": 4.917808289613908e-06, "loss": 0.9938, "step": 525 }, { "epoch": 0.8497576736672051, "grad_norm": 3.2422800558407405, "learning_rate": 4.917484259951189e-06, "loss": 1.0012, "step": 526 }, { "epoch": 0.851373182552504, "grad_norm": 3.2342993745641926, "learning_rate": 4.9171596035403175e-06, "loss": 1.0264, "step": 527 }, { "epoch": 0.8529886914378029, "grad_norm": 2.9287577759236765, "learning_rate": 4.916834320465461e-06, "loss": 1.0835, "step": 528 }, { "epoch": 0.8546042003231018, "grad_norm": 3.394589829197802, "learning_rate": 4.916508410810953e-06, "loss": 1.0044, "step": 529 }, { "epoch": 0.8562197092084006, "grad_norm": 3.1567153660453164, "learning_rate": 4.916181874661286e-06, "loss": 1.0396, "step": 530 }, { "epoch": 0.8578352180936996, "grad_norm": 2.9439209078463704, "learning_rate": 4.915854712101118e-06, "loss": 1.1104, "step": 531 }, { "epoch": 0.8594507269789984, "grad_norm": 3.148094694163818, "learning_rate": 4.915526923215267e-06, "loss": 1.1134, "step": 532 }, { "epoch": 0.8610662358642972, "grad_norm": 2.6493954071795613, "learning_rate": 4.915198508088714e-06, "loss": 0.9987, "step": 533 }, { "epoch": 0.8626817447495961, "grad_norm": 3.748206268171067, "learning_rate": 4.914869466806603e-06, "loss": 1.075, "step": 534 }, { "epoch": 0.864297253634895, "grad_norm": 3.6558193771141645, "learning_rate": 4.914539799454242e-06, "loss": 1.0887, "step": 535 }, { "epoch": 0.8659127625201939, "grad_norm": 3.047070871300152, "learning_rate": 4.914209506117096e-06, "loss": 1.0417, "step": 536 }, { "epoch": 0.8675282714054927, "grad_norm": 2.944530875792953, "learning_rate": 4.913878586880798e-06, "loss": 1.0503, "step": 537 }, { "epoch": 0.8691437802907916, "grad_norm": 3.0437158417378924, "learning_rate": 4.913547041831141e-06, "loss": 1.0282, "step": 538 }, { "epoch": 0.8707592891760905, "grad_norm": 3.1387005672759853, "learning_rate": 4.913214871054078e-06, "loss": 0.9655, "step": 539 }, { "epoch": 0.8723747980613893, "grad_norm": 3.244316072231346, "learning_rate": 4.9128820746357296e-06, "loss": 1.0257, "step": 540 }, { "epoch": 0.8739903069466882, "grad_norm": 3.867164396104361, "learning_rate": 4.912548652662372e-06, "loss": 1.0693, "step": 541 }, { "epoch": 0.875605815831987, "grad_norm": 2.946649457785812, "learning_rate": 4.91221460522045e-06, "loss": 1.0386, "step": 542 }, { "epoch": 0.877221324717286, "grad_norm": 2.6137978061206684, "learning_rate": 4.911879932396565e-06, "loss": 0.849, "step": 543 }, { "epoch": 0.8788368336025848, "grad_norm": 3.110662433158093, "learning_rate": 4.911544634277484e-06, "loss": 0.9821, "step": 544 }, { "epoch": 0.8804523424878837, "grad_norm": 2.765939908417997, "learning_rate": 4.9112087109501365e-06, "loss": 0.9915, "step": 545 }, { "epoch": 0.8820678513731826, "grad_norm": 2.885245668641579, "learning_rate": 4.910872162501611e-06, "loss": 0.9864, "step": 546 }, { "epoch": 0.8836833602584814, "grad_norm": 3.048468783687742, "learning_rate": 4.91053498901916e-06, "loss": 0.9507, "step": 547 }, { "epoch": 0.8852988691437803, "grad_norm": 2.8430332362487203, "learning_rate": 4.910197190590198e-06, "loss": 0.9714, "step": 548 }, { "epoch": 0.8869143780290791, "grad_norm": 4.119491144032854, "learning_rate": 4.909858767302302e-06, "loss": 1.0991, "step": 549 }, { "epoch": 0.8885298869143781, "grad_norm": 3.6556017844797495, "learning_rate": 4.9095197192432105e-06, "loss": 1.0708, "step": 550 }, { "epoch": 0.8901453957996769, "grad_norm": 2.991847744987747, "learning_rate": 4.909180046500824e-06, "loss": 1.0332, "step": 551 }, { "epoch": 0.8917609046849758, "grad_norm": 2.8714239853084367, "learning_rate": 4.9088397491632025e-06, "loss": 0.9849, "step": 552 }, { "epoch": 0.8933764135702746, "grad_norm": 3.0400165343533114, "learning_rate": 4.908498827318573e-06, "loss": 1.0222, "step": 553 }, { "epoch": 0.8949919224555735, "grad_norm": 2.6444170541865755, "learning_rate": 4.90815728105532e-06, "loss": 1.0194, "step": 554 }, { "epoch": 0.8966074313408724, "grad_norm": 2.7122074454243714, "learning_rate": 4.907815110461993e-06, "loss": 1.0216, "step": 555 }, { "epoch": 0.8982229402261712, "grad_norm": 2.839645842756569, "learning_rate": 4.9074723156273e-06, "loss": 1.0766, "step": 556 }, { "epoch": 0.8998384491114702, "grad_norm": 3.048667982248548, "learning_rate": 4.907128896640115e-06, "loss": 1.1004, "step": 557 }, { "epoch": 0.901453957996769, "grad_norm": 3.4098017617125906, "learning_rate": 4.90678485358947e-06, "loss": 1.0052, "step": 558 }, { "epoch": 0.9030694668820679, "grad_norm": 3.238877432019058, "learning_rate": 4.90644018656456e-06, "loss": 1.0002, "step": 559 }, { "epoch": 0.9046849757673667, "grad_norm": 3.047680265442759, "learning_rate": 4.906094895654744e-06, "loss": 0.9395, "step": 560 }, { "epoch": 0.9063004846526656, "grad_norm": 2.466819806553536, "learning_rate": 4.905748980949538e-06, "loss": 0.8651, "step": 561 }, { "epoch": 0.9079159935379645, "grad_norm": 3.008302475548862, "learning_rate": 4.9054024425386255e-06, "loss": 1.1216, "step": 562 }, { "epoch": 0.9095315024232633, "grad_norm": 3.1325654685920417, "learning_rate": 4.905055280511848e-06, "loss": 1.0828, "step": 563 }, { "epoch": 0.9111470113085622, "grad_norm": 2.7726569826334417, "learning_rate": 4.904707494959208e-06, "loss": 0.956, "step": 564 }, { "epoch": 0.9127625201938611, "grad_norm": 2.9852009809572357, "learning_rate": 4.904359085970872e-06, "loss": 1.0424, "step": 565 }, { "epoch": 0.9143780290791599, "grad_norm": 2.762007358836134, "learning_rate": 4.904010053637169e-06, "loss": 1.0087, "step": 566 }, { "epoch": 0.9159935379644588, "grad_norm": 2.9704523131087575, "learning_rate": 4.903660398048584e-06, "loss": 0.9333, "step": 567 }, { "epoch": 0.9176090468497576, "grad_norm": 2.4812342533818246, "learning_rate": 4.903310119295771e-06, "loss": 0.7973, "step": 568 }, { "epoch": 0.9192245557350566, "grad_norm": 3.029805186050157, "learning_rate": 4.902959217469541e-06, "loss": 1.0701, "step": 569 }, { "epoch": 0.9208400646203554, "grad_norm": 2.896540452587927, "learning_rate": 4.902607692660866e-06, "loss": 0.8996, "step": 570 }, { "epoch": 0.9224555735056543, "grad_norm": 2.734908991420008, "learning_rate": 4.902255544960882e-06, "loss": 1.0005, "step": 571 }, { "epoch": 0.9240710823909531, "grad_norm": 3.526896420094468, "learning_rate": 4.901902774460887e-06, "loss": 0.9629, "step": 572 }, { "epoch": 0.925686591276252, "grad_norm": 2.691054600506663, "learning_rate": 4.901549381252337e-06, "loss": 0.9387, "step": 573 }, { "epoch": 0.9273021001615509, "grad_norm": 2.9608358888822788, "learning_rate": 4.901195365426853e-06, "loss": 0.9976, "step": 574 }, { "epoch": 0.9289176090468497, "grad_norm": 3.419096356613053, "learning_rate": 4.900840727076214e-06, "loss": 1.0028, "step": 575 }, { "epoch": 0.9305331179321487, "grad_norm": 2.925022110653371, "learning_rate": 4.900485466292363e-06, "loss": 1.0758, "step": 576 }, { "epoch": 0.9321486268174475, "grad_norm": 3.048334804615051, "learning_rate": 4.900129583167404e-06, "loss": 1.0161, "step": 577 }, { "epoch": 0.9337641357027464, "grad_norm": 3.153094608874741, "learning_rate": 4.899773077793603e-06, "loss": 1.0399, "step": 578 }, { "epoch": 0.9353796445880452, "grad_norm": 2.8639524977018396, "learning_rate": 4.8994159502633846e-06, "loss": 1.0493, "step": 579 }, { "epoch": 0.9369951534733441, "grad_norm": 2.7959866779084854, "learning_rate": 4.899058200669337e-06, "loss": 1.0017, "step": 580 }, { "epoch": 0.938610662358643, "grad_norm": 2.8440493157833693, "learning_rate": 4.898699829104208e-06, "loss": 1.0293, "step": 581 }, { "epoch": 0.9402261712439418, "grad_norm": 3.746373891486742, "learning_rate": 4.8983408356609095e-06, "loss": 0.9004, "step": 582 }, { "epoch": 0.9418416801292407, "grad_norm": 3.0045005953981585, "learning_rate": 4.8979812204325115e-06, "loss": 1.1114, "step": 583 }, { "epoch": 0.9434571890145396, "grad_norm": 3.231960815517469, "learning_rate": 4.897620983512248e-06, "loss": 1.0358, "step": 584 }, { "epoch": 0.9450726978998385, "grad_norm": 3.1104419951424145, "learning_rate": 4.897260124993511e-06, "loss": 1.0851, "step": 585 }, { "epoch": 0.9466882067851373, "grad_norm": 3.1636750412043475, "learning_rate": 4.8968986449698555e-06, "loss": 1.0109, "step": 586 }, { "epoch": 0.9483037156704361, "grad_norm": 2.933828294491268, "learning_rate": 4.896536543534999e-06, "loss": 0.9461, "step": 587 }, { "epoch": 0.9499192245557351, "grad_norm": 2.9116333410969184, "learning_rate": 4.896173820782818e-06, "loss": 1.0259, "step": 588 }, { "epoch": 0.9515347334410339, "grad_norm": 2.932009649098289, "learning_rate": 4.8958104768073495e-06, "loss": 0.9743, "step": 589 }, { "epoch": 0.9531502423263328, "grad_norm": 2.396105032429587, "learning_rate": 4.895446511702794e-06, "loss": 0.9448, "step": 590 }, { "epoch": 0.9547657512116317, "grad_norm": 2.9822643180731876, "learning_rate": 4.895081925563511e-06, "loss": 1.0013, "step": 591 }, { "epoch": 0.9563812600969306, "grad_norm": 3.383819957905949, "learning_rate": 4.894716718484022e-06, "loss": 0.9562, "step": 592 }, { "epoch": 0.9579967689822294, "grad_norm": 2.71303524764786, "learning_rate": 4.894350890559011e-06, "loss": 0.897, "step": 593 }, { "epoch": 0.9596122778675282, "grad_norm": 3.218985444152019, "learning_rate": 4.893984441883318e-06, "loss": 0.9969, "step": 594 }, { "epoch": 0.9612277867528272, "grad_norm": 3.4307252833388047, "learning_rate": 4.8936173725519495e-06, "loss": 0.999, "step": 595 }, { "epoch": 0.962843295638126, "grad_norm": 3.3188221267184734, "learning_rate": 4.8932496826600695e-06, "loss": 1.0874, "step": 596 }, { "epoch": 0.9644588045234249, "grad_norm": 3.3783515659806316, "learning_rate": 4.892881372303005e-06, "loss": 0.9847, "step": 597 }, { "epoch": 0.9660743134087237, "grad_norm": 3.136470193942187, "learning_rate": 4.892512441576241e-06, "loss": 1.0537, "step": 598 }, { "epoch": 0.9676898222940227, "grad_norm": 3.0754830975415883, "learning_rate": 4.8921428905754256e-06, "loss": 1.0235, "step": 599 }, { "epoch": 0.9693053311793215, "grad_norm": 2.8963835882346296, "learning_rate": 4.891772719396369e-06, "loss": 0.8871, "step": 600 }, { "epoch": 0.9693053311793215, "eval_loss": 1.0480477809906006, "eval_runtime": 2.3477, "eval_samples_per_second": 127.787, "eval_steps_per_second": 2.982, "step": 600 }, { "epoch": 0.9709208400646203, "grad_norm": 3.1439477244069254, "learning_rate": 4.891401928135039e-06, "loss": 1.0419, "step": 601 }, { "epoch": 0.9725363489499192, "grad_norm": 3.082906988237605, "learning_rate": 4.8910305168875664e-06, "loss": 1.0659, "step": 602 }, { "epoch": 0.9741518578352181, "grad_norm": 2.820904754758825, "learning_rate": 4.8906584857502405e-06, "loss": 0.9421, "step": 603 }, { "epoch": 0.975767366720517, "grad_norm": 3.0788813565464253, "learning_rate": 4.890285834819513e-06, "loss": 0.9271, "step": 604 }, { "epoch": 0.9773828756058158, "grad_norm": 2.6297273652655755, "learning_rate": 4.8899125641919985e-06, "loss": 0.9335, "step": 605 }, { "epoch": 0.9789983844911146, "grad_norm": 2.7734009517362, "learning_rate": 4.889538673964468e-06, "loss": 0.9839, "step": 606 }, { "epoch": 0.9806138933764136, "grad_norm": 2.947667720403827, "learning_rate": 4.889164164233853e-06, "loss": 1.0574, "step": 607 }, { "epoch": 0.9822294022617124, "grad_norm": 3.1580810562919126, "learning_rate": 4.88878903509725e-06, "loss": 1.0752, "step": 608 }, { "epoch": 0.9838449111470113, "grad_norm": 3.1336572093382613, "learning_rate": 4.888413286651914e-06, "loss": 1.0364, "step": 609 }, { "epoch": 0.9854604200323102, "grad_norm": 3.4559308400420945, "learning_rate": 4.888036918995258e-06, "loss": 0.973, "step": 610 }, { "epoch": 0.9870759289176091, "grad_norm": 2.819098781524212, "learning_rate": 4.887659932224859e-06, "loss": 0.9114, "step": 611 }, { "epoch": 0.9886914378029079, "grad_norm": 3.240365036006373, "learning_rate": 4.887282326438453e-06, "loss": 1.1139, "step": 612 }, { "epoch": 0.9903069466882067, "grad_norm": 3.1203929319049797, "learning_rate": 4.886904101733937e-06, "loss": 0.9361, "step": 613 }, { "epoch": 0.9919224555735057, "grad_norm": 3.0607051359324244, "learning_rate": 4.886525258209368e-06, "loss": 1.074, "step": 614 }, { "epoch": 0.9935379644588045, "grad_norm": 3.41782050769325, "learning_rate": 4.8861457959629645e-06, "loss": 1.1422, "step": 615 }, { "epoch": 0.9951534733441034, "grad_norm": 2.905477457427711, "learning_rate": 4.885765715093102e-06, "loss": 0.96, "step": 616 }, { "epoch": 0.9967689822294022, "grad_norm": 2.911564647499011, "learning_rate": 4.8853850156983204e-06, "loss": 1.0043, "step": 617 }, { "epoch": 0.9983844911147012, "grad_norm": 3.428974378849924, "learning_rate": 4.8850036978773194e-06, "loss": 1.0315, "step": 618 }, { "epoch": 1.0, "grad_norm": 3.183197856948851, "learning_rate": 4.884621761728957e-06, "loss": 0.9605, "step": 619 }, { "epoch": 1.001615508885299, "grad_norm": 2.993887922972881, "learning_rate": 4.884239207352252e-06, "loss": 0.752, "step": 620 }, { "epoch": 1.0032310177705976, "grad_norm": 3.1694443355823982, "learning_rate": 4.8838560348463845e-06, "loss": 0.7868, "step": 621 }, { "epoch": 1.0048465266558966, "grad_norm": 2.8962141706925317, "learning_rate": 4.8834722443106955e-06, "loss": 0.7788, "step": 622 }, { "epoch": 1.0064620355411955, "grad_norm": 3.1479718127151273, "learning_rate": 4.883087835844684e-06, "loss": 0.8132, "step": 623 }, { "epoch": 1.0080775444264944, "grad_norm": 3.2482495694774856, "learning_rate": 4.88270280954801e-06, "loss": 0.6739, "step": 624 }, { "epoch": 1.0096930533117932, "grad_norm": 3.225295849720571, "learning_rate": 4.882317165520496e-06, "loss": 0.7403, "step": 625 }, { "epoch": 1.011308562197092, "grad_norm": 3.5233470333872825, "learning_rate": 4.88193090386212e-06, "loss": 0.7472, "step": 626 }, { "epoch": 1.012924071082391, "grad_norm": 2.936391135051867, "learning_rate": 4.881544024673025e-06, "loss": 0.7197, "step": 627 }, { "epoch": 1.0145395799676897, "grad_norm": 3.0326080315768587, "learning_rate": 4.88115652805351e-06, "loss": 0.6638, "step": 628 }, { "epoch": 1.0161550888529887, "grad_norm": 2.826242957866351, "learning_rate": 4.8807684141040376e-06, "loss": 0.7617, "step": 629 }, { "epoch": 1.0177705977382876, "grad_norm": 2.8992010283144767, "learning_rate": 4.880379682925228e-06, "loss": 0.6541, "step": 630 }, { "epoch": 1.0193861066235865, "grad_norm": 3.1293635900097954, "learning_rate": 4.879990334617862e-06, "loss": 0.7395, "step": 631 }, { "epoch": 1.0210016155088852, "grad_norm": 2.989895765096856, "learning_rate": 4.879600369282882e-06, "loss": 0.7348, "step": 632 }, { "epoch": 1.0226171243941842, "grad_norm": 3.4413972119837677, "learning_rate": 4.8792097870213866e-06, "loss": 0.732, "step": 633 }, { "epoch": 1.024232633279483, "grad_norm": 3.0974712695108235, "learning_rate": 4.8788185879346386e-06, "loss": 0.6916, "step": 634 }, { "epoch": 1.0258481421647818, "grad_norm": 2.972718310010981, "learning_rate": 4.878426772124059e-06, "loss": 0.5986, "step": 635 }, { "epoch": 1.0274636510500808, "grad_norm": 3.4336065466984143, "learning_rate": 4.878034339691227e-06, "loss": 0.8056, "step": 636 }, { "epoch": 1.0290791599353797, "grad_norm": 2.868008302979513, "learning_rate": 4.8776412907378845e-06, "loss": 0.6776, "step": 637 }, { "epoch": 1.0306946688206786, "grad_norm": 3.315302065872779, "learning_rate": 4.877247625365931e-06, "loss": 0.7361, "step": 638 }, { "epoch": 1.0323101777059773, "grad_norm": 2.9444902570110774, "learning_rate": 4.876853343677428e-06, "loss": 0.6303, "step": 639 }, { "epoch": 1.0339256865912763, "grad_norm": 3.3623475627366557, "learning_rate": 4.8764584457745944e-06, "loss": 0.7124, "step": 640 }, { "epoch": 1.0355411954765752, "grad_norm": 3.251625328456299, "learning_rate": 4.876062931759811e-06, "loss": 0.7006, "step": 641 }, { "epoch": 1.037156704361874, "grad_norm": 3.8977705484816636, "learning_rate": 4.875666801735616e-06, "loss": 0.8286, "step": 642 }, { "epoch": 1.0387722132471728, "grad_norm": 3.3154025439279, "learning_rate": 4.875270055804709e-06, "loss": 0.7121, "step": 643 }, { "epoch": 1.0403877221324718, "grad_norm": 3.0291161912205062, "learning_rate": 4.8748726940699505e-06, "loss": 0.701, "step": 644 }, { "epoch": 1.0420032310177707, "grad_norm": 2.915271965024517, "learning_rate": 4.874474716634357e-06, "loss": 0.6754, "step": 645 }, { "epoch": 1.0436187399030694, "grad_norm": 2.873596791107623, "learning_rate": 4.8740761236011075e-06, "loss": 0.6377, "step": 646 }, { "epoch": 1.0452342487883683, "grad_norm": 2.969213983538028, "learning_rate": 4.87367691507354e-06, "loss": 0.7744, "step": 647 }, { "epoch": 1.0468497576736673, "grad_norm": 2.8178801893477425, "learning_rate": 4.8732770911551515e-06, "loss": 0.679, "step": 648 }, { "epoch": 1.048465266558966, "grad_norm": 3.22865507899258, "learning_rate": 4.872876651949598e-06, "loss": 0.6882, "step": 649 }, { "epoch": 1.050080775444265, "grad_norm": 2.9043579424392933, "learning_rate": 4.872475597560698e-06, "loss": 0.6447, "step": 650 }, { "epoch": 1.0516962843295639, "grad_norm": 3.1296703415330005, "learning_rate": 4.872073928092427e-06, "loss": 0.7208, "step": 651 }, { "epoch": 1.0533117932148626, "grad_norm": 3.2118223610300065, "learning_rate": 4.871671643648919e-06, "loss": 0.8064, "step": 652 }, { "epoch": 1.0549273021001615, "grad_norm": 2.680455476421138, "learning_rate": 4.871268744334468e-06, "loss": 0.649, "step": 653 }, { "epoch": 1.0565428109854604, "grad_norm": 3.254593806174601, "learning_rate": 4.870865230253532e-06, "loss": 0.8303, "step": 654 }, { "epoch": 1.0581583198707594, "grad_norm": 3.4755870654853416, "learning_rate": 4.8704611015107215e-06, "loss": 0.7865, "step": 655 }, { "epoch": 1.059773828756058, "grad_norm": 2.7990594277331207, "learning_rate": 4.870056358210809e-06, "loss": 0.6313, "step": 656 }, { "epoch": 1.061389337641357, "grad_norm": 2.75201427784568, "learning_rate": 4.86965100045873e-06, "loss": 0.6445, "step": 657 }, { "epoch": 1.063004846526656, "grad_norm": 3.1560461570113425, "learning_rate": 4.8692450283595724e-06, "loss": 0.6615, "step": 658 }, { "epoch": 1.0646203554119547, "grad_norm": 3.3727236238005722, "learning_rate": 4.86883844201859e-06, "loss": 0.7703, "step": 659 }, { "epoch": 1.0662358642972536, "grad_norm": 3.086529451218489, "learning_rate": 4.86843124154119e-06, "loss": 0.7138, "step": 660 }, { "epoch": 1.0678513731825525, "grad_norm": 2.8062917031738763, "learning_rate": 4.868023427032944e-06, "loss": 0.6773, "step": 661 }, { "epoch": 1.0694668820678515, "grad_norm": 3.0697800657536694, "learning_rate": 4.86761499859958e-06, "loss": 0.7084, "step": 662 }, { "epoch": 1.0710823909531502, "grad_norm": 2.9659137306930803, "learning_rate": 4.867205956346985e-06, "loss": 0.6468, "step": 663 }, { "epoch": 1.072697899838449, "grad_norm": 3.1225784321592402, "learning_rate": 4.866796300381207e-06, "loss": 0.6808, "step": 664 }, { "epoch": 1.074313408723748, "grad_norm": 3.020644215603099, "learning_rate": 4.866386030808451e-06, "loss": 0.7381, "step": 665 }, { "epoch": 1.0759289176090467, "grad_norm": 2.812535819605783, "learning_rate": 4.865975147735082e-06, "loss": 0.6378, "step": 666 }, { "epoch": 1.0775444264943457, "grad_norm": 3.125064218489593, "learning_rate": 4.865563651267624e-06, "loss": 0.7482, "step": 667 }, { "epoch": 1.0791599353796446, "grad_norm": 2.9917763932990074, "learning_rate": 4.865151541512761e-06, "loss": 0.6864, "step": 668 }, { "epoch": 1.0807754442649435, "grad_norm": 3.568926931666473, "learning_rate": 4.864738818577334e-06, "loss": 0.6893, "step": 669 }, { "epoch": 1.0823909531502423, "grad_norm": 3.5138259795346594, "learning_rate": 4.864325482568344e-06, "loss": 0.7479, "step": 670 }, { "epoch": 1.0840064620355412, "grad_norm": 3.5027773609067907, "learning_rate": 4.863911533592951e-06, "loss": 0.7197, "step": 671 }, { "epoch": 1.0856219709208401, "grad_norm": 2.9150821505861355, "learning_rate": 4.863496971758475e-06, "loss": 0.6086, "step": 672 }, { "epoch": 1.0872374798061388, "grad_norm": 3.4744251475516568, "learning_rate": 4.863081797172393e-06, "loss": 0.6981, "step": 673 }, { "epoch": 1.0888529886914378, "grad_norm": 2.687432207750185, "learning_rate": 4.862666009942342e-06, "loss": 0.6508, "step": 674 }, { "epoch": 1.0904684975767367, "grad_norm": 3.1174438371754, "learning_rate": 4.862249610176117e-06, "loss": 0.7156, "step": 675 }, { "epoch": 1.0920840064620356, "grad_norm": 3.498542607720077, "learning_rate": 4.861832597981672e-06, "loss": 0.7679, "step": 676 }, { "epoch": 1.0936995153473343, "grad_norm": 3.4826293873519925, "learning_rate": 4.8614149734671215e-06, "loss": 0.7167, "step": 677 }, { "epoch": 1.0953150242326333, "grad_norm": 3.5590834616328126, "learning_rate": 4.860996736740736e-06, "loss": 0.6868, "step": 678 }, { "epoch": 1.0969305331179322, "grad_norm": 3.5863445135116807, "learning_rate": 4.860577887910946e-06, "loss": 0.8169, "step": 679 }, { "epoch": 1.098546042003231, "grad_norm": 2.8325631403394693, "learning_rate": 4.860158427086341e-06, "loss": 0.6919, "step": 680 }, { "epoch": 1.1001615508885298, "grad_norm": 2.846108305200719, "learning_rate": 4.8597383543756695e-06, "loss": 0.6706, "step": 681 }, { "epoch": 1.1017770597738288, "grad_norm": 3.434747809544567, "learning_rate": 4.859317669887838e-06, "loss": 0.7716, "step": 682 }, { "epoch": 1.1033925686591277, "grad_norm": 2.7216986849908373, "learning_rate": 4.858896373731911e-06, "loss": 0.6787, "step": 683 }, { "epoch": 1.1050080775444264, "grad_norm": 3.4104861638501047, "learning_rate": 4.858474466017111e-06, "loss": 0.618, "step": 684 }, { "epoch": 1.1066235864297254, "grad_norm": 2.9062544588155816, "learning_rate": 4.858051946852823e-06, "loss": 0.614, "step": 685 }, { "epoch": 1.1082390953150243, "grad_norm": 3.3876881086855213, "learning_rate": 4.857628816348586e-06, "loss": 0.8644, "step": 686 }, { "epoch": 1.109854604200323, "grad_norm": 3.1693517309104267, "learning_rate": 4.8572050746141e-06, "loss": 0.6061, "step": 687 }, { "epoch": 1.111470113085622, "grad_norm": 2.720838776604994, "learning_rate": 4.856780721759221e-06, "loss": 0.6614, "step": 688 }, { "epoch": 1.1130856219709209, "grad_norm": 3.0923540419092297, "learning_rate": 4.856355757893968e-06, "loss": 0.6563, "step": 689 }, { "epoch": 1.1147011308562198, "grad_norm": 3.565443447854608, "learning_rate": 4.855930183128513e-06, "loss": 0.7576, "step": 690 }, { "epoch": 1.1163166397415185, "grad_norm": 3.0321075597543268, "learning_rate": 4.855503997573191e-06, "loss": 0.723, "step": 691 }, { "epoch": 1.1179321486268174, "grad_norm": 3.228306634099831, "learning_rate": 4.855077201338493e-06, "loss": 0.7287, "step": 692 }, { "epoch": 1.1195476575121164, "grad_norm": 3.448644336365073, "learning_rate": 4.8546497945350665e-06, "loss": 0.8125, "step": 693 }, { "epoch": 1.121163166397415, "grad_norm": 2.7752336058021685, "learning_rate": 4.854221777273722e-06, "loss": 0.6415, "step": 694 }, { "epoch": 1.122778675282714, "grad_norm": 3.156932251333771, "learning_rate": 4.8537931496654245e-06, "loss": 0.7861, "step": 695 }, { "epoch": 1.124394184168013, "grad_norm": 3.009285203841208, "learning_rate": 4.8533639118212984e-06, "loss": 0.7155, "step": 696 }, { "epoch": 1.1260096930533119, "grad_norm": 4.247830163902359, "learning_rate": 4.852934063852627e-06, "loss": 0.7835, "step": 697 }, { "epoch": 1.1276252019386106, "grad_norm": 3.1774815063182835, "learning_rate": 4.852503605870851e-06, "loss": 0.7003, "step": 698 }, { "epoch": 1.1292407108239095, "grad_norm": 2.5682781169811704, "learning_rate": 4.852072537987568e-06, "loss": 0.678, "step": 699 }, { "epoch": 1.1308562197092085, "grad_norm": 3.2779304822781685, "learning_rate": 4.851640860314536e-06, "loss": 0.7609, "step": 700 }, { "epoch": 1.1324717285945072, "grad_norm": 3.004748409480101, "learning_rate": 4.851208572963671e-06, "loss": 0.7385, "step": 701 }, { "epoch": 1.134087237479806, "grad_norm": 3.5983601216095105, "learning_rate": 4.8507756760470455e-06, "loss": 0.749, "step": 702 }, { "epoch": 1.135702746365105, "grad_norm": 3.2383583150629414, "learning_rate": 4.85034216967689e-06, "loss": 0.7509, "step": 703 }, { "epoch": 1.137318255250404, "grad_norm": 2.6028624554395203, "learning_rate": 4.849908053965597e-06, "loss": 0.6288, "step": 704 }, { "epoch": 1.1389337641357027, "grad_norm": 3.1013991033630375, "learning_rate": 4.849473329025708e-06, "loss": 0.6381, "step": 705 }, { "epoch": 1.1405492730210016, "grad_norm": 3.0800446715523866, "learning_rate": 4.849037994969934e-06, "loss": 0.6946, "step": 706 }, { "epoch": 1.1421647819063006, "grad_norm": 3.040669597024604, "learning_rate": 4.848602051911134e-06, "loss": 0.7098, "step": 707 }, { "epoch": 1.1437802907915993, "grad_norm": 3.189506153395751, "learning_rate": 4.848165499962333e-06, "loss": 0.7103, "step": 708 }, { "epoch": 1.1453957996768982, "grad_norm": 3.2962233257018183, "learning_rate": 4.847728339236706e-06, "loss": 0.7204, "step": 709 }, { "epoch": 1.1470113085621971, "grad_norm": 2.9165630950672785, "learning_rate": 4.8472905698475904e-06, "loss": 0.6598, "step": 710 }, { "epoch": 1.148626817447496, "grad_norm": 3.063220062831063, "learning_rate": 4.8468521919084835e-06, "loss": 0.7449, "step": 711 }, { "epoch": 1.1502423263327948, "grad_norm": 3.3553647378937224, "learning_rate": 4.846413205533034e-06, "loss": 0.7594, "step": 712 }, { "epoch": 1.1518578352180937, "grad_norm": 3.335653221957337, "learning_rate": 4.845973610835054e-06, "loss": 0.6369, "step": 713 }, { "epoch": 1.1534733441033926, "grad_norm": 3.3742014705895076, "learning_rate": 4.845533407928511e-06, "loss": 0.6845, "step": 714 }, { "epoch": 1.1550888529886914, "grad_norm": 3.230984374740863, "learning_rate": 4.84509259692753e-06, "loss": 0.7147, "step": 715 }, { "epoch": 1.1567043618739903, "grad_norm": 2.8097504492528453, "learning_rate": 4.844651177946395e-06, "loss": 0.6053, "step": 716 }, { "epoch": 1.1583198707592892, "grad_norm": 3.3237433580245828, "learning_rate": 4.844209151099544e-06, "loss": 0.7236, "step": 717 }, { "epoch": 1.1599353796445881, "grad_norm": 2.9918129126809268, "learning_rate": 4.843766516501578e-06, "loss": 0.7298, "step": 718 }, { "epoch": 1.1615508885298869, "grad_norm": 3.1360623150632985, "learning_rate": 4.843323274267253e-06, "loss": 0.7169, "step": 719 }, { "epoch": 1.1631663974151858, "grad_norm": 3.850892408033537, "learning_rate": 4.8428794245114806e-06, "loss": 0.7814, "step": 720 }, { "epoch": 1.1647819063004847, "grad_norm": 2.8861310057268903, "learning_rate": 4.842434967349332e-06, "loss": 0.6193, "step": 721 }, { "epoch": 1.1663974151857834, "grad_norm": 3.035399066159624, "learning_rate": 4.841989902896036e-06, "loss": 0.6573, "step": 722 }, { "epoch": 1.1680129240710824, "grad_norm": 3.214759861083825, "learning_rate": 4.841544231266978e-06, "loss": 0.7547, "step": 723 }, { "epoch": 1.1696284329563813, "grad_norm": 2.986270272425936, "learning_rate": 4.841097952577702e-06, "loss": 0.5718, "step": 724 }, { "epoch": 1.1712439418416802, "grad_norm": 3.4854057059249066, "learning_rate": 4.840651066943909e-06, "loss": 0.7725, "step": 725 }, { "epoch": 1.172859450726979, "grad_norm": 4.246464849350729, "learning_rate": 4.8402035744814555e-06, "loss": 0.5649, "step": 726 }, { "epoch": 1.1744749596122779, "grad_norm": 2.786662535412773, "learning_rate": 4.839755475306357e-06, "loss": 0.6012, "step": 727 }, { "epoch": 1.1760904684975768, "grad_norm": 3.324738673171018, "learning_rate": 4.839306769534786e-06, "loss": 0.6355, "step": 728 }, { "epoch": 1.1777059773828755, "grad_norm": 2.99310330032351, "learning_rate": 4.838857457283073e-06, "loss": 0.5695, "step": 729 }, { "epoch": 1.1793214862681745, "grad_norm": 3.2781699103469473, "learning_rate": 4.838407538667705e-06, "loss": 0.6393, "step": 730 }, { "epoch": 1.1809369951534734, "grad_norm": 3.3980085667213533, "learning_rate": 4.837957013805326e-06, "loss": 0.7399, "step": 731 }, { "epoch": 1.1825525040387723, "grad_norm": 3.1699729979780082, "learning_rate": 4.837505882812737e-06, "loss": 0.6352, "step": 732 }, { "epoch": 1.184168012924071, "grad_norm": 3.313854841536698, "learning_rate": 4.837054145806898e-06, "loss": 0.7608, "step": 733 }, { "epoch": 1.18578352180937, "grad_norm": 3.5054210420210588, "learning_rate": 4.8366018029049225e-06, "loss": 0.7448, "step": 734 }, { "epoch": 1.187399030694669, "grad_norm": 3.1991633814619957, "learning_rate": 4.836148854224086e-06, "loss": 0.7497, "step": 735 }, { "epoch": 1.1890145395799676, "grad_norm": 3.3927564471643157, "learning_rate": 4.835695299881815e-06, "loss": 0.7695, "step": 736 }, { "epoch": 1.1906300484652665, "grad_norm": 3.091160879003833, "learning_rate": 4.835241139995699e-06, "loss": 0.7348, "step": 737 }, { "epoch": 1.1922455573505655, "grad_norm": 2.8369995487374244, "learning_rate": 4.834786374683481e-06, "loss": 0.7848, "step": 738 }, { "epoch": 1.1938610662358644, "grad_norm": 3.500641605280263, "learning_rate": 4.834331004063062e-06, "loss": 0.7957, "step": 739 }, { "epoch": 1.1954765751211631, "grad_norm": 3.1717618396824463, "learning_rate": 4.8338750282525e-06, "loss": 0.7043, "step": 740 }, { "epoch": 1.197092084006462, "grad_norm": 3.7901192904986, "learning_rate": 4.8334184473700076e-06, "loss": 0.7784, "step": 741 }, { "epoch": 1.198707592891761, "grad_norm": 3.221237478905773, "learning_rate": 4.832961261533959e-06, "loss": 0.82, "step": 742 }, { "epoch": 1.2003231017770597, "grad_norm": 3.3466882275792513, "learning_rate": 4.83250347086288e-06, "loss": 0.757, "step": 743 }, { "epoch": 1.2019386106623586, "grad_norm": 3.0017567411887502, "learning_rate": 4.832045075475459e-06, "loss": 0.591, "step": 744 }, { "epoch": 1.2035541195476576, "grad_norm": 3.1083095049035494, "learning_rate": 4.8315860754905354e-06, "loss": 0.7524, "step": 745 }, { "epoch": 1.2051696284329565, "grad_norm": 3.905391188019516, "learning_rate": 4.831126471027108e-06, "loss": 0.7853, "step": 746 }, { "epoch": 1.2067851373182552, "grad_norm": 2.976526655492793, "learning_rate": 4.830666262204332e-06, "loss": 0.7066, "step": 747 }, { "epoch": 1.2084006462035541, "grad_norm": 3.5106410888193333, "learning_rate": 4.830205449141522e-06, "loss": 0.6689, "step": 748 }, { "epoch": 1.210016155088853, "grad_norm": 2.9868657790781676, "learning_rate": 4.829744031958143e-06, "loss": 0.7768, "step": 749 }, { "epoch": 1.2116316639741518, "grad_norm": 3.3115772327184865, "learning_rate": 4.829282010773823e-06, "loss": 0.674, "step": 750 }, { "epoch": 1.2132471728594507, "grad_norm": 2.9586396006504723, "learning_rate": 4.828819385708344e-06, "loss": 0.7023, "step": 751 }, { "epoch": 1.2148626817447497, "grad_norm": 2.999154085893149, "learning_rate": 4.828356156881642e-06, "loss": 0.7754, "step": 752 }, { "epoch": 1.2164781906300486, "grad_norm": 3.0026152889296793, "learning_rate": 4.827892324413814e-06, "loss": 0.654, "step": 753 }, { "epoch": 1.2180936995153473, "grad_norm": 3.04102844970326, "learning_rate": 4.8274278884251115e-06, "loss": 0.6627, "step": 754 }, { "epoch": 1.2197092084006462, "grad_norm": 3.332418016763296, "learning_rate": 4.826962849035942e-06, "loss": 0.6743, "step": 755 }, { "epoch": 1.2213247172859452, "grad_norm": 2.687573089716002, "learning_rate": 4.82649720636687e-06, "loss": 0.6995, "step": 756 }, { "epoch": 1.2229402261712439, "grad_norm": 3.1449224630607087, "learning_rate": 4.826030960538617e-06, "loss": 0.7022, "step": 757 }, { "epoch": 1.2245557350565428, "grad_norm": 2.802559736814678, "learning_rate": 4.825564111672059e-06, "loss": 0.7559, "step": 758 }, { "epoch": 1.2261712439418417, "grad_norm": 3.2939709001621305, "learning_rate": 4.82509665988823e-06, "loss": 0.7384, "step": 759 }, { "epoch": 1.2277867528271407, "grad_norm": 2.8673118462124387, "learning_rate": 4.824628605308319e-06, "loss": 0.7297, "step": 760 }, { "epoch": 1.2294022617124394, "grad_norm": 3.48267119157368, "learning_rate": 4.824159948053674e-06, "loss": 0.7915, "step": 761 }, { "epoch": 1.2310177705977383, "grad_norm": 2.4147082511069633, "learning_rate": 4.8236906882457964e-06, "loss": 0.6161, "step": 762 }, { "epoch": 1.2326332794830372, "grad_norm": 3.0783845484164276, "learning_rate": 4.823220826006345e-06, "loss": 0.6717, "step": 763 }, { "epoch": 1.234248788368336, "grad_norm": 3.5247724855325027, "learning_rate": 4.822750361457134e-06, "loss": 0.7261, "step": 764 }, { "epoch": 1.235864297253635, "grad_norm": 2.89595043978703, "learning_rate": 4.822279294720135e-06, "loss": 0.6846, "step": 765 }, { "epoch": 1.2374798061389338, "grad_norm": 3.0291005746259207, "learning_rate": 4.821807625917474e-06, "loss": 0.7005, "step": 766 }, { "epoch": 1.2390953150242328, "grad_norm": 3.411644158896843, "learning_rate": 4.821335355171435e-06, "loss": 0.7053, "step": 767 }, { "epoch": 1.2407108239095315, "grad_norm": 3.42817445976008, "learning_rate": 4.820862482604457e-06, "loss": 0.7528, "step": 768 }, { "epoch": 1.2423263327948304, "grad_norm": 2.932037173561474, "learning_rate": 4.8203890083391355e-06, "loss": 0.729, "step": 769 }, { "epoch": 1.2439418416801293, "grad_norm": 3.292907486554429, "learning_rate": 4.819914932498222e-06, "loss": 0.7623, "step": 770 }, { "epoch": 1.245557350565428, "grad_norm": 3.2258391778402964, "learning_rate": 4.819440255204623e-06, "loss": 0.7234, "step": 771 }, { "epoch": 1.247172859450727, "grad_norm": 2.964383816157534, "learning_rate": 4.818964976581401e-06, "loss": 0.6425, "step": 772 }, { "epoch": 1.248788368336026, "grad_norm": 3.36561422198071, "learning_rate": 4.818489096751776e-06, "loss": 0.5844, "step": 773 }, { "epoch": 1.2504038772213248, "grad_norm": 3.583136682485798, "learning_rate": 4.818012615839123e-06, "loss": 0.7092, "step": 774 }, { "epoch": 1.2520193861066236, "grad_norm": 2.582771296376407, "learning_rate": 4.817535533966973e-06, "loss": 0.619, "step": 775 }, { "epoch": 1.2536348949919225, "grad_norm": 2.8799432748763385, "learning_rate": 4.8170578512590105e-06, "loss": 0.6268, "step": 776 }, { "epoch": 1.2552504038772212, "grad_norm": 3.002314125889941, "learning_rate": 4.816579567839079e-06, "loss": 0.7052, "step": 777 }, { "epoch": 1.2568659127625201, "grad_norm": 4.0133319421858875, "learning_rate": 4.816100683831178e-06, "loss": 0.7466, "step": 778 }, { "epoch": 1.258481421647819, "grad_norm": 3.5065692790338665, "learning_rate": 4.815621199359459e-06, "loss": 0.7132, "step": 779 }, { "epoch": 1.260096930533118, "grad_norm": 3.4865677910780444, "learning_rate": 4.815141114548232e-06, "loss": 0.7251, "step": 780 }, { "epoch": 1.261712439418417, "grad_norm": 2.975755708709986, "learning_rate": 4.814660429521963e-06, "loss": 0.6471, "step": 781 }, { "epoch": 1.2633279483037156, "grad_norm": 3.7469741105317196, "learning_rate": 4.8141791444052715e-06, "loss": 0.7635, "step": 782 }, { "epoch": 1.2649434571890146, "grad_norm": 3.1781615008153423, "learning_rate": 4.813697259322934e-06, "loss": 0.7685, "step": 783 }, { "epoch": 1.2665589660743133, "grad_norm": 3.053944730962014, "learning_rate": 4.813214774399882e-06, "loss": 0.656, "step": 784 }, { "epoch": 1.2681744749596122, "grad_norm": 3.005672333984633, "learning_rate": 4.8127316897612045e-06, "loss": 0.7502, "step": 785 }, { "epoch": 1.2697899838449112, "grad_norm": 3.2057276057286694, "learning_rate": 4.812248005532142e-06, "loss": 0.7122, "step": 786 }, { "epoch": 1.27140549273021, "grad_norm": 3.083613483874024, "learning_rate": 4.8117637218380945e-06, "loss": 0.7813, "step": 787 }, { "epoch": 1.273021001615509, "grad_norm": 3.03580189594934, "learning_rate": 4.811278838804614e-06, "loss": 0.7481, "step": 788 }, { "epoch": 1.2746365105008077, "grad_norm": 3.0165661996725692, "learning_rate": 4.81079335655741e-06, "loss": 0.6372, "step": 789 }, { "epoch": 1.2762520193861067, "grad_norm": 3.203269889071186, "learning_rate": 4.810307275222349e-06, "loss": 0.7101, "step": 790 }, { "epoch": 1.2778675282714054, "grad_norm": 3.07000724901014, "learning_rate": 4.809820594925447e-06, "loss": 0.6561, "step": 791 }, { "epoch": 1.2794830371567043, "grad_norm": 3.396216128872047, "learning_rate": 4.809333315792882e-06, "loss": 0.642, "step": 792 }, { "epoch": 1.2810985460420032, "grad_norm": 3.1506310653233776, "learning_rate": 4.808845437950982e-06, "loss": 0.72, "step": 793 }, { "epoch": 1.2827140549273022, "grad_norm": 3.3776338418377496, "learning_rate": 4.808356961526234e-06, "loss": 0.7064, "step": 794 }, { "epoch": 1.284329563812601, "grad_norm": 3.443518402819285, "learning_rate": 4.807867886645278e-06, "loss": 0.7952, "step": 795 }, { "epoch": 1.2859450726978998, "grad_norm": 3.2183269921658684, "learning_rate": 4.80737821343491e-06, "loss": 0.7012, "step": 796 }, { "epoch": 1.2875605815831987, "grad_norm": 3.146008918682736, "learning_rate": 4.806887942022081e-06, "loss": 0.6699, "step": 797 }, { "epoch": 1.2891760904684975, "grad_norm": 3.435787906715582, "learning_rate": 4.806397072533896e-06, "loss": 0.8248, "step": 798 }, { "epoch": 1.2907915993537964, "grad_norm": 3.120035259539539, "learning_rate": 4.805905605097616e-06, "loss": 0.6812, "step": 799 }, { "epoch": 1.2924071082390953, "grad_norm": 3.3800653698531655, "learning_rate": 4.805413539840659e-06, "loss": 0.7471, "step": 800 }, { "epoch": 1.2924071082390953, "eval_loss": 1.0639913082122803, "eval_runtime": 2.3532, "eval_samples_per_second": 127.487, "eval_steps_per_second": 2.975, "step": 800 }, { "epoch": 1.2940226171243943, "grad_norm": 3.4658791727239877, "learning_rate": 4.8049208768905944e-06, "loss": 0.6927, "step": 801 }, { "epoch": 1.2956381260096932, "grad_norm": 3.3503130274020925, "learning_rate": 4.804427616375149e-06, "loss": 0.6821, "step": 802 }, { "epoch": 1.297253634894992, "grad_norm": 3.2278395984597403, "learning_rate": 4.803933758422203e-06, "loss": 0.6531, "step": 803 }, { "epoch": 1.2988691437802908, "grad_norm": 2.894304692779474, "learning_rate": 4.803439303159792e-06, "loss": 0.6389, "step": 804 }, { "epoch": 1.3004846526655895, "grad_norm": 3.552687557865419, "learning_rate": 4.802944250716108e-06, "loss": 0.8374, "step": 805 }, { "epoch": 1.3021001615508885, "grad_norm": 3.7165679954193824, "learning_rate": 4.802448601219496e-06, "loss": 0.6768, "step": 806 }, { "epoch": 1.3037156704361874, "grad_norm": 3.826432739836166, "learning_rate": 4.801952354798456e-06, "loss": 0.8161, "step": 807 }, { "epoch": 1.3053311793214863, "grad_norm": 2.7159984820463996, "learning_rate": 4.801455511581643e-06, "loss": 0.6519, "step": 808 }, { "epoch": 1.306946688206785, "grad_norm": 3.6969202341177176, "learning_rate": 4.800958071697866e-06, "loss": 0.8314, "step": 809 }, { "epoch": 1.308562197092084, "grad_norm": 3.4182666616565895, "learning_rate": 4.80046003527609e-06, "loss": 0.7897, "step": 810 }, { "epoch": 1.310177705977383, "grad_norm": 3.2550584077423155, "learning_rate": 4.799961402445435e-06, "loss": 0.7475, "step": 811 }, { "epoch": 1.3117932148626816, "grad_norm": 3.032595455283263, "learning_rate": 4.799462173335175e-06, "loss": 0.7089, "step": 812 }, { "epoch": 1.3134087237479806, "grad_norm": 3.1358398769483933, "learning_rate": 4.798962348074736e-06, "loss": 0.7529, "step": 813 }, { "epoch": 1.3150242326332795, "grad_norm": 2.910817681936819, "learning_rate": 4.798461926793703e-06, "loss": 0.7174, "step": 814 }, { "epoch": 1.3166397415185784, "grad_norm": 3.0019599686763367, "learning_rate": 4.797960909621813e-06, "loss": 0.7031, "step": 815 }, { "epoch": 1.3182552504038771, "grad_norm": 3.662971255551093, "learning_rate": 4.797459296688956e-06, "loss": 0.7154, "step": 816 }, { "epoch": 1.319870759289176, "grad_norm": 3.1592446835393897, "learning_rate": 4.7969570881251815e-06, "loss": 0.7549, "step": 817 }, { "epoch": 1.321486268174475, "grad_norm": 3.349536531988181, "learning_rate": 4.7964542840606886e-06, "loss": 0.6992, "step": 818 }, { "epoch": 1.3231017770597737, "grad_norm": 3.074586084373942, "learning_rate": 4.795950884625832e-06, "loss": 0.6819, "step": 819 }, { "epoch": 1.3247172859450727, "grad_norm": 3.5554341414026704, "learning_rate": 4.795446889951122e-06, "loss": 0.6429, "step": 820 }, { "epoch": 1.3263327948303716, "grad_norm": 3.3827728473609, "learning_rate": 4.794942300167221e-06, "loss": 0.7692, "step": 821 }, { "epoch": 1.3279483037156705, "grad_norm": 3.611750615384829, "learning_rate": 4.79443711540495e-06, "loss": 0.8016, "step": 822 }, { "epoch": 1.3295638126009692, "grad_norm": 2.742111831770905, "learning_rate": 4.793931335795279e-06, "loss": 0.6274, "step": 823 }, { "epoch": 1.3311793214862682, "grad_norm": 3.374795637218273, "learning_rate": 4.793424961469335e-06, "loss": 0.7447, "step": 824 }, { "epoch": 1.332794830371567, "grad_norm": 3.247709015791372, "learning_rate": 4.792917992558398e-06, "loss": 0.7959, "step": 825 }, { "epoch": 1.3344103392568658, "grad_norm": 3.2054587872005316, "learning_rate": 4.792410429193905e-06, "loss": 0.6564, "step": 826 }, { "epoch": 1.3360258481421647, "grad_norm": 3.1252268197071733, "learning_rate": 4.791902271507444e-06, "loss": 0.6881, "step": 827 }, { "epoch": 1.3376413570274637, "grad_norm": 3.7464272283749476, "learning_rate": 4.791393519630757e-06, "loss": 0.7932, "step": 828 }, { "epoch": 1.3392568659127626, "grad_norm": 2.9589385997807542, "learning_rate": 4.790884173695743e-06, "loss": 0.6635, "step": 829 }, { "epoch": 1.3408723747980613, "grad_norm": 3.079593099621962, "learning_rate": 4.790374233834452e-06, "loss": 0.7319, "step": 830 }, { "epoch": 1.3424878836833603, "grad_norm": 2.735689761820437, "learning_rate": 4.7898637001790896e-06, "loss": 0.7371, "step": 831 }, { "epoch": 1.3441033925686592, "grad_norm": 2.8856332530243, "learning_rate": 4.789352572862015e-06, "loss": 0.6157, "step": 832 }, { "epoch": 1.345718901453958, "grad_norm": 3.378658310908716, "learning_rate": 4.788840852015741e-06, "loss": 0.7515, "step": 833 }, { "epoch": 1.3473344103392568, "grad_norm": 3.218541038746627, "learning_rate": 4.788328537772934e-06, "loss": 0.608, "step": 834 }, { "epoch": 1.3489499192245558, "grad_norm": 2.8948322968657267, "learning_rate": 4.787815630266415e-06, "loss": 0.6981, "step": 835 }, { "epoch": 1.3505654281098547, "grad_norm": 2.957995740340623, "learning_rate": 4.787302129629159e-06, "loss": 0.632, "step": 836 }, { "epoch": 1.3521809369951534, "grad_norm": 3.2228631077421537, "learning_rate": 4.786788035994293e-06, "loss": 0.7852, "step": 837 }, { "epoch": 1.3537964458804523, "grad_norm": 3.0775393328416483, "learning_rate": 4.786273349495102e-06, "loss": 0.6894, "step": 838 }, { "epoch": 1.3554119547657513, "grad_norm": 3.2876113053239493, "learning_rate": 4.785758070265018e-06, "loss": 0.7141, "step": 839 }, { "epoch": 1.35702746365105, "grad_norm": 2.963540382740698, "learning_rate": 4.785242198437633e-06, "loss": 0.6723, "step": 840 }, { "epoch": 1.358642972536349, "grad_norm": 3.59203040281942, "learning_rate": 4.784725734146689e-06, "loss": 0.7146, "step": 841 }, { "epoch": 1.3602584814216478, "grad_norm": 3.1787184244409454, "learning_rate": 4.784208677526084e-06, "loss": 0.7747, "step": 842 }, { "epoch": 1.3618739903069468, "grad_norm": 2.953365146172307, "learning_rate": 4.783691028709865e-06, "loss": 0.5898, "step": 843 }, { "epoch": 1.3634894991922455, "grad_norm": 3.6342944027110735, "learning_rate": 4.783172787832239e-06, "loss": 0.7356, "step": 844 }, { "epoch": 1.3651050080775444, "grad_norm": 3.460180139879553, "learning_rate": 4.782653955027562e-06, "loss": 0.6716, "step": 845 }, { "epoch": 1.3667205169628434, "grad_norm": 3.3603353319518328, "learning_rate": 4.782134530430345e-06, "loss": 0.6898, "step": 846 }, { "epoch": 1.368336025848142, "grad_norm": 3.1159131659792103, "learning_rate": 4.781614514175252e-06, "loss": 0.6203, "step": 847 }, { "epoch": 1.369951534733441, "grad_norm": 3.2967586932792305, "learning_rate": 4.7810939063971005e-06, "loss": 0.6898, "step": 848 }, { "epoch": 1.37156704361874, "grad_norm": 3.351495037770187, "learning_rate": 4.78057270723086e-06, "loss": 0.6906, "step": 849 }, { "epoch": 1.3731825525040389, "grad_norm": 3.2896075529258924, "learning_rate": 4.780050916811658e-06, "loss": 0.6847, "step": 850 }, { "epoch": 1.3747980613893376, "grad_norm": 3.688257498626984, "learning_rate": 4.779528535274769e-06, "loss": 0.748, "step": 851 }, { "epoch": 1.3764135702746365, "grad_norm": 3.248159208440795, "learning_rate": 4.7790055627556245e-06, "loss": 0.6911, "step": 852 }, { "epoch": 1.3780290791599354, "grad_norm": 3.2168549906984274, "learning_rate": 4.778481999389809e-06, "loss": 0.7027, "step": 853 }, { "epoch": 1.3796445880452342, "grad_norm": 2.900225088630893, "learning_rate": 4.777957845313059e-06, "loss": 0.64, "step": 854 }, { "epoch": 1.381260096930533, "grad_norm": 2.8421389824778953, "learning_rate": 4.7774331006612655e-06, "loss": 0.6737, "step": 855 }, { "epoch": 1.382875605815832, "grad_norm": 3.2518125535548714, "learning_rate": 4.776907765570471e-06, "loss": 0.7565, "step": 856 }, { "epoch": 1.384491114701131, "grad_norm": 3.041587701420527, "learning_rate": 4.776381840176873e-06, "loss": 0.7404, "step": 857 }, { "epoch": 1.3861066235864297, "grad_norm": 3.6815358473554385, "learning_rate": 4.775855324616821e-06, "loss": 0.7279, "step": 858 }, { "epoch": 1.3877221324717286, "grad_norm": 2.974414072951809, "learning_rate": 4.775328219026815e-06, "loss": 0.6481, "step": 859 }, { "epoch": 1.3893376413570275, "grad_norm": 3.058658202676708, "learning_rate": 4.774800523543514e-06, "loss": 0.6872, "step": 860 }, { "epoch": 1.3909531502423262, "grad_norm": 3.095904561494669, "learning_rate": 4.774272238303724e-06, "loss": 0.6947, "step": 861 }, { "epoch": 1.3925686591276252, "grad_norm": 3.0050890921986353, "learning_rate": 4.773743363444407e-06, "loss": 0.6375, "step": 862 }, { "epoch": 1.394184168012924, "grad_norm": 3.1604822549393785, "learning_rate": 4.773213899102677e-06, "loss": 0.7186, "step": 863 }, { "epoch": 1.395799676898223, "grad_norm": 2.9718066475803084, "learning_rate": 4.772683845415802e-06, "loss": 0.6852, "step": 864 }, { "epoch": 1.3974151857835218, "grad_norm": 3.0559313758031688, "learning_rate": 4.7721532025212015e-06, "loss": 0.6945, "step": 865 }, { "epoch": 1.3990306946688207, "grad_norm": 3.013724211589888, "learning_rate": 4.7716219705564465e-06, "loss": 0.6427, "step": 866 }, { "epoch": 1.4006462035541196, "grad_norm": 3.78080572354102, "learning_rate": 4.771090149659264e-06, "loss": 0.7813, "step": 867 }, { "epoch": 1.4022617124394183, "grad_norm": 3.4602601152545613, "learning_rate": 4.770557739967532e-06, "loss": 0.6979, "step": 868 }, { "epoch": 1.4038772213247173, "grad_norm": 3.0008169425840734, "learning_rate": 4.770024741619278e-06, "loss": 0.6735, "step": 869 }, { "epoch": 1.4054927302100162, "grad_norm": 3.246356553881531, "learning_rate": 4.76949115475269e-06, "loss": 0.6565, "step": 870 }, { "epoch": 1.4071082390953151, "grad_norm": 3.900128541697682, "learning_rate": 4.7689569795061e-06, "loss": 0.7851, "step": 871 }, { "epoch": 1.4087237479806138, "grad_norm": 2.8486263782425656, "learning_rate": 4.768422216017999e-06, "loss": 0.5841, "step": 872 }, { "epoch": 1.4103392568659128, "grad_norm": 2.9578204027598796, "learning_rate": 4.767886864427025e-06, "loss": 0.7003, "step": 873 }, { "epoch": 1.4119547657512117, "grad_norm": 3.3536218374311657, "learning_rate": 4.767350924871974e-06, "loss": 0.6417, "step": 874 }, { "epoch": 1.4135702746365104, "grad_norm": 2.911154636768606, "learning_rate": 4.76681439749179e-06, "loss": 0.6732, "step": 875 }, { "epoch": 1.4151857835218093, "grad_norm": 2.914583205318537, "learning_rate": 4.766277282425572e-06, "loss": 0.7123, "step": 876 }, { "epoch": 1.4168012924071083, "grad_norm": 3.1662317416508343, "learning_rate": 4.76573957981257e-06, "loss": 0.5803, "step": 877 }, { "epoch": 1.4184168012924072, "grad_norm": 3.150090328596128, "learning_rate": 4.765201289792187e-06, "loss": 0.7102, "step": 878 }, { "epoch": 1.420032310177706, "grad_norm": 3.457939758596963, "learning_rate": 4.764662412503979e-06, "loss": 0.7033, "step": 879 }, { "epoch": 1.4216478190630049, "grad_norm": 4.005369728468916, "learning_rate": 4.7641229480876515e-06, "loss": 0.7248, "step": 880 }, { "epoch": 1.4232633279483038, "grad_norm": 3.687286518590374, "learning_rate": 4.7635828966830645e-06, "loss": 0.7438, "step": 881 }, { "epoch": 1.4248788368336025, "grad_norm": 3.037895487640561, "learning_rate": 4.7630422584302315e-06, "loss": 0.7249, "step": 882 }, { "epoch": 1.4264943457189014, "grad_norm": 2.8539602172278986, "learning_rate": 4.762501033469316e-06, "loss": 0.6987, "step": 883 }, { "epoch": 1.4281098546042004, "grad_norm": 2.9757067895083917, "learning_rate": 4.7619592219406315e-06, "loss": 0.7797, "step": 884 }, { "epoch": 1.4297253634894993, "grad_norm": 3.5791474325709074, "learning_rate": 4.761416823984648e-06, "loss": 0.8014, "step": 885 }, { "epoch": 1.431340872374798, "grad_norm": 3.048482773904972, "learning_rate": 4.7608738397419866e-06, "loss": 0.6573, "step": 886 }, { "epoch": 1.432956381260097, "grad_norm": 3.658545393570446, "learning_rate": 4.760330269353417e-06, "loss": 0.7334, "step": 887 }, { "epoch": 1.4345718901453959, "grad_norm": 2.9235593083118854, "learning_rate": 4.7597861129598655e-06, "loss": 0.6555, "step": 888 }, { "epoch": 1.4361873990306946, "grad_norm": 3.2386277673478783, "learning_rate": 4.759241370702406e-06, "loss": 0.6561, "step": 889 }, { "epoch": 1.4378029079159935, "grad_norm": 3.1717151516714575, "learning_rate": 4.758696042722269e-06, "loss": 0.7283, "step": 890 }, { "epoch": 1.4394184168012925, "grad_norm": 3.0921449802702194, "learning_rate": 4.758150129160832e-06, "loss": 0.6677, "step": 891 }, { "epoch": 1.4410339256865914, "grad_norm": 3.455240342239564, "learning_rate": 4.757603630159627e-06, "loss": 0.747, "step": 892 }, { "epoch": 1.44264943457189, "grad_norm": 2.9476542946401016, "learning_rate": 4.757056545860338e-06, "loss": 0.6341, "step": 893 }, { "epoch": 1.444264943457189, "grad_norm": 2.7677409358275558, "learning_rate": 4.7565088764048e-06, "loss": 0.6351, "step": 894 }, { "epoch": 1.445880452342488, "grad_norm": 3.1142573579616895, "learning_rate": 4.755960621935e-06, "loss": 0.6575, "step": 895 }, { "epoch": 1.4474959612277867, "grad_norm": 3.187132603010141, "learning_rate": 4.755411782593075e-06, "loss": 0.6667, "step": 896 }, { "epoch": 1.4491114701130856, "grad_norm": 3.3826981484656122, "learning_rate": 4.754862358521316e-06, "loss": 0.6242, "step": 897 }, { "epoch": 1.4507269789983845, "grad_norm": 2.8517925088219447, "learning_rate": 4.754312349862165e-06, "loss": 0.6587, "step": 898 }, { "epoch": 1.4523424878836835, "grad_norm": 2.876115805177881, "learning_rate": 4.753761756758215e-06, "loss": 0.6444, "step": 899 }, { "epoch": 1.4539579967689822, "grad_norm": 2.9626213278802447, "learning_rate": 4.753210579352211e-06, "loss": 0.645, "step": 900 }, { "epoch": 1.4555735056542811, "grad_norm": 3.0152574017785723, "learning_rate": 4.752658817787049e-06, "loss": 0.7748, "step": 901 }, { "epoch": 1.4571890145395798, "grad_norm": 3.465136742553175, "learning_rate": 4.752106472205776e-06, "loss": 0.7429, "step": 902 }, { "epoch": 1.4588045234248788, "grad_norm": 3.0410621861713776, "learning_rate": 4.7515535427515924e-06, "loss": 0.7084, "step": 903 }, { "epoch": 1.4604200323101777, "grad_norm": 3.401405157648176, "learning_rate": 4.751000029567848e-06, "loss": 0.6583, "step": 904 }, { "epoch": 1.4620355411954766, "grad_norm": 3.0560838332309337, "learning_rate": 4.750445932798045e-06, "loss": 0.7663, "step": 905 }, { "epoch": 1.4636510500807756, "grad_norm": 3.14798731675863, "learning_rate": 4.749891252585836e-06, "loss": 0.6107, "step": 906 }, { "epoch": 1.4652665589660743, "grad_norm": 3.056462339903594, "learning_rate": 4.7493359890750255e-06, "loss": 0.6385, "step": 907 }, { "epoch": 1.4668820678513732, "grad_norm": 3.8064179342786115, "learning_rate": 4.748780142409569e-06, "loss": 0.7688, "step": 908 }, { "epoch": 1.468497576736672, "grad_norm": 3.3243212878411925, "learning_rate": 4.748223712733573e-06, "loss": 0.7215, "step": 909 }, { "epoch": 1.4701130856219708, "grad_norm": 3.2079163012890066, "learning_rate": 4.747666700191297e-06, "loss": 0.693, "step": 910 }, { "epoch": 1.4717285945072698, "grad_norm": 3.1059102702879278, "learning_rate": 4.747109104927148e-06, "loss": 0.6942, "step": 911 }, { "epoch": 1.4733441033925687, "grad_norm": 3.0954471922172133, "learning_rate": 4.746550927085687e-06, "loss": 0.6879, "step": 912 }, { "epoch": 1.4749596122778676, "grad_norm": 2.8544398400806994, "learning_rate": 4.745992166811626e-06, "loss": 0.7267, "step": 913 }, { "epoch": 1.4765751211631664, "grad_norm": 3.571665479164358, "learning_rate": 4.745432824249825e-06, "loss": 0.7299, "step": 914 }, { "epoch": 1.4781906300484653, "grad_norm": 3.527505554793847, "learning_rate": 4.744872899545299e-06, "loss": 0.7549, "step": 915 }, { "epoch": 1.479806138933764, "grad_norm": 2.9470317654227927, "learning_rate": 4.744312392843212e-06, "loss": 0.7316, "step": 916 }, { "epoch": 1.481421647819063, "grad_norm": 3.3435955903440484, "learning_rate": 4.743751304288877e-06, "loss": 0.6215, "step": 917 }, { "epoch": 1.4830371567043619, "grad_norm": 2.929636222002324, "learning_rate": 4.743189634027762e-06, "loss": 0.6271, "step": 918 }, { "epoch": 1.4846526655896608, "grad_norm": 3.824085029771376, "learning_rate": 4.742627382205484e-06, "loss": 0.6744, "step": 919 }, { "epoch": 1.4862681744749597, "grad_norm": 2.811683799249565, "learning_rate": 4.742064548967808e-06, "loss": 0.6446, "step": 920 }, { "epoch": 1.4878836833602584, "grad_norm": 3.2557476444976525, "learning_rate": 4.7415011344606524e-06, "loss": 0.7305, "step": 921 }, { "epoch": 1.4894991922455574, "grad_norm": 3.1723805055815246, "learning_rate": 4.740937138830088e-06, "loss": 0.7462, "step": 922 }, { "epoch": 1.491114701130856, "grad_norm": 3.030175420024594, "learning_rate": 4.7403725622223315e-06, "loss": 0.7129, "step": 923 }, { "epoch": 1.492730210016155, "grad_norm": 3.5152777105664463, "learning_rate": 4.739807404783756e-06, "loss": 0.7662, "step": 924 }, { "epoch": 1.494345718901454, "grad_norm": 3.013996713852223, "learning_rate": 4.73924166666088e-06, "loss": 0.6095, "step": 925 }, { "epoch": 1.495961227786753, "grad_norm": 2.7688490709551012, "learning_rate": 4.738675348000375e-06, "loss": 0.7175, "step": 926 }, { "epoch": 1.4975767366720518, "grad_norm": 3.1953413487049382, "learning_rate": 4.7381084489490646e-06, "loss": 0.7231, "step": 927 }, { "epoch": 1.4991922455573505, "grad_norm": 3.128538147430442, "learning_rate": 4.7375409696539186e-06, "loss": 0.5826, "step": 928 }, { "epoch": 1.5008077544426495, "grad_norm": 3.062270190647156, "learning_rate": 4.73697291026206e-06, "loss": 0.6911, "step": 929 }, { "epoch": 1.5024232633279482, "grad_norm": 3.1822572721793545, "learning_rate": 4.736404270920763e-06, "loss": 0.6921, "step": 930 }, { "epoch": 1.504038772213247, "grad_norm": 2.8050020248779135, "learning_rate": 4.735835051777449e-06, "loss": 0.608, "step": 931 }, { "epoch": 1.505654281098546, "grad_norm": 3.136529709016175, "learning_rate": 4.7352652529796935e-06, "loss": 0.6919, "step": 932 }, { "epoch": 1.507269789983845, "grad_norm": 3.2621553855676204, "learning_rate": 4.734694874675221e-06, "loss": 0.7048, "step": 933 }, { "epoch": 1.508885298869144, "grad_norm": 3.0969322262031125, "learning_rate": 4.734123917011904e-06, "loss": 0.6116, "step": 934 }, { "epoch": 1.5105008077544426, "grad_norm": 2.88877819652444, "learning_rate": 4.733552380137767e-06, "loss": 0.7122, "step": 935 }, { "epoch": 1.5121163166397416, "grad_norm": 3.25764416331997, "learning_rate": 4.732980264200985e-06, "loss": 0.671, "step": 936 }, { "epoch": 1.5137318255250403, "grad_norm": 3.116196923477232, "learning_rate": 4.732407569349883e-06, "loss": 0.7199, "step": 937 }, { "epoch": 1.5153473344103392, "grad_norm": 2.712083826261615, "learning_rate": 4.731834295732936e-06, "loss": 0.6267, "step": 938 }, { "epoch": 1.5169628432956381, "grad_norm": 3.1115702255727964, "learning_rate": 4.7312604434987666e-06, "loss": 0.7079, "step": 939 }, { "epoch": 1.518578352180937, "grad_norm": 3.2006613544632323, "learning_rate": 4.730686012796153e-06, "loss": 0.6843, "step": 940 }, { "epoch": 1.520193861066236, "grad_norm": 3.580516772399891, "learning_rate": 4.730111003774018e-06, "loss": 0.7756, "step": 941 }, { "epoch": 1.5218093699515347, "grad_norm": 3.1511525221959373, "learning_rate": 4.729535416581436e-06, "loss": 0.6888, "step": 942 }, { "epoch": 1.5234248788368336, "grad_norm": 3.1443530515942224, "learning_rate": 4.728959251367634e-06, "loss": 0.7109, "step": 943 }, { "epoch": 1.5250403877221324, "grad_norm": 3.142389106503143, "learning_rate": 4.728382508281983e-06, "loss": 0.6002, "step": 944 }, { "epoch": 1.5266558966074313, "grad_norm": 3.103315629726421, "learning_rate": 4.727805187474011e-06, "loss": 0.6346, "step": 945 }, { "epoch": 1.5282714054927302, "grad_norm": 2.9701280693501038, "learning_rate": 4.727227289093388e-06, "loss": 0.661, "step": 946 }, { "epoch": 1.5298869143780292, "grad_norm": 2.5876010869145967, "learning_rate": 4.726648813289942e-06, "loss": 0.5628, "step": 947 }, { "epoch": 1.531502423263328, "grad_norm": 3.1737880015452054, "learning_rate": 4.726069760213644e-06, "loss": 0.6331, "step": 948 }, { "epoch": 1.5331179321486268, "grad_norm": 4.1229414095881225, "learning_rate": 4.7254901300146176e-06, "loss": 0.8245, "step": 949 }, { "epoch": 1.5347334410339257, "grad_norm": 3.3561345406033065, "learning_rate": 4.724909922843136e-06, "loss": 0.6537, "step": 950 }, { "epoch": 1.5363489499192244, "grad_norm": 3.303227842959778, "learning_rate": 4.724329138849622e-06, "loss": 0.7314, "step": 951 }, { "epoch": 1.5379644588045234, "grad_norm": 3.1730539475929445, "learning_rate": 4.723747778184645e-06, "loss": 0.7056, "step": 952 }, { "epoch": 1.5395799676898223, "grad_norm": 3.320748990125193, "learning_rate": 4.72316584099893e-06, "loss": 0.7514, "step": 953 }, { "epoch": 1.5411954765751212, "grad_norm": 2.965627740870685, "learning_rate": 4.722583327443346e-06, "loss": 0.6898, "step": 954 }, { "epoch": 1.5428109854604202, "grad_norm": 2.9532275643786035, "learning_rate": 4.7220002376689135e-06, "loss": 0.6604, "step": 955 }, { "epoch": 1.5444264943457189, "grad_norm": 2.9513748727775897, "learning_rate": 4.721416571826803e-06, "loss": 0.7715, "step": 956 }, { "epoch": 1.5460420032310178, "grad_norm": 3.415802504053735, "learning_rate": 4.720832330068333e-06, "loss": 0.7398, "step": 957 }, { "epoch": 1.5476575121163165, "grad_norm": 3.0317168799136285, "learning_rate": 4.7202475125449705e-06, "loss": 0.7433, "step": 958 }, { "epoch": 1.5492730210016155, "grad_norm": 3.074696037816036, "learning_rate": 4.719662119408335e-06, "loss": 0.6327, "step": 959 }, { "epoch": 1.5508885298869144, "grad_norm": 3.591202973681794, "learning_rate": 4.719076150810193e-06, "loss": 0.7842, "step": 960 }, { "epoch": 1.5525040387722133, "grad_norm": 3.016660538132828, "learning_rate": 4.718489606902461e-06, "loss": 0.695, "step": 961 }, { "epoch": 1.5541195476575123, "grad_norm": 2.932750925657532, "learning_rate": 4.7179024878372035e-06, "loss": 0.6796, "step": 962 }, { "epoch": 1.555735056542811, "grad_norm": 3.2195309391581226, "learning_rate": 4.717314793766634e-06, "loss": 0.8182, "step": 963 }, { "epoch": 1.55735056542811, "grad_norm": 3.115559661011772, "learning_rate": 4.716726524843118e-06, "loss": 0.7154, "step": 964 }, { "epoch": 1.5589660743134086, "grad_norm": 3.4653510018301983, "learning_rate": 4.7161376812191675e-06, "loss": 0.6564, "step": 965 }, { "epoch": 1.5605815831987075, "grad_norm": 2.7396929212315198, "learning_rate": 4.715548263047443e-06, "loss": 0.6359, "step": 966 }, { "epoch": 1.5621970920840065, "grad_norm": 3.3038278213580745, "learning_rate": 4.714958270480754e-06, "loss": 0.6837, "step": 967 }, { "epoch": 1.5638126009693054, "grad_norm": 3.338686893143973, "learning_rate": 4.714367703672062e-06, "loss": 0.7129, "step": 968 }, { "epoch": 1.5654281098546043, "grad_norm": 3.2968290076914126, "learning_rate": 4.713776562774474e-06, "loss": 0.7157, "step": 969 }, { "epoch": 1.567043618739903, "grad_norm": 3.698391479548602, "learning_rate": 4.713184847941248e-06, "loss": 0.7264, "step": 970 }, { "epoch": 1.568659127625202, "grad_norm": 3.201733928932503, "learning_rate": 4.7125925593257886e-06, "loss": 0.7424, "step": 971 }, { "epoch": 1.5702746365105007, "grad_norm": 3.274871189914862, "learning_rate": 4.711999697081651e-06, "loss": 0.6785, "step": 972 }, { "epoch": 1.5718901453957996, "grad_norm": 2.948927531273667, "learning_rate": 4.711406261362539e-06, "loss": 0.6958, "step": 973 }, { "epoch": 1.5735056542810986, "grad_norm": 3.4545691540775927, "learning_rate": 4.7108122523223035e-06, "loss": 0.735, "step": 974 }, { "epoch": 1.5751211631663975, "grad_norm": 3.2863973478945145, "learning_rate": 4.710217670114945e-06, "loss": 0.7281, "step": 975 }, { "epoch": 1.5767366720516964, "grad_norm": 3.4898131340374863, "learning_rate": 4.709622514894615e-06, "loss": 0.7138, "step": 976 }, { "epoch": 1.5783521809369951, "grad_norm": 3.490834322766079, "learning_rate": 4.709026786815608e-06, "loss": 0.7226, "step": 977 }, { "epoch": 1.579967689822294, "grad_norm": 3.0399952415152223, "learning_rate": 4.708430486032372e-06, "loss": 0.5742, "step": 978 }, { "epoch": 1.5815831987075928, "grad_norm": 2.7137822858503227, "learning_rate": 4.707833612699503e-06, "loss": 0.6574, "step": 979 }, { "epoch": 1.5831987075928917, "grad_norm": 3.5850750672722302, "learning_rate": 4.707236166971742e-06, "loss": 0.8044, "step": 980 }, { "epoch": 1.5848142164781907, "grad_norm": 3.43849119311326, "learning_rate": 4.706638149003983e-06, "loss": 0.7241, "step": 981 }, { "epoch": 1.5864297253634896, "grad_norm": 3.068289249153385, "learning_rate": 4.706039558951262e-06, "loss": 0.6983, "step": 982 }, { "epoch": 1.5880452342487885, "grad_norm": 2.9297184081771825, "learning_rate": 4.705440396968771e-06, "loss": 0.569, "step": 983 }, { "epoch": 1.5896607431340872, "grad_norm": 4.219557523337789, "learning_rate": 4.7048406632118455e-06, "loss": 0.778, "step": 984 }, { "epoch": 1.5912762520193862, "grad_norm": 2.710376288566505, "learning_rate": 4.704240357835969e-06, "loss": 0.598, "step": 985 }, { "epoch": 1.5928917609046849, "grad_norm": 3.384836975827682, "learning_rate": 4.703639480996777e-06, "loss": 0.7448, "step": 986 }, { "epoch": 1.5945072697899838, "grad_norm": 3.4810186281813915, "learning_rate": 4.703038032850048e-06, "loss": 0.7527, "step": 987 }, { "epoch": 1.5961227786752827, "grad_norm": 3.124612389569777, "learning_rate": 4.702436013551713e-06, "loss": 0.6802, "step": 988 }, { "epoch": 1.5977382875605817, "grad_norm": 2.8268160165447536, "learning_rate": 4.701833423257849e-06, "loss": 0.5981, "step": 989 }, { "epoch": 1.5993537964458806, "grad_norm": 3.2664389064781347, "learning_rate": 4.7012302621246806e-06, "loss": 0.7006, "step": 990 }, { "epoch": 1.6009693053311793, "grad_norm": 3.4581727484946976, "learning_rate": 4.7006265303085816e-06, "loss": 0.7675, "step": 991 }, { "epoch": 1.602584814216478, "grad_norm": 3.036206797254767, "learning_rate": 4.700022227966074e-06, "loss": 0.7358, "step": 992 }, { "epoch": 1.604200323101777, "grad_norm": 3.0026477824282582, "learning_rate": 4.699417355253827e-06, "loss": 0.6357, "step": 993 }, { "epoch": 1.605815831987076, "grad_norm": 3.087100570040653, "learning_rate": 4.698811912328655e-06, "loss": 0.6861, "step": 994 }, { "epoch": 1.6074313408723748, "grad_norm": 3.4577585939946873, "learning_rate": 4.698205899347526e-06, "loss": 0.6407, "step": 995 }, { "epoch": 1.6090468497576738, "grad_norm": 2.9356535409906415, "learning_rate": 4.697599316467552e-06, "loss": 0.6811, "step": 996 }, { "epoch": 1.6106623586429727, "grad_norm": 2.87058350386487, "learning_rate": 4.696992163845994e-06, "loss": 0.6715, "step": 997 }, { "epoch": 1.6122778675282714, "grad_norm": 2.5958986189241835, "learning_rate": 4.696384441640259e-06, "loss": 0.6358, "step": 998 }, { "epoch": 1.6138933764135701, "grad_norm": 3.1957872883412173, "learning_rate": 4.695776150007902e-06, "loss": 0.7982, "step": 999 }, { "epoch": 1.615508885298869, "grad_norm": 3.2083590975335015, "learning_rate": 4.695167289106629e-06, "loss": 0.6547, "step": 1000 }, { "epoch": 1.615508885298869, "eval_loss": 1.0406948328018188, "eval_runtime": 2.3526, "eval_samples_per_second": 127.52, "eval_steps_per_second": 2.975, "step": 1000 }, { "epoch": 1.617124394184168, "grad_norm": 3.160630003530307, "learning_rate": 4.69455785909429e-06, "loss": 0.6444, "step": 1001 }, { "epoch": 1.618739903069467, "grad_norm": 3.0490971385491643, "learning_rate": 4.693947860128882e-06, "loss": 0.7095, "step": 1002 }, { "epoch": 1.6203554119547658, "grad_norm": 3.255348216931837, "learning_rate": 4.693337292368553e-06, "loss": 0.7387, "step": 1003 }, { "epoch": 1.6219709208400648, "grad_norm": 2.735675078286154, "learning_rate": 4.692726155971596e-06, "loss": 0.627, "step": 1004 }, { "epoch": 1.6235864297253635, "grad_norm": 3.042046486193686, "learning_rate": 4.692114451096452e-06, "loss": 0.6665, "step": 1005 }, { "epoch": 1.6252019386106622, "grad_norm": 3.626003853147279, "learning_rate": 4.69150217790171e-06, "loss": 0.7121, "step": 1006 }, { "epoch": 1.6268174474959611, "grad_norm": 3.4163232768381175, "learning_rate": 4.690889336546105e-06, "loss": 0.6934, "step": 1007 }, { "epoch": 1.62843295638126, "grad_norm": 3.4057386037578823, "learning_rate": 4.69027592718852e-06, "loss": 0.6533, "step": 1008 }, { "epoch": 1.630048465266559, "grad_norm": 3.3747371541385984, "learning_rate": 4.689661949987984e-06, "loss": 0.7057, "step": 1009 }, { "epoch": 1.631663974151858, "grad_norm": 3.5595608010883755, "learning_rate": 4.689047405103678e-06, "loss": 0.736, "step": 1010 }, { "epoch": 1.6332794830371569, "grad_norm": 3.3670598119410524, "learning_rate": 4.688432292694924e-06, "loss": 0.7564, "step": 1011 }, { "epoch": 1.6348949919224556, "grad_norm": 3.2453719290425282, "learning_rate": 4.687816612921194e-06, "loss": 0.6927, "step": 1012 }, { "epoch": 1.6365105008077543, "grad_norm": 3.1084010829024318, "learning_rate": 4.6872003659421075e-06, "loss": 0.727, "step": 1013 }, { "epoch": 1.6381260096930532, "grad_norm": 2.645450463492228, "learning_rate": 4.68658355191743e-06, "loss": 0.5755, "step": 1014 }, { "epoch": 1.6397415185783522, "grad_norm": 3.1132096737511388, "learning_rate": 4.685966171007074e-06, "loss": 0.7289, "step": 1015 }, { "epoch": 1.641357027463651, "grad_norm": 3.0204215631659026, "learning_rate": 4.685348223371102e-06, "loss": 0.7069, "step": 1016 }, { "epoch": 1.64297253634895, "grad_norm": 3.8686811618595924, "learning_rate": 4.684729709169718e-06, "loss": 0.8538, "step": 1017 }, { "epoch": 1.644588045234249, "grad_norm": 3.269547327373384, "learning_rate": 4.684110628563276e-06, "loss": 0.6989, "step": 1018 }, { "epoch": 1.6462035541195477, "grad_norm": 3.097653622360717, "learning_rate": 4.683490981712278e-06, "loss": 0.7449, "step": 1019 }, { "epoch": 1.6478190630048464, "grad_norm": 3.614536066064326, "learning_rate": 4.68287076877737e-06, "loss": 0.8059, "step": 1020 }, { "epoch": 1.6494345718901453, "grad_norm": 3.4623055328867345, "learning_rate": 4.6822499899193465e-06, "loss": 0.6589, "step": 1021 }, { "epoch": 1.6510500807754442, "grad_norm": 3.21584732818722, "learning_rate": 4.68162864529915e-06, "loss": 0.7051, "step": 1022 }, { "epoch": 1.6526655896607432, "grad_norm": 3.3142380611259368, "learning_rate": 4.681006735077865e-06, "loss": 0.6875, "step": 1023 }, { "epoch": 1.654281098546042, "grad_norm": 3.2299248587155627, "learning_rate": 4.680384259416729e-06, "loss": 0.5489, "step": 1024 }, { "epoch": 1.655896607431341, "grad_norm": 2.9344695945828385, "learning_rate": 4.67976121847712e-06, "loss": 0.6579, "step": 1025 }, { "epoch": 1.6575121163166397, "grad_norm": 3.1859415658327537, "learning_rate": 4.679137612420567e-06, "loss": 0.6668, "step": 1026 }, { "epoch": 1.6591276252019385, "grad_norm": 3.4904940057048304, "learning_rate": 4.6785134414087435e-06, "loss": 0.6911, "step": 1027 }, { "epoch": 1.6607431340872374, "grad_norm": 3.5004108883811558, "learning_rate": 4.677888705603469e-06, "loss": 0.7968, "step": 1028 }, { "epoch": 1.6623586429725363, "grad_norm": 4.002543112745558, "learning_rate": 4.677263405166711e-06, "loss": 0.7424, "step": 1029 }, { "epoch": 1.6639741518578353, "grad_norm": 3.4382762408807177, "learning_rate": 4.6766375402605825e-06, "loss": 0.743, "step": 1030 }, { "epoch": 1.6655896607431342, "grad_norm": 45.63935356301048, "learning_rate": 4.676011111047342e-06, "loss": 0.6867, "step": 1031 }, { "epoch": 1.667205169628433, "grad_norm": 3.1308651123316795, "learning_rate": 4.675384117689398e-06, "loss": 0.6761, "step": 1032 }, { "epoch": 1.6688206785137318, "grad_norm": 3.336585660403874, "learning_rate": 4.674756560349299e-06, "loss": 0.6822, "step": 1033 }, { "epoch": 1.6704361873990305, "grad_norm": 3.472007794737559, "learning_rate": 4.674128439189746e-06, "loss": 0.6234, "step": 1034 }, { "epoch": 1.6720516962843295, "grad_norm": 3.199288150870133, "learning_rate": 4.673499754373581e-06, "loss": 0.6805, "step": 1035 }, { "epoch": 1.6736672051696284, "grad_norm": 3.03394334539678, "learning_rate": 4.672870506063797e-06, "loss": 0.6968, "step": 1036 }, { "epoch": 1.6752827140549273, "grad_norm": 3.24216503124861, "learning_rate": 4.672240694423529e-06, "loss": 0.7386, "step": 1037 }, { "epoch": 1.6768982229402263, "grad_norm": 3.357838135351319, "learning_rate": 4.67161031961606e-06, "loss": 0.6386, "step": 1038 }, { "epoch": 1.678513731825525, "grad_norm": 3.6051559800462316, "learning_rate": 4.670979381804819e-06, "loss": 0.7719, "step": 1039 }, { "epoch": 1.680129240710824, "grad_norm": 3.3431299876896916, "learning_rate": 4.670347881153379e-06, "loss": 0.6709, "step": 1040 }, { "epoch": 1.6817447495961226, "grad_norm": 4.03753435258716, "learning_rate": 4.6697158178254646e-06, "loss": 0.8121, "step": 1041 }, { "epoch": 1.6833602584814216, "grad_norm": 2.9868273652578057, "learning_rate": 4.669083191984938e-06, "loss": 0.63, "step": 1042 }, { "epoch": 1.6849757673667205, "grad_norm": 3.518425600693153, "learning_rate": 4.668450003795813e-06, "loss": 0.7098, "step": 1043 }, { "epoch": 1.6865912762520194, "grad_norm": 2.993675915148754, "learning_rate": 4.6678162534222485e-06, "loss": 0.6156, "step": 1044 }, { "epoch": 1.6882067851373184, "grad_norm": 3.199683418113162, "learning_rate": 4.667181941028547e-06, "loss": 0.7136, "step": 1045 }, { "epoch": 1.689822294022617, "grad_norm": 3.0086846311393383, "learning_rate": 4.666547066779158e-06, "loss": 0.6461, "step": 1046 }, { "epoch": 1.691437802907916, "grad_norm": 2.694328413023972, "learning_rate": 4.6659116308386775e-06, "loss": 0.6643, "step": 1047 }, { "epoch": 1.6930533117932147, "grad_norm": 3.448890821420795, "learning_rate": 4.665275633371847e-06, "loss": 0.6941, "step": 1048 }, { "epoch": 1.6946688206785137, "grad_norm": 3.329281104700185, "learning_rate": 4.664639074543551e-06, "loss": 0.7501, "step": 1049 }, { "epoch": 1.6962843295638126, "grad_norm": 3.3007044296025, "learning_rate": 4.664001954518822e-06, "loss": 0.6906, "step": 1050 }, { "epoch": 1.6978998384491115, "grad_norm": 3.5724841530002514, "learning_rate": 4.663364273462838e-06, "loss": 0.7789, "step": 1051 }, { "epoch": 1.6995153473344105, "grad_norm": 3.3335500985101985, "learning_rate": 4.662726031540922e-06, "loss": 0.6997, "step": 1052 }, { "epoch": 1.7011308562197092, "grad_norm": 3.527535788909279, "learning_rate": 4.662087228918542e-06, "loss": 0.7906, "step": 1053 }, { "epoch": 1.702746365105008, "grad_norm": 3.381637394110631, "learning_rate": 4.661447865761312e-06, "loss": 0.7226, "step": 1054 }, { "epoch": 1.7043618739903068, "grad_norm": 3.1896775716605843, "learning_rate": 4.660807942234989e-06, "loss": 0.6932, "step": 1055 }, { "epoch": 1.7059773828756057, "grad_norm": 2.5680461366872636, "learning_rate": 4.66016745850548e-06, "loss": 0.6054, "step": 1056 }, { "epoch": 1.7075928917609047, "grad_norm": 3.655004360442089, "learning_rate": 4.659526414738832e-06, "loss": 0.8068, "step": 1057 }, { "epoch": 1.7092084006462036, "grad_norm": 3.320663469617057, "learning_rate": 4.658884811101242e-06, "loss": 0.6309, "step": 1058 }, { "epoch": 1.7108239095315025, "grad_norm": 3.503051061028231, "learning_rate": 4.658242647759048e-06, "loss": 0.7498, "step": 1059 }, { "epoch": 1.7124394184168013, "grad_norm": 3.1281105866044814, "learning_rate": 4.657599924878736e-06, "loss": 0.6676, "step": 1060 }, { "epoch": 1.7140549273021002, "grad_norm": 3.2651724486247207, "learning_rate": 4.656956642626935e-06, "loss": 0.7465, "step": 1061 }, { "epoch": 1.715670436187399, "grad_norm": 3.5147256874257677, "learning_rate": 4.656312801170422e-06, "loss": 0.7478, "step": 1062 }, { "epoch": 1.7172859450726978, "grad_norm": 3.129186575429102, "learning_rate": 4.655668400676114e-06, "loss": 0.7771, "step": 1063 }, { "epoch": 1.7189014539579968, "grad_norm": 3.5635292755081838, "learning_rate": 4.655023441311079e-06, "loss": 0.751, "step": 1064 }, { "epoch": 1.7205169628432957, "grad_norm": 3.19700341359944, "learning_rate": 4.6543779232425245e-06, "loss": 0.7032, "step": 1065 }, { "epoch": 1.7221324717285946, "grad_norm": 3.2657943815575843, "learning_rate": 4.653731846637806e-06, "loss": 0.7474, "step": 1066 }, { "epoch": 1.7237479806138933, "grad_norm": 4.008290980037219, "learning_rate": 4.653085211664423e-06, "loss": 0.7078, "step": 1067 }, { "epoch": 1.7253634894991923, "grad_norm": 3.2578293654250685, "learning_rate": 4.65243801849002e-06, "loss": 0.6553, "step": 1068 }, { "epoch": 1.726978998384491, "grad_norm": 3.137141107600606, "learning_rate": 4.651790267282386e-06, "loss": 0.7725, "step": 1069 }, { "epoch": 1.72859450726979, "grad_norm": 3.286427355831908, "learning_rate": 4.651141958209453e-06, "loss": 0.656, "step": 1070 }, { "epoch": 1.7302100161550888, "grad_norm": 2.9929704993315345, "learning_rate": 4.650493091439301e-06, "loss": 0.5721, "step": 1071 }, { "epoch": 1.7318255250403878, "grad_norm": 3.5702892876725545, "learning_rate": 4.6498436671401535e-06, "loss": 0.6918, "step": 1072 }, { "epoch": 1.7334410339256867, "grad_norm": 3.232194199731843, "learning_rate": 4.649193685480375e-06, "loss": 0.7676, "step": 1073 }, { "epoch": 1.7350565428109854, "grad_norm": 3.3669268834128703, "learning_rate": 4.64854314662848e-06, "loss": 0.6679, "step": 1074 }, { "epoch": 1.7366720516962844, "grad_norm": 2.9490824234073734, "learning_rate": 4.647892050753124e-06, "loss": 0.6062, "step": 1075 }, { "epoch": 1.738287560581583, "grad_norm": 3.5588789334404822, "learning_rate": 4.6472403980231074e-06, "loss": 0.6377, "step": 1076 }, { "epoch": 1.739903069466882, "grad_norm": 3.2558948134903867, "learning_rate": 4.646588188607376e-06, "loss": 0.7224, "step": 1077 }, { "epoch": 1.741518578352181, "grad_norm": 3.104400887851608, "learning_rate": 4.645935422675017e-06, "loss": 0.6455, "step": 1078 }, { "epoch": 1.7431340872374799, "grad_norm": 3.1014633254436377, "learning_rate": 4.645282100395268e-06, "loss": 0.6836, "step": 1079 }, { "epoch": 1.7447495961227788, "grad_norm": 3.5190105079862, "learning_rate": 4.644628221937505e-06, "loss": 0.7349, "step": 1080 }, { "epoch": 1.7463651050080775, "grad_norm": 3.230204225738683, "learning_rate": 4.643973787471249e-06, "loss": 0.6651, "step": 1081 }, { "epoch": 1.7479806138933764, "grad_norm": 3.2082829740675067, "learning_rate": 4.643318797166167e-06, "loss": 0.691, "step": 1082 }, { "epoch": 1.7495961227786752, "grad_norm": 2.955096368256971, "learning_rate": 4.64266325119207e-06, "loss": 0.7515, "step": 1083 }, { "epoch": 1.751211631663974, "grad_norm": 3.170521879710204, "learning_rate": 4.642007149718913e-06, "loss": 0.7156, "step": 1084 }, { "epoch": 1.752827140549273, "grad_norm": 3.2507765119477643, "learning_rate": 4.641350492916793e-06, "loss": 0.6833, "step": 1085 }, { "epoch": 1.754442649434572, "grad_norm": 3.1708106415019546, "learning_rate": 4.6406932809559526e-06, "loss": 0.6482, "step": 1086 }, { "epoch": 1.7560581583198709, "grad_norm": 3.3406554773154546, "learning_rate": 4.640035514006779e-06, "loss": 0.7832, "step": 1087 }, { "epoch": 1.7576736672051696, "grad_norm": 4.437207550957053, "learning_rate": 4.639377192239802e-06, "loss": 0.6604, "step": 1088 }, { "epoch": 1.7592891760904685, "grad_norm": 3.3482162183691786, "learning_rate": 4.638718315825697e-06, "loss": 0.7323, "step": 1089 }, { "epoch": 1.7609046849757672, "grad_norm": 3.404457025081948, "learning_rate": 4.6380588849352794e-06, "loss": 0.6488, "step": 1090 }, { "epoch": 1.7625201938610662, "grad_norm": 3.3162256574880478, "learning_rate": 4.6373988997395125e-06, "loss": 0.7925, "step": 1091 }, { "epoch": 1.764135702746365, "grad_norm": 2.976252279507659, "learning_rate": 4.636738360409501e-06, "loss": 0.567, "step": 1092 }, { "epoch": 1.765751211631664, "grad_norm": 3.885385007295431, "learning_rate": 4.636077267116494e-06, "loss": 0.8057, "step": 1093 }, { "epoch": 1.767366720516963, "grad_norm": 3.12555905329591, "learning_rate": 4.635415620031886e-06, "loss": 0.6266, "step": 1094 }, { "epoch": 1.7689822294022617, "grad_norm": 3.502496059516474, "learning_rate": 4.63475341932721e-06, "loss": 0.7636, "step": 1095 }, { "epoch": 1.7705977382875606, "grad_norm": 3.7258090299773867, "learning_rate": 4.634090665174148e-06, "loss": 0.6777, "step": 1096 }, { "epoch": 1.7722132471728593, "grad_norm": 2.8884812192583804, "learning_rate": 4.633427357744523e-06, "loss": 0.6409, "step": 1097 }, { "epoch": 1.7738287560581583, "grad_norm": 3.612757963636344, "learning_rate": 4.6327634972103005e-06, "loss": 0.7217, "step": 1098 }, { "epoch": 1.7754442649434572, "grad_norm": 3.4637651436184282, "learning_rate": 4.632099083743591e-06, "loss": 0.7244, "step": 1099 }, { "epoch": 1.7770597738287561, "grad_norm": 3.163987838858152, "learning_rate": 4.631434117516649e-06, "loss": 0.6587, "step": 1100 }, { "epoch": 1.778675282714055, "grad_norm": 3.185864606785003, "learning_rate": 4.6307685987018706e-06, "loss": 0.754, "step": 1101 }, { "epoch": 1.7802907915993538, "grad_norm": 3.3835317366438566, "learning_rate": 4.630102527471795e-06, "loss": 0.7264, "step": 1102 }, { "epoch": 1.7819063004846527, "grad_norm": 13.823789945285986, "learning_rate": 4.6294359039991054e-06, "loss": 0.9055, "step": 1103 }, { "epoch": 1.7835218093699514, "grad_norm": 2.9793339634379192, "learning_rate": 4.628768728456629e-06, "loss": 0.6091, "step": 1104 }, { "epoch": 1.7851373182552503, "grad_norm": 3.0096330683269965, "learning_rate": 4.628101001017336e-06, "loss": 0.6005, "step": 1105 }, { "epoch": 1.7867528271405493, "grad_norm": 3.808113867525369, "learning_rate": 4.627432721854337e-06, "loss": 0.8013, "step": 1106 }, { "epoch": 1.7883683360258482, "grad_norm": 3.505721152682734, "learning_rate": 4.626763891140889e-06, "loss": 0.6996, "step": 1107 }, { "epoch": 1.7899838449111471, "grad_norm": 3.256032201744535, "learning_rate": 4.626094509050391e-06, "loss": 0.684, "step": 1108 }, { "epoch": 1.7915993537964459, "grad_norm": 2.897678322410273, "learning_rate": 4.625424575756383e-06, "loss": 0.6396, "step": 1109 }, { "epoch": 1.7932148626817448, "grad_norm": 3.1250887356017483, "learning_rate": 4.624754091432551e-06, "loss": 0.6363, "step": 1110 }, { "epoch": 1.7948303715670435, "grad_norm": 3.057754765714098, "learning_rate": 4.624083056252721e-06, "loss": 0.7082, "step": 1111 }, { "epoch": 1.7964458804523424, "grad_norm": 3.2618044812754037, "learning_rate": 4.623411470390864e-06, "loss": 0.6556, "step": 1112 }, { "epoch": 1.7980613893376414, "grad_norm": 3.2933183065030476, "learning_rate": 4.622739334021092e-06, "loss": 0.6926, "step": 1113 }, { "epoch": 1.7996768982229403, "grad_norm": 3.2782356968386823, "learning_rate": 4.622066647317662e-06, "loss": 0.7056, "step": 1114 }, { "epoch": 1.8012924071082392, "grad_norm": 4.263167500693584, "learning_rate": 4.621393410454972e-06, "loss": 0.7959, "step": 1115 }, { "epoch": 1.802907915993538, "grad_norm": 3.393549271990807, "learning_rate": 4.620719623607562e-06, "loss": 0.6987, "step": 1116 }, { "epoch": 1.8045234248788369, "grad_norm": 3.7106982242616264, "learning_rate": 4.620045286950115e-06, "loss": 0.7741, "step": 1117 }, { "epoch": 1.8061389337641356, "grad_norm": 3.725243909957609, "learning_rate": 4.619370400657459e-06, "loss": 0.8112, "step": 1118 }, { "epoch": 1.8077544426494345, "grad_norm": 3.5039354133540854, "learning_rate": 4.618694964904562e-06, "loss": 0.7991, "step": 1119 }, { "epoch": 1.8093699515347335, "grad_norm": 3.2640403417418065, "learning_rate": 4.618018979866534e-06, "loss": 0.6479, "step": 1120 }, { "epoch": 1.8109854604200324, "grad_norm": 2.8412788489278173, "learning_rate": 4.617342445718629e-06, "loss": 0.6498, "step": 1121 }, { "epoch": 1.8126009693053313, "grad_norm": 4.1600903015515875, "learning_rate": 4.616665362636243e-06, "loss": 0.7004, "step": 1122 }, { "epoch": 1.81421647819063, "grad_norm": 3.62793167541856, "learning_rate": 4.6159877307949155e-06, "loss": 0.7511, "step": 1123 }, { "epoch": 1.815831987075929, "grad_norm": 3.4801676385047964, "learning_rate": 4.6153095503703234e-06, "loss": 0.7268, "step": 1124 }, { "epoch": 1.8174474959612277, "grad_norm": 3.376491050043304, "learning_rate": 4.614630821538292e-06, "loss": 0.6796, "step": 1125 }, { "epoch": 1.8190630048465266, "grad_norm": 3.1432430172678503, "learning_rate": 4.6139515444747855e-06, "loss": 0.6512, "step": 1126 }, { "epoch": 1.8206785137318255, "grad_norm": 3.3631643266721936, "learning_rate": 4.61327171935591e-06, "loss": 0.6773, "step": 1127 }, { "epoch": 1.8222940226171245, "grad_norm": 2.8431623530837924, "learning_rate": 4.612591346357914e-06, "loss": 0.7136, "step": 1128 }, { "epoch": 1.8239095315024234, "grad_norm": 3.483074406432927, "learning_rate": 4.6119104256571915e-06, "loss": 0.6841, "step": 1129 }, { "epoch": 1.8255250403877221, "grad_norm": 3.258650517003323, "learning_rate": 4.611228957430272e-06, "loss": 0.6539, "step": 1130 }, { "epoch": 1.827140549273021, "grad_norm": 3.5079026465772234, "learning_rate": 4.610546941853833e-06, "loss": 0.7891, "step": 1131 }, { "epoch": 1.8287560581583198, "grad_norm": 2.9641568604037674, "learning_rate": 4.60986437910469e-06, "loss": 0.6158, "step": 1132 }, { "epoch": 1.8303715670436187, "grad_norm": 3.1993725468198755, "learning_rate": 4.609181269359802e-06, "loss": 0.66, "step": 1133 }, { "epoch": 1.8319870759289176, "grad_norm": 3.3493167532433863, "learning_rate": 4.60849761279627e-06, "loss": 0.6911, "step": 1134 }, { "epoch": 1.8336025848142166, "grad_norm": 69.33553316294442, "learning_rate": 4.6078134095913366e-06, "loss": 0.8501, "step": 1135 }, { "epoch": 1.8352180936995155, "grad_norm": 3.8329976175170746, "learning_rate": 4.607128659922384e-06, "loss": 0.7616, "step": 1136 }, { "epoch": 1.8368336025848142, "grad_norm": 2.9818938333910308, "learning_rate": 4.60644336396694e-06, "loss": 0.6539, "step": 1137 }, { "epoch": 1.8384491114701131, "grad_norm": 3.9024640436561415, "learning_rate": 4.605757521902671e-06, "loss": 0.7968, "step": 1138 }, { "epoch": 1.8400646203554119, "grad_norm": 3.480649519696312, "learning_rate": 4.605071133907385e-06, "loss": 0.7824, "step": 1139 }, { "epoch": 1.8416801292407108, "grad_norm": 3.22595492866121, "learning_rate": 4.6043842001590344e-06, "loss": 0.6375, "step": 1140 }, { "epoch": 1.8432956381260097, "grad_norm": 3.1626250897292123, "learning_rate": 4.603696720835709e-06, "loss": 0.7792, "step": 1141 }, { "epoch": 1.8449111470113086, "grad_norm": 3.1157673274395616, "learning_rate": 4.6030086961156446e-06, "loss": 0.6911, "step": 1142 }, { "epoch": 1.8465266558966076, "grad_norm": 3.851338932955812, "learning_rate": 4.602320126177214e-06, "loss": 0.7898, "step": 1143 }, { "epoch": 1.8481421647819063, "grad_norm": 3.276974316262791, "learning_rate": 4.601631011198934e-06, "loss": 0.7317, "step": 1144 }, { "epoch": 1.849757673667205, "grad_norm": 3.0235091101528293, "learning_rate": 4.600941351359462e-06, "loss": 0.6458, "step": 1145 }, { "epoch": 1.851373182552504, "grad_norm": 3.674641098545654, "learning_rate": 4.600251146837597e-06, "loss": 0.7319, "step": 1146 }, { "epoch": 1.8529886914378029, "grad_norm": 3.43419186661694, "learning_rate": 4.599560397812279e-06, "loss": 0.6814, "step": 1147 }, { "epoch": 1.8546042003231018, "grad_norm": 2.8226391133533486, "learning_rate": 4.598869104462589e-06, "loss": 0.7021, "step": 1148 }, { "epoch": 1.8562197092084007, "grad_norm": 2.9750852332345707, "learning_rate": 4.5981772669677485e-06, "loss": 0.6817, "step": 1149 }, { "epoch": 1.8578352180936997, "grad_norm": 2.828838437788304, "learning_rate": 4.597484885507121e-06, "loss": 0.6573, "step": 1150 }, { "epoch": 1.8594507269789984, "grad_norm": 3.0714403980415743, "learning_rate": 4.5967919602602105e-06, "loss": 0.6823, "step": 1151 }, { "epoch": 1.861066235864297, "grad_norm": 3.905377148880026, "learning_rate": 4.596098491406664e-06, "loss": 0.6617, "step": 1152 }, { "epoch": 1.862681744749596, "grad_norm": 2.8573779907355457, "learning_rate": 4.595404479126266e-06, "loss": 0.5986, "step": 1153 }, { "epoch": 1.864297253634895, "grad_norm": 3.1769549556279246, "learning_rate": 4.594709923598944e-06, "loss": 0.7913, "step": 1154 }, { "epoch": 1.865912762520194, "grad_norm": 3.2021561756196917, "learning_rate": 4.594014825004766e-06, "loss": 0.6878, "step": 1155 }, { "epoch": 1.8675282714054928, "grad_norm": 3.2183636332898686, "learning_rate": 4.59331918352394e-06, "loss": 0.6006, "step": 1156 }, { "epoch": 1.8691437802907918, "grad_norm": 3.276838024011354, "learning_rate": 4.592622999336817e-06, "loss": 0.7052, "step": 1157 }, { "epoch": 1.8707592891760905, "grad_norm": 3.028822449908614, "learning_rate": 4.591926272623887e-06, "loss": 0.6509, "step": 1158 }, { "epoch": 1.8723747980613892, "grad_norm": 3.632092178700806, "learning_rate": 4.5912290035657795e-06, "loss": 0.6962, "step": 1159 }, { "epoch": 1.8739903069466881, "grad_norm": 3.067701153651212, "learning_rate": 4.590531192343266e-06, "loss": 0.6484, "step": 1160 }, { "epoch": 1.875605815831987, "grad_norm": 3.205292963698303, "learning_rate": 4.589832839137259e-06, "loss": 0.676, "step": 1161 }, { "epoch": 1.877221324717286, "grad_norm": 3.6053103322651077, "learning_rate": 4.589133944128813e-06, "loss": 0.6968, "step": 1162 }, { "epoch": 1.878836833602585, "grad_norm": 3.353839433366993, "learning_rate": 4.5884345074991175e-06, "loss": 0.6711, "step": 1163 }, { "epoch": 1.8804523424878838, "grad_norm": 5.005461183183196, "learning_rate": 4.587734529429508e-06, "loss": 0.797, "step": 1164 }, { "epoch": 1.8820678513731826, "grad_norm": 3.2361405970870285, "learning_rate": 4.587034010101457e-06, "loss": 0.7387, "step": 1165 }, { "epoch": 1.8836833602584813, "grad_norm": 3.613269356535257, "learning_rate": 4.58633294969658e-06, "loss": 0.6646, "step": 1166 }, { "epoch": 1.8852988691437802, "grad_norm": 3.1368846199489004, "learning_rate": 4.585631348396631e-06, "loss": 0.6515, "step": 1167 }, { "epoch": 1.8869143780290791, "grad_norm": 2.760610188946028, "learning_rate": 4.5849292063835045e-06, "loss": 0.657, "step": 1168 }, { "epoch": 1.888529886914378, "grad_norm": 2.9749611852623037, "learning_rate": 4.584226523839235e-06, "loss": 0.7223, "step": 1169 }, { "epoch": 1.890145395799677, "grad_norm": 3.14087014662755, "learning_rate": 4.5835233009459965e-06, "loss": 0.759, "step": 1170 }, { "epoch": 1.891760904684976, "grad_norm": 3.6251416831431356, "learning_rate": 4.582819537886105e-06, "loss": 0.68, "step": 1171 }, { "epoch": 1.8933764135702746, "grad_norm": 3.477425464213681, "learning_rate": 4.582115234842017e-06, "loss": 0.7511, "step": 1172 }, { "epoch": 1.8949919224555734, "grad_norm": 3.194482145205737, "learning_rate": 4.581410391996325e-06, "loss": 0.6989, "step": 1173 }, { "epoch": 1.8966074313408723, "grad_norm": 3.2612720053738187, "learning_rate": 4.5807050095317655e-06, "loss": 0.6882, "step": 1174 }, { "epoch": 1.8982229402261712, "grad_norm": 3.3196698330447427, "learning_rate": 4.579999087631213e-06, "loss": 0.7068, "step": 1175 }, { "epoch": 1.8998384491114702, "grad_norm": 2.896081752090587, "learning_rate": 4.579292626477681e-06, "loss": 0.6599, "step": 1176 }, { "epoch": 1.901453957996769, "grad_norm": 3.0509723004923104, "learning_rate": 4.578585626254327e-06, "loss": 0.7285, "step": 1177 }, { "epoch": 1.903069466882068, "grad_norm": 2.9598625267197396, "learning_rate": 4.5778780871444425e-06, "loss": 0.6715, "step": 1178 }, { "epoch": 1.9046849757673667, "grad_norm": 3.936307820472635, "learning_rate": 4.577170009331463e-06, "loss": 0.732, "step": 1179 }, { "epoch": 1.9063004846526654, "grad_norm": 3.458899001611642, "learning_rate": 4.576461392998961e-06, "loss": 0.6977, "step": 1180 }, { "epoch": 1.9079159935379644, "grad_norm": 2.996490730564444, "learning_rate": 4.57575223833065e-06, "loss": 0.653, "step": 1181 }, { "epoch": 1.9095315024232633, "grad_norm": 3.046197917903791, "learning_rate": 4.5750425455103855e-06, "loss": 0.661, "step": 1182 }, { "epoch": 1.9111470113085622, "grad_norm": 3.7587138745226674, "learning_rate": 4.574332314722156e-06, "loss": 0.7024, "step": 1183 }, { "epoch": 1.9127625201938612, "grad_norm": 3.6826858978665076, "learning_rate": 4.573621546150096e-06, "loss": 0.7356, "step": 1184 }, { "epoch": 1.9143780290791599, "grad_norm": 3.6231158444374247, "learning_rate": 4.572910239978475e-06, "loss": 0.6695, "step": 1185 }, { "epoch": 1.9159935379644588, "grad_norm": 3.4377606834103296, "learning_rate": 4.572198396391706e-06, "loss": 0.7155, "step": 1186 }, { "epoch": 1.9176090468497575, "grad_norm": 3.5003295058780193, "learning_rate": 4.571486015574336e-06, "loss": 0.6604, "step": 1187 }, { "epoch": 1.9192245557350565, "grad_norm": 3.1654582076620605, "learning_rate": 4.570773097711056e-06, "loss": 0.6075, "step": 1188 }, { "epoch": 1.9208400646203554, "grad_norm": 4.127082337278471, "learning_rate": 4.570059642986694e-06, "loss": 0.7438, "step": 1189 }, { "epoch": 1.9224555735056543, "grad_norm": 3.5790155431641537, "learning_rate": 4.569345651586218e-06, "loss": 0.7368, "step": 1190 }, { "epoch": 1.9240710823909533, "grad_norm": 3.3589412321507623, "learning_rate": 4.568631123694735e-06, "loss": 0.7863, "step": 1191 }, { "epoch": 1.925686591276252, "grad_norm": 3.3241450957062, "learning_rate": 4.56791605949749e-06, "loss": 0.7554, "step": 1192 }, { "epoch": 1.927302100161551, "grad_norm": 2.8737475927777885, "learning_rate": 4.567200459179869e-06, "loss": 0.6769, "step": 1193 }, { "epoch": 1.9289176090468496, "grad_norm": 3.000287443095188, "learning_rate": 4.566484322927396e-06, "loss": 0.7903, "step": 1194 }, { "epoch": 1.9305331179321485, "grad_norm": 3.6118340861091025, "learning_rate": 4.565767650925732e-06, "loss": 0.6989, "step": 1195 }, { "epoch": 1.9321486268174475, "grad_norm": 3.498935952036064, "learning_rate": 4.565050443360681e-06, "loss": 0.8008, "step": 1196 }, { "epoch": 1.9337641357027464, "grad_norm": 3.8217500121140144, "learning_rate": 4.564332700418182e-06, "loss": 0.7549, "step": 1197 }, { "epoch": 1.9353796445880453, "grad_norm": 2.805504211889597, "learning_rate": 4.563614422284316e-06, "loss": 0.6469, "step": 1198 }, { "epoch": 1.936995153473344, "grad_norm": 3.369244832484239, "learning_rate": 4.562895609145299e-06, "loss": 0.69, "step": 1199 }, { "epoch": 1.938610662358643, "grad_norm": 3.4885979009261257, "learning_rate": 4.56217626118749e-06, "loss": 0.7506, "step": 1200 }, { "epoch": 1.938610662358643, "eval_loss": 1.0279576778411865, "eval_runtime": 2.3571, "eval_samples_per_second": 127.273, "eval_steps_per_second": 2.97, "step": 1200 }, { "epoch": 1.9402261712439417, "grad_norm": 3.681079142218604, "learning_rate": 4.561456378597384e-06, "loss": 0.6837, "step": 1201 }, { "epoch": 1.9418416801292406, "grad_norm": 2.8749343409328287, "learning_rate": 4.560735961561614e-06, "loss": 0.5929, "step": 1202 }, { "epoch": 1.9434571890145396, "grad_norm": 3.2511551132526204, "learning_rate": 4.560015010266954e-06, "loss": 0.5858, "step": 1203 }, { "epoch": 1.9450726978998385, "grad_norm": 3.8205943699903413, "learning_rate": 4.559293524900314e-06, "loss": 0.7551, "step": 1204 }, { "epoch": 1.9466882067851374, "grad_norm": 3.2178655583922025, "learning_rate": 4.558571505648745e-06, "loss": 0.6558, "step": 1205 }, { "epoch": 1.9483037156704361, "grad_norm": 3.012085609696442, "learning_rate": 4.557848952699435e-06, "loss": 0.6356, "step": 1206 }, { "epoch": 1.949919224555735, "grad_norm": 3.467066198173169, "learning_rate": 4.557125866239709e-06, "loss": 0.643, "step": 1207 }, { "epoch": 1.9515347334410338, "grad_norm": 3.4196025420046077, "learning_rate": 4.5564022464570336e-06, "loss": 0.7397, "step": 1208 }, { "epoch": 1.9531502423263327, "grad_norm": 3.137001597645926, "learning_rate": 4.55567809353901e-06, "loss": 0.6551, "step": 1209 }, { "epoch": 1.9547657512116317, "grad_norm": 3.0007263817553445, "learning_rate": 4.554953407673381e-06, "loss": 0.6814, "step": 1210 }, { "epoch": 1.9563812600969306, "grad_norm": 3.4053683765348155, "learning_rate": 4.554228189048024e-06, "loss": 0.7337, "step": 1211 }, { "epoch": 1.9579967689822295, "grad_norm": 3.1775124026969195, "learning_rate": 4.55350243785096e-06, "loss": 0.6203, "step": 1212 }, { "epoch": 1.9596122778675282, "grad_norm": 3.752019836770551, "learning_rate": 4.552776154270341e-06, "loss": 0.6905, "step": 1213 }, { "epoch": 1.9612277867528272, "grad_norm": 3.294732853539312, "learning_rate": 4.552049338494462e-06, "loss": 0.6704, "step": 1214 }, { "epoch": 1.9628432956381259, "grad_norm": 3.628183625674725, "learning_rate": 4.551321990711755e-06, "loss": 0.8043, "step": 1215 }, { "epoch": 1.9644588045234248, "grad_norm": 3.390484083464067, "learning_rate": 4.550594111110789e-06, "loss": 0.6385, "step": 1216 }, { "epoch": 1.9660743134087237, "grad_norm": 3.096896658919059, "learning_rate": 4.549865699880273e-06, "loss": 0.6272, "step": 1217 }, { "epoch": 1.9676898222940227, "grad_norm": 3.6081381377936372, "learning_rate": 4.54913675720905e-06, "loss": 0.7309, "step": 1218 }, { "epoch": 1.9693053311793216, "grad_norm": 3.3965434237844847, "learning_rate": 4.548407283286104e-06, "loss": 0.675, "step": 1219 }, { "epoch": 1.9709208400646203, "grad_norm": 2.9837796057887886, "learning_rate": 4.547677278300555e-06, "loss": 0.6251, "step": 1220 }, { "epoch": 1.9725363489499192, "grad_norm": 3.156429480046301, "learning_rate": 4.5469467424416624e-06, "loss": 0.7149, "step": 1221 }, { "epoch": 1.974151857835218, "grad_norm": 3.209614969473873, "learning_rate": 4.546215675898822e-06, "loss": 0.6624, "step": 1222 }, { "epoch": 1.975767366720517, "grad_norm": 3.1733550475887813, "learning_rate": 4.545484078861568e-06, "loss": 0.7106, "step": 1223 }, { "epoch": 1.9773828756058158, "grad_norm": 3.624249241961771, "learning_rate": 4.54475195151957e-06, "loss": 0.7673, "step": 1224 }, { "epoch": 1.9789983844911148, "grad_norm": 3.292473248074704, "learning_rate": 4.54401929406264e-06, "loss": 0.781, "step": 1225 }, { "epoch": 1.9806138933764137, "grad_norm": 3.0887401768845346, "learning_rate": 4.5432861066807184e-06, "loss": 0.6385, "step": 1226 }, { "epoch": 1.9822294022617124, "grad_norm": 3.33963361739408, "learning_rate": 4.542552389563895e-06, "loss": 0.6947, "step": 1227 }, { "epoch": 1.9838449111470113, "grad_norm": 3.4768864634011547, "learning_rate": 4.541818142902386e-06, "loss": 0.6292, "step": 1228 }, { "epoch": 1.98546042003231, "grad_norm": 3.161159990306139, "learning_rate": 4.541083366886552e-06, "loss": 0.7054, "step": 1229 }, { "epoch": 1.987075928917609, "grad_norm": 2.86817840681304, "learning_rate": 4.540348061706886e-06, "loss": 0.6475, "step": 1230 }, { "epoch": 1.988691437802908, "grad_norm": 3.2605565722188645, "learning_rate": 4.539612227554024e-06, "loss": 0.7459, "step": 1231 }, { "epoch": 1.9903069466882068, "grad_norm": 2.773198926596929, "learning_rate": 4.5388758646187326e-06, "loss": 0.6368, "step": 1232 }, { "epoch": 1.9919224555735058, "grad_norm": 3.834867170062262, "learning_rate": 4.5381389730919205e-06, "loss": 0.7593, "step": 1233 }, { "epoch": 1.9935379644588045, "grad_norm": 3.336902418927337, "learning_rate": 4.53740155316463e-06, "loss": 0.7113, "step": 1234 }, { "epoch": 1.9951534733441034, "grad_norm": 3.2591191380885456, "learning_rate": 4.536663605028043e-06, "loss": 0.6599, "step": 1235 }, { "epoch": 1.9967689822294021, "grad_norm": 3.687135098425971, "learning_rate": 4.535925128873477e-06, "loss": 0.6751, "step": 1236 }, { "epoch": 1.998384491114701, "grad_norm": 2.8902022325231274, "learning_rate": 4.535186124892386e-06, "loss": 0.6246, "step": 1237 }, { "epoch": 2.0, "grad_norm": 2.5069322476172466, "learning_rate": 4.534446593276362e-06, "loss": 0.5399, "step": 1238 }, { "epoch": 2.001615508885299, "grad_norm": 4.2639487791424076, "learning_rate": 4.533706534217134e-06, "loss": 0.3502, "step": 1239 }, { "epoch": 2.003231017770598, "grad_norm": 3.8942516149727084, "learning_rate": 4.532965947906566e-06, "loss": 0.4451, "step": 1240 }, { "epoch": 2.004846526655897, "grad_norm": 3.3055886820898857, "learning_rate": 4.532224834536659e-06, "loss": 0.4528, "step": 1241 }, { "epoch": 2.0064620355411953, "grad_norm": 3.4006212008333225, "learning_rate": 4.5314831942995544e-06, "loss": 0.4353, "step": 1242 }, { "epoch": 2.008077544426494, "grad_norm": 4.287211578133182, "learning_rate": 4.530741027387523e-06, "loss": 0.4146, "step": 1243 }, { "epoch": 2.009693053311793, "grad_norm": 4.002310126983108, "learning_rate": 4.52999833399298e-06, "loss": 0.3739, "step": 1244 }, { "epoch": 2.011308562197092, "grad_norm": 4.1625145311630725, "learning_rate": 4.52925511430847e-06, "loss": 0.4087, "step": 1245 }, { "epoch": 2.012924071082391, "grad_norm": 4.0236264517231035, "learning_rate": 4.5285113685266805e-06, "loss": 0.457, "step": 1246 }, { "epoch": 2.01453957996769, "grad_norm": 4.2059974033590555, "learning_rate": 4.5277670968404305e-06, "loss": 0.3976, "step": 1247 }, { "epoch": 2.016155088852989, "grad_norm": 3.412640169137161, "learning_rate": 4.527022299442678e-06, "loss": 0.3258, "step": 1248 }, { "epoch": 2.0177705977382874, "grad_norm": 3.5097845652843787, "learning_rate": 4.526276976526515e-06, "loss": 0.3734, "step": 1249 }, { "epoch": 2.0193861066235863, "grad_norm": 2.839135462553247, "learning_rate": 4.5255311282851735e-06, "loss": 0.3073, "step": 1250 }, { "epoch": 2.0210016155088852, "grad_norm": 3.2939032704931166, "learning_rate": 4.524784754912018e-06, "loss": 0.3762, "step": 1251 }, { "epoch": 2.022617124394184, "grad_norm": 3.0881816455053643, "learning_rate": 4.524037856600551e-06, "loss": 0.4102, "step": 1252 }, { "epoch": 2.024232633279483, "grad_norm": 3.011694890649903, "learning_rate": 4.523290433544409e-06, "loss": 0.3647, "step": 1253 }, { "epoch": 2.025848142164782, "grad_norm": 3.7740439707190494, "learning_rate": 4.522542485937369e-06, "loss": 0.4505, "step": 1254 }, { "epoch": 2.027463651050081, "grad_norm": 4.056600720144823, "learning_rate": 4.521794013973339e-06, "loss": 0.4152, "step": 1255 }, { "epoch": 2.0290791599353795, "grad_norm": 4.268885886250012, "learning_rate": 4.521045017846366e-06, "loss": 0.4283, "step": 1256 }, { "epoch": 2.0306946688206784, "grad_norm": 3.607030413177723, "learning_rate": 4.520295497750632e-06, "loss": 0.3537, "step": 1257 }, { "epoch": 2.0323101777059773, "grad_norm": 3.51733523671672, "learning_rate": 4.5195454538804544e-06, "loss": 0.3748, "step": 1258 }, { "epoch": 2.0339256865912763, "grad_norm": 3.6365904540261282, "learning_rate": 4.518794886430288e-06, "loss": 0.3185, "step": 1259 }, { "epoch": 2.035541195476575, "grad_norm": 2.908267141410896, "learning_rate": 4.51804379559472e-06, "loss": 0.3431, "step": 1260 }, { "epoch": 2.037156704361874, "grad_norm": 3.4564437996984494, "learning_rate": 4.517292181568477e-06, "loss": 0.3318, "step": 1261 }, { "epoch": 2.038772213247173, "grad_norm": 3.4909955608578134, "learning_rate": 4.51654004454642e-06, "loss": 0.3654, "step": 1262 }, { "epoch": 2.0403877221324715, "grad_norm": 3.2054436656038052, "learning_rate": 4.515787384723545e-06, "loss": 0.4137, "step": 1263 }, { "epoch": 2.0420032310177705, "grad_norm": 3.2676066892981765, "learning_rate": 4.515034202294983e-06, "loss": 0.371, "step": 1264 }, { "epoch": 2.0436187399030694, "grad_norm": 3.413040022572559, "learning_rate": 4.514280497456002e-06, "loss": 0.3783, "step": 1265 }, { "epoch": 2.0452342487883683, "grad_norm": 3.7857581973769014, "learning_rate": 4.513526270402005e-06, "loss": 0.3773, "step": 1266 }, { "epoch": 2.0468497576736673, "grad_norm": 3.3425396529152547, "learning_rate": 4.512771521328529e-06, "loss": 0.3728, "step": 1267 }, { "epoch": 2.048465266558966, "grad_norm": 3.198604159512752, "learning_rate": 4.5120162504312495e-06, "loss": 0.3393, "step": 1268 }, { "epoch": 2.050080775444265, "grad_norm": 3.4510806593234116, "learning_rate": 4.511260457905974e-06, "loss": 0.34, "step": 1269 }, { "epoch": 2.0516962843295636, "grad_norm": 3.4993135308812056, "learning_rate": 4.510504143948646e-06, "loss": 0.3849, "step": 1270 }, { "epoch": 2.0533117932148626, "grad_norm": 3.3418426090113313, "learning_rate": 4.509747308755347e-06, "loss": 0.3808, "step": 1271 }, { "epoch": 2.0549273021001615, "grad_norm": 3.204012298337511, "learning_rate": 4.508989952522288e-06, "loss": 0.4323, "step": 1272 }, { "epoch": 2.0565428109854604, "grad_norm": 3.9610604584423097, "learning_rate": 4.50823207544582e-06, "loss": 0.4146, "step": 1273 }, { "epoch": 2.0581583198707594, "grad_norm": 3.321591995588428, "learning_rate": 4.507473677722429e-06, "loss": 0.3767, "step": 1274 }, { "epoch": 2.0597738287560583, "grad_norm": 3.1837727231354642, "learning_rate": 4.506714759548731e-06, "loss": 0.4273, "step": 1275 }, { "epoch": 2.0613893376413572, "grad_norm": 3.3922082245131553, "learning_rate": 4.505955321121485e-06, "loss": 0.4276, "step": 1276 }, { "epoch": 2.0630048465266557, "grad_norm": 3.125159754131894, "learning_rate": 4.505195362637576e-06, "loss": 0.3798, "step": 1277 }, { "epoch": 2.0646203554119547, "grad_norm": 3.144137110558031, "learning_rate": 4.504434884294029e-06, "loss": 0.3409, "step": 1278 }, { "epoch": 2.0662358642972536, "grad_norm": 2.8757321107876876, "learning_rate": 4.503673886288004e-06, "loss": 0.2956, "step": 1279 }, { "epoch": 2.0678513731825525, "grad_norm": 3.516839320003112, "learning_rate": 4.502912368816794e-06, "loss": 0.5039, "step": 1280 }, { "epoch": 2.0694668820678515, "grad_norm": 3.2692557835489455, "learning_rate": 4.5021503320778285e-06, "loss": 0.3546, "step": 1281 }, { "epoch": 2.0710823909531504, "grad_norm": 2.7703008254346244, "learning_rate": 4.501387776268667e-06, "loss": 0.2875, "step": 1282 }, { "epoch": 2.0726978998384493, "grad_norm": 3.6263278158369157, "learning_rate": 4.500624701587011e-06, "loss": 0.4222, "step": 1283 }, { "epoch": 2.074313408723748, "grad_norm": 3.327110376975682, "learning_rate": 4.49986110823069e-06, "loss": 0.4051, "step": 1284 }, { "epoch": 2.0759289176090467, "grad_norm": 3.016053672621238, "learning_rate": 4.4990969963976705e-06, "loss": 0.3775, "step": 1285 }, { "epoch": 2.0775444264943457, "grad_norm": 3.47117971866676, "learning_rate": 4.4983323662860545e-06, "loss": 0.3808, "step": 1286 }, { "epoch": 2.0791599353796446, "grad_norm": 3.8509610124343365, "learning_rate": 4.497567218094076e-06, "loss": 0.3645, "step": 1287 }, { "epoch": 2.0807754442649435, "grad_norm": 3.4450311505539837, "learning_rate": 4.496801552020106e-06, "loss": 0.3639, "step": 1288 }, { "epoch": 2.0823909531502425, "grad_norm": 3.8166435129924725, "learning_rate": 4.4960353682626465e-06, "loss": 0.3971, "step": 1289 }, { "epoch": 2.0840064620355414, "grad_norm": 3.3124160486133576, "learning_rate": 4.495268667020336e-06, "loss": 0.3353, "step": 1290 }, { "epoch": 2.08562197092084, "grad_norm": 3.641399578056048, "learning_rate": 4.494501448491948e-06, "loss": 0.383, "step": 1291 }, { "epoch": 2.087237479806139, "grad_norm": 3.382500430845839, "learning_rate": 4.493733712876387e-06, "loss": 0.4147, "step": 1292 }, { "epoch": 2.0888529886914378, "grad_norm": 3.8840548518898994, "learning_rate": 4.492965460372695e-06, "loss": 0.3962, "step": 1293 }, { "epoch": 2.0904684975767367, "grad_norm": 3.8875119933290176, "learning_rate": 4.492196691180045e-06, "loss": 0.4537, "step": 1294 }, { "epoch": 2.0920840064620356, "grad_norm": 3.5787342678158174, "learning_rate": 4.4914274054977455e-06, "loss": 0.4117, "step": 1295 }, { "epoch": 2.0936995153473346, "grad_norm": 3.234394979423552, "learning_rate": 4.4906576035252406e-06, "loss": 0.3809, "step": 1296 }, { "epoch": 2.0953150242326335, "grad_norm": 2.9607644205774317, "learning_rate": 4.4898872854621026e-06, "loss": 0.3623, "step": 1297 }, { "epoch": 2.096930533117932, "grad_norm": 3.5440458126886427, "learning_rate": 4.489116451508044e-06, "loss": 0.4038, "step": 1298 }, { "epoch": 2.098546042003231, "grad_norm": 3.2206818765134275, "learning_rate": 4.488345101862909e-06, "loss": 0.3832, "step": 1299 }, { "epoch": 2.10016155088853, "grad_norm": 3.3888543164759954, "learning_rate": 4.487573236726674e-06, "loss": 0.3621, "step": 1300 }, { "epoch": 2.101777059773829, "grad_norm": 3.0709928923756507, "learning_rate": 4.486800856299448e-06, "loss": 0.3343, "step": 1301 }, { "epoch": 2.1033925686591277, "grad_norm": 2.9408046982062555, "learning_rate": 4.486027960781478e-06, "loss": 0.2943, "step": 1302 }, { "epoch": 2.1050080775444266, "grad_norm": 3.929167686208214, "learning_rate": 4.4852545503731415e-06, "loss": 0.4124, "step": 1303 }, { "epoch": 2.106623586429725, "grad_norm": 3.7754213217571033, "learning_rate": 4.484480625274951e-06, "loss": 0.3625, "step": 1304 }, { "epoch": 2.108239095315024, "grad_norm": 3.6326761510887016, "learning_rate": 4.483706185687548e-06, "loss": 0.4472, "step": 1305 }, { "epoch": 2.109854604200323, "grad_norm": 4.447850443680651, "learning_rate": 4.482931231811716e-06, "loss": 0.343, "step": 1306 }, { "epoch": 2.111470113085622, "grad_norm": 3.42579067551481, "learning_rate": 4.4821557638483626e-06, "loss": 0.3767, "step": 1307 }, { "epoch": 2.113085621970921, "grad_norm": 4.043057998564379, "learning_rate": 4.4813797819985335e-06, "loss": 0.4206, "step": 1308 }, { "epoch": 2.11470113085622, "grad_norm": 3.7201544847804775, "learning_rate": 4.4806032864634085e-06, "loss": 0.3607, "step": 1309 }, { "epoch": 2.1163166397415187, "grad_norm": 3.4873256119675387, "learning_rate": 4.479826277444299e-06, "loss": 0.4054, "step": 1310 }, { "epoch": 2.1179321486268172, "grad_norm": 3.276318977594615, "learning_rate": 4.479048755142648e-06, "loss": 0.3837, "step": 1311 }, { "epoch": 2.119547657512116, "grad_norm": 2.9486705194236937, "learning_rate": 4.478270719760033e-06, "loss": 0.337, "step": 1312 }, { "epoch": 2.121163166397415, "grad_norm": 3.6014604181771284, "learning_rate": 4.477492171498166e-06, "loss": 0.4076, "step": 1313 }, { "epoch": 2.122778675282714, "grad_norm": 3.782521515551011, "learning_rate": 4.476713110558889e-06, "loss": 0.4045, "step": 1314 }, { "epoch": 2.124394184168013, "grad_norm": 3.623889283978945, "learning_rate": 4.47593353714418e-06, "loss": 0.4194, "step": 1315 }, { "epoch": 2.126009693053312, "grad_norm": 3.0143201116466014, "learning_rate": 4.475153451456149e-06, "loss": 0.4344, "step": 1316 }, { "epoch": 2.127625201938611, "grad_norm": 3.5378380401762026, "learning_rate": 4.474372853697036e-06, "loss": 0.3671, "step": 1317 }, { "epoch": 2.1292407108239093, "grad_norm": 3.3194474622512566, "learning_rate": 4.4735917440692165e-06, "loss": 0.3673, "step": 1318 }, { "epoch": 2.1308562197092082, "grad_norm": 3.4746464933389594, "learning_rate": 4.472810122775199e-06, "loss": 0.3785, "step": 1319 }, { "epoch": 2.132471728594507, "grad_norm": 3.541535549168205, "learning_rate": 4.472027990017623e-06, "loss": 0.3365, "step": 1320 }, { "epoch": 2.134087237479806, "grad_norm": 3.6389781664661323, "learning_rate": 4.471245345999262e-06, "loss": 0.4622, "step": 1321 }, { "epoch": 2.135702746365105, "grad_norm": 3.6197639696660464, "learning_rate": 4.470462190923022e-06, "loss": 0.4305, "step": 1322 }, { "epoch": 2.137318255250404, "grad_norm": 4.126882980291173, "learning_rate": 4.469678524991939e-06, "loss": 0.3479, "step": 1323 }, { "epoch": 2.138933764135703, "grad_norm": 3.720463023818055, "learning_rate": 4.4688943484091855e-06, "loss": 0.4555, "step": 1324 }, { "epoch": 2.1405492730210014, "grad_norm": 2.616870124627718, "learning_rate": 4.468109661378062e-06, "loss": 0.2908, "step": 1325 }, { "epoch": 2.1421647819063003, "grad_norm": 2.9934427173465807, "learning_rate": 4.467324464102005e-06, "loss": 0.3496, "step": 1326 }, { "epoch": 2.1437802907915993, "grad_norm": 3.0221168509035867, "learning_rate": 4.466538756784582e-06, "loss": 0.3324, "step": 1327 }, { "epoch": 2.145395799676898, "grad_norm": 3.653052888068948, "learning_rate": 4.465752539629492e-06, "loss": 0.3711, "step": 1328 }, { "epoch": 2.147011308562197, "grad_norm": 3.3236178139730126, "learning_rate": 4.464965812840567e-06, "loss": 0.3241, "step": 1329 }, { "epoch": 2.148626817447496, "grad_norm": 2.924289780504899, "learning_rate": 4.464178576621772e-06, "loss": 0.3089, "step": 1330 }, { "epoch": 2.150242326332795, "grad_norm": 3.366038360603095, "learning_rate": 4.463390831177201e-06, "loss": 0.3548, "step": 1331 }, { "epoch": 2.1518578352180935, "grad_norm": 3.3896192589591987, "learning_rate": 4.462602576711084e-06, "loss": 0.3811, "step": 1332 }, { "epoch": 2.1534733441033924, "grad_norm": 3.3740663959917137, "learning_rate": 4.461813813427779e-06, "loss": 0.353, "step": 1333 }, { "epoch": 2.1550888529886914, "grad_norm": 3.7052923586903983, "learning_rate": 4.461024541531779e-06, "loss": 0.3966, "step": 1334 }, { "epoch": 2.1567043618739903, "grad_norm": 3.75096410327994, "learning_rate": 4.460234761227709e-06, "loss": 0.4071, "step": 1335 }, { "epoch": 2.158319870759289, "grad_norm": 3.9708318894404484, "learning_rate": 4.4594444727203215e-06, "loss": 0.3894, "step": 1336 }, { "epoch": 2.159935379644588, "grad_norm": 3.3836351295580545, "learning_rate": 4.458653676214506e-06, "loss": 0.3879, "step": 1337 }, { "epoch": 2.161550888529887, "grad_norm": 3.6889298437301163, "learning_rate": 4.457862371915281e-06, "loss": 0.4706, "step": 1338 }, { "epoch": 2.1631663974151856, "grad_norm": 3.6154036224468555, "learning_rate": 4.457070560027797e-06, "loss": 0.3786, "step": 1339 }, { "epoch": 2.1647819063004845, "grad_norm": 3.7701398659916108, "learning_rate": 4.456278240757338e-06, "loss": 0.4101, "step": 1340 }, { "epoch": 2.1663974151857834, "grad_norm": 3.8253439744679594, "learning_rate": 4.455485414309315e-06, "loss": 0.3464, "step": 1341 }, { "epoch": 2.1680129240710824, "grad_norm": 3.2477904679071794, "learning_rate": 4.4546920808892744e-06, "loss": 0.3784, "step": 1342 }, { "epoch": 2.1696284329563813, "grad_norm": 3.5368001914056193, "learning_rate": 4.4538982407028936e-06, "loss": 0.3452, "step": 1343 }, { "epoch": 2.1712439418416802, "grad_norm": 3.213453562486292, "learning_rate": 4.45310389395598e-06, "loss": 0.3719, "step": 1344 }, { "epoch": 2.172859450726979, "grad_norm": 3.2045424172477706, "learning_rate": 4.452309040854474e-06, "loss": 0.3564, "step": 1345 }, { "epoch": 2.1744749596122777, "grad_norm": 3.828741972964734, "learning_rate": 4.451513681604444e-06, "loss": 0.4359, "step": 1346 }, { "epoch": 2.1760904684975766, "grad_norm": 3.7669774135600367, "learning_rate": 4.450717816412095e-06, "loss": 0.4104, "step": 1347 }, { "epoch": 2.1777059773828755, "grad_norm": 3.6674622843347926, "learning_rate": 4.449921445483758e-06, "loss": 0.3797, "step": 1348 }, { "epoch": 2.1793214862681745, "grad_norm": 3.6151618902132654, "learning_rate": 4.449124569025898e-06, "loss": 0.3935, "step": 1349 }, { "epoch": 2.1809369951534734, "grad_norm": 3.2329401082115345, "learning_rate": 4.448327187245109e-06, "loss": 0.3211, "step": 1350 }, { "epoch": 2.1825525040387723, "grad_norm": 3.2308709231951207, "learning_rate": 4.44752930034812e-06, "loss": 0.3762, "step": 1351 }, { "epoch": 2.1841680129240713, "grad_norm": 3.2140079084712103, "learning_rate": 4.446730908541785e-06, "loss": 0.3414, "step": 1352 }, { "epoch": 2.1857835218093697, "grad_norm": 4.490339976311839, "learning_rate": 4.445932012033096e-06, "loss": 0.4013, "step": 1353 }, { "epoch": 2.1873990306946687, "grad_norm": 3.630676221398067, "learning_rate": 4.445132611029168e-06, "loss": 0.3868, "step": 1354 }, { "epoch": 2.1890145395799676, "grad_norm": 3.5811751346698832, "learning_rate": 4.4443327057372536e-06, "loss": 0.3889, "step": 1355 }, { "epoch": 2.1906300484652665, "grad_norm": 3.3437686018878003, "learning_rate": 4.443532296364732e-06, "loss": 0.4078, "step": 1356 }, { "epoch": 2.1922455573505655, "grad_norm": 3.6796639631882924, "learning_rate": 4.442731383119115e-06, "loss": 0.3998, "step": 1357 }, { "epoch": 2.1938610662358644, "grad_norm": 4.081310106548665, "learning_rate": 4.441929966208043e-06, "loss": 0.3957, "step": 1358 }, { "epoch": 2.1954765751211633, "grad_norm": 3.5102475446634323, "learning_rate": 4.441128045839291e-06, "loss": 0.3484, "step": 1359 }, { "epoch": 2.197092084006462, "grad_norm": 3.2881121866569103, "learning_rate": 4.44032562222076e-06, "loss": 0.351, "step": 1360 }, { "epoch": 2.1987075928917608, "grad_norm": 3.393935096904267, "learning_rate": 4.4395226955604845e-06, "loss": 0.3052, "step": 1361 }, { "epoch": 2.2003231017770597, "grad_norm": 4.00234336798314, "learning_rate": 4.4387192660666265e-06, "loss": 0.4085, "step": 1362 }, { "epoch": 2.2019386106623586, "grad_norm": 3.32369710698198, "learning_rate": 4.4379153339474825e-06, "loss": 0.4397, "step": 1363 }, { "epoch": 2.2035541195476576, "grad_norm": 3.4137861728481225, "learning_rate": 4.437110899411477e-06, "loss": 0.3428, "step": 1364 }, { "epoch": 2.2051696284329565, "grad_norm": 4.325500995477943, "learning_rate": 4.436305962667162e-06, "loss": 0.3721, "step": 1365 }, { "epoch": 2.2067851373182554, "grad_norm": 3.4450182936423612, "learning_rate": 4.435500523923225e-06, "loss": 0.3845, "step": 1366 }, { "epoch": 2.208400646203554, "grad_norm": 3.2769648813014034, "learning_rate": 4.43469458338848e-06, "loss": 0.3578, "step": 1367 }, { "epoch": 2.210016155088853, "grad_norm": 3.0526649378999475, "learning_rate": 4.433888141271873e-06, "loss": 0.3892, "step": 1368 }, { "epoch": 2.211631663974152, "grad_norm": 3.8828786677080243, "learning_rate": 4.433081197782479e-06, "loss": 0.4464, "step": 1369 }, { "epoch": 2.2132471728594507, "grad_norm": 3.5583194203338, "learning_rate": 4.432273753129503e-06, "loss": 0.3824, "step": 1370 }, { "epoch": 2.2148626817447497, "grad_norm": 3.966822261995666, "learning_rate": 4.43146580752228e-06, "loss": 0.4395, "step": 1371 }, { "epoch": 2.2164781906300486, "grad_norm": 3.8568341516222397, "learning_rate": 4.430657361170274e-06, "loss": 0.4208, "step": 1372 }, { "epoch": 2.2180936995153475, "grad_norm": 3.748455560266487, "learning_rate": 4.4298484142830835e-06, "loss": 0.3502, "step": 1373 }, { "epoch": 2.219709208400646, "grad_norm": 3.4013757470708677, "learning_rate": 4.429038967070429e-06, "loss": 0.3696, "step": 1374 }, { "epoch": 2.221324717285945, "grad_norm": 4.71576879868742, "learning_rate": 4.4282290197421674e-06, "loss": 0.352, "step": 1375 }, { "epoch": 2.222940226171244, "grad_norm": 3.203409752777254, "learning_rate": 4.427418572508281e-06, "loss": 0.351, "step": 1376 }, { "epoch": 2.224555735056543, "grad_norm": 3.227852443589977, "learning_rate": 4.4266076255788845e-06, "loss": 0.3705, "step": 1377 }, { "epoch": 2.2261712439418417, "grad_norm": 3.746716473009662, "learning_rate": 4.425796179164221e-06, "loss": 0.4233, "step": 1378 }, { "epoch": 2.2277867528271407, "grad_norm": 3.076468350586058, "learning_rate": 4.424984233474663e-06, "loss": 0.3492, "step": 1379 }, { "epoch": 2.2294022617124396, "grad_norm": 3.342514759904531, "learning_rate": 4.424171788720713e-06, "loss": 0.3909, "step": 1380 }, { "epoch": 2.231017770597738, "grad_norm": 3.477203434565954, "learning_rate": 4.4233588451130005e-06, "loss": 0.3249, "step": 1381 }, { "epoch": 2.232633279483037, "grad_norm": 2.8919212135331005, "learning_rate": 4.422545402862289e-06, "loss": 0.3029, "step": 1382 }, { "epoch": 2.234248788368336, "grad_norm": 3.4505305130104826, "learning_rate": 4.421731462179466e-06, "loss": 0.3982, "step": 1383 }, { "epoch": 2.235864297253635, "grad_norm": 3.647214650014195, "learning_rate": 4.420917023275554e-06, "loss": 0.3572, "step": 1384 }, { "epoch": 2.237479806138934, "grad_norm": 4.3111771580428755, "learning_rate": 4.420102086361699e-06, "loss": 0.4096, "step": 1385 }, { "epoch": 2.2390953150242328, "grad_norm": 3.912223728269677, "learning_rate": 4.419286651649178e-06, "loss": 0.3875, "step": 1386 }, { "epoch": 2.2407108239095317, "grad_norm": 3.3936850602611917, "learning_rate": 4.418470719349399e-06, "loss": 0.3492, "step": 1387 }, { "epoch": 2.24232633279483, "grad_norm": 3.116989772204181, "learning_rate": 4.417654289673898e-06, "loss": 0.3343, "step": 1388 }, { "epoch": 2.243941841680129, "grad_norm": 3.594697319812993, "learning_rate": 4.416837362834339e-06, "loss": 0.3557, "step": 1389 }, { "epoch": 2.245557350565428, "grad_norm": 3.8077426153202656, "learning_rate": 4.416019939042515e-06, "loss": 0.4197, "step": 1390 }, { "epoch": 2.247172859450727, "grad_norm": 3.6039248526995853, "learning_rate": 4.415202018510349e-06, "loss": 0.4948, "step": 1391 }, { "epoch": 2.248788368336026, "grad_norm": 3.003012336542264, "learning_rate": 4.414383601449892e-06, "loss": 0.3238, "step": 1392 }, { "epoch": 2.250403877221325, "grad_norm": 3.742287440055839, "learning_rate": 4.413564688073324e-06, "loss": 0.452, "step": 1393 }, { "epoch": 2.2520193861066238, "grad_norm": 4.053602188795989, "learning_rate": 4.412745278592953e-06, "loss": 0.4002, "step": 1394 }, { "epoch": 2.2536348949919223, "grad_norm": 3.1749721570383955, "learning_rate": 4.411925373221215e-06, "loss": 0.3584, "step": 1395 }, { "epoch": 2.255250403877221, "grad_norm": 3.9315392393232393, "learning_rate": 4.411104972170679e-06, "loss": 0.3684, "step": 1396 }, { "epoch": 2.25686591276252, "grad_norm": 3.6324687616184956, "learning_rate": 4.4102840756540355e-06, "loss": 0.4332, "step": 1397 }, { "epoch": 2.258481421647819, "grad_norm": 3.6046664386463125, "learning_rate": 4.409462683884109e-06, "loss": 0.3395, "step": 1398 }, { "epoch": 2.260096930533118, "grad_norm": 3.4334187193556254, "learning_rate": 4.408640797073851e-06, "loss": 0.3561, "step": 1399 }, { "epoch": 2.261712439418417, "grad_norm": 3.7006927473761024, "learning_rate": 4.4078184154363405e-06, "loss": 0.3826, "step": 1400 }, { "epoch": 2.261712439418417, "eval_loss": 1.1243202686309814, "eval_runtime": 2.3521, "eval_samples_per_second": 127.543, "eval_steps_per_second": 2.976, "step": 1400 }, { "epoch": 2.263327948303716, "grad_norm": 3.4371057647008527, "learning_rate": 4.406995539184785e-06, "loss": 0.381, "step": 1401 }, { "epoch": 2.2649434571890144, "grad_norm": 3.8891993895009005, "learning_rate": 4.4061721685325196e-06, "loss": 0.406, "step": 1402 }, { "epoch": 2.2665589660743133, "grad_norm": 3.510214037579037, "learning_rate": 4.405348303693009e-06, "loss": 0.4097, "step": 1403 }, { "epoch": 2.268174474959612, "grad_norm": 3.662688139119417, "learning_rate": 4.404523944879845e-06, "loss": 0.3688, "step": 1404 }, { "epoch": 2.269789983844911, "grad_norm": 4.227744203611783, "learning_rate": 4.403699092306749e-06, "loss": 0.4105, "step": 1405 }, { "epoch": 2.27140549273021, "grad_norm": 3.812848220615581, "learning_rate": 4.402873746187568e-06, "loss": 0.4323, "step": 1406 }, { "epoch": 2.273021001615509, "grad_norm": 3.583162311964632, "learning_rate": 4.4020479067362795e-06, "loss": 0.4162, "step": 1407 }, { "epoch": 2.274636510500808, "grad_norm": 3.456251023602101, "learning_rate": 4.401221574166986e-06, "loss": 0.3736, "step": 1408 }, { "epoch": 2.2762520193861064, "grad_norm": 3.7174482520697443, "learning_rate": 4.400394748693921e-06, "loss": 0.4016, "step": 1409 }, { "epoch": 2.2778675282714054, "grad_norm": 3.6116364781130006, "learning_rate": 4.399567430531444e-06, "loss": 0.3716, "step": 1410 }, { "epoch": 2.2794830371567043, "grad_norm": 3.4802314015010407, "learning_rate": 4.398739619894042e-06, "loss": 0.4348, "step": 1411 }, { "epoch": 2.2810985460420032, "grad_norm": 3.737084608189117, "learning_rate": 4.397911316996329e-06, "loss": 0.4094, "step": 1412 }, { "epoch": 2.282714054927302, "grad_norm": 3.0879921931207415, "learning_rate": 4.397082522053051e-06, "loss": 0.3039, "step": 1413 }, { "epoch": 2.284329563812601, "grad_norm": 3.1946730906395295, "learning_rate": 4.396253235279076e-06, "loss": 0.3563, "step": 1414 }, { "epoch": 2.2859450726979, "grad_norm": 3.9649633649336042, "learning_rate": 4.395423456889401e-06, "loss": 0.4626, "step": 1415 }, { "epoch": 2.2875605815831985, "grad_norm": 3.2802219089457916, "learning_rate": 4.394593187099154e-06, "loss": 0.3887, "step": 1416 }, { "epoch": 2.2891760904684975, "grad_norm": 3.6296642233218916, "learning_rate": 4.3937624261235855e-06, "loss": 0.3948, "step": 1417 }, { "epoch": 2.2907915993537964, "grad_norm": 3.4519124520087896, "learning_rate": 4.3929311741780765e-06, "loss": 0.3485, "step": 1418 }, { "epoch": 2.2924071082390953, "grad_norm": 3.5425401241047734, "learning_rate": 4.392099431478135e-06, "loss": 0.3631, "step": 1419 }, { "epoch": 2.2940226171243943, "grad_norm": 3.0573678206882797, "learning_rate": 4.391267198239394e-06, "loss": 0.2756, "step": 1420 }, { "epoch": 2.295638126009693, "grad_norm": 3.6369927682876093, "learning_rate": 4.390434474677616e-06, "loss": 0.3591, "step": 1421 }, { "epoch": 2.297253634894992, "grad_norm": 3.509620503137033, "learning_rate": 4.389601261008691e-06, "loss": 0.4516, "step": 1422 }, { "epoch": 2.2988691437802906, "grad_norm": 4.394826236311602, "learning_rate": 4.388767557448633e-06, "loss": 0.4172, "step": 1423 }, { "epoch": 2.3004846526655895, "grad_norm": 3.845829069902198, "learning_rate": 4.3879333642135856e-06, "loss": 0.4131, "step": 1424 }, { "epoch": 2.3021001615508885, "grad_norm": 3.144177289961547, "learning_rate": 4.38709868151982e-06, "loss": 0.358, "step": 1425 }, { "epoch": 2.3037156704361874, "grad_norm": 3.807254056016914, "learning_rate": 4.386263509583731e-06, "loss": 0.4239, "step": 1426 }, { "epoch": 2.3053311793214863, "grad_norm": 3.1617147950631166, "learning_rate": 4.385427848621844e-06, "loss": 0.3689, "step": 1427 }, { "epoch": 2.3069466882067853, "grad_norm": 3.2573430960090026, "learning_rate": 4.384591698850808e-06, "loss": 0.3376, "step": 1428 }, { "epoch": 2.308562197092084, "grad_norm": 3.6022834867270217, "learning_rate": 4.3837550604874e-06, "loss": 0.4156, "step": 1429 }, { "epoch": 2.3101777059773827, "grad_norm": 3.365037006308906, "learning_rate": 4.382917933748525e-06, "loss": 0.4101, "step": 1430 }, { "epoch": 2.3117932148626816, "grad_norm": 3.5484367692450514, "learning_rate": 4.382080318851214e-06, "loss": 0.3198, "step": 1431 }, { "epoch": 2.3134087237479806, "grad_norm": 3.277461481815215, "learning_rate": 4.381242216012622e-06, "loss": 0.3876, "step": 1432 }, { "epoch": 2.3150242326332795, "grad_norm": 3.3722911627218033, "learning_rate": 4.380403625450034e-06, "loss": 0.3467, "step": 1433 }, { "epoch": 2.3166397415185784, "grad_norm": 3.37450988289246, "learning_rate": 4.379564547380859e-06, "loss": 0.4018, "step": 1434 }, { "epoch": 2.3182552504038774, "grad_norm": 3.6391062611246827, "learning_rate": 4.378724982022633e-06, "loss": 0.4307, "step": 1435 }, { "epoch": 2.3198707592891763, "grad_norm": 3.3573052233875997, "learning_rate": 4.37788492959302e-06, "loss": 0.3865, "step": 1436 }, { "epoch": 2.321486268174475, "grad_norm": 3.4362578117526743, "learning_rate": 4.377044390309808e-06, "loss": 0.3588, "step": 1437 }, { "epoch": 2.3231017770597737, "grad_norm": 3.534202486332664, "learning_rate": 4.376203364390913e-06, "loss": 0.4816, "step": 1438 }, { "epoch": 2.3247172859450727, "grad_norm": 3.34777930163264, "learning_rate": 4.375361852054375e-06, "loss": 0.3648, "step": 1439 }, { "epoch": 2.3263327948303716, "grad_norm": 3.1192653543398747, "learning_rate": 4.374519853518362e-06, "loss": 0.3626, "step": 1440 }, { "epoch": 2.3279483037156705, "grad_norm": 3.0455192977150554, "learning_rate": 4.373677369001168e-06, "loss": 0.3232, "step": 1441 }, { "epoch": 2.3295638126009695, "grad_norm": 3.704499353624665, "learning_rate": 4.3728343987212125e-06, "loss": 0.4238, "step": 1442 }, { "epoch": 2.3311793214862684, "grad_norm": 3.5114745767137485, "learning_rate": 4.3719909428970394e-06, "loss": 0.3513, "step": 1443 }, { "epoch": 2.332794830371567, "grad_norm": 5.631901169983182, "learning_rate": 4.371147001747322e-06, "loss": 0.3892, "step": 1444 }, { "epoch": 2.334410339256866, "grad_norm": 3.8791402780116555, "learning_rate": 4.3703025754908565e-06, "loss": 0.4356, "step": 1445 }, { "epoch": 2.3360258481421647, "grad_norm": 3.01379178789745, "learning_rate": 4.369457664346565e-06, "loss": 0.3279, "step": 1446 }, { "epoch": 2.3376413570274637, "grad_norm": 4.250679564465684, "learning_rate": 4.368612268533498e-06, "loss": 0.3637, "step": 1447 }, { "epoch": 2.3392568659127626, "grad_norm": 3.6919405374700043, "learning_rate": 4.367766388270827e-06, "loss": 0.3729, "step": 1448 }, { "epoch": 2.3408723747980615, "grad_norm": 3.2742547271061437, "learning_rate": 4.366920023777854e-06, "loss": 0.3875, "step": 1449 }, { "epoch": 2.3424878836833605, "grad_norm": 2.7771554781082513, "learning_rate": 4.3660731752740035e-06, "loss": 0.3446, "step": 1450 }, { "epoch": 2.344103392568659, "grad_norm": 3.245100063073289, "learning_rate": 4.3652258429788275e-06, "loss": 0.3632, "step": 1451 }, { "epoch": 2.345718901453958, "grad_norm": 3.5002973545411282, "learning_rate": 4.364378027112001e-06, "loss": 0.3819, "step": 1452 }, { "epoch": 2.347334410339257, "grad_norm": 3.620440557189335, "learning_rate": 4.363529727893326e-06, "loss": 0.3904, "step": 1453 }, { "epoch": 2.3489499192245558, "grad_norm": 2.9305720271905162, "learning_rate": 4.362680945542729e-06, "loss": 0.3389, "step": 1454 }, { "epoch": 2.3505654281098547, "grad_norm": 3.245093398082542, "learning_rate": 4.361831680280263e-06, "loss": 0.3744, "step": 1455 }, { "epoch": 2.3521809369951536, "grad_norm": 3.3011560127087813, "learning_rate": 4.360981932326105e-06, "loss": 0.3419, "step": 1456 }, { "epoch": 2.3537964458804526, "grad_norm": 2.9896777922796236, "learning_rate": 4.3601317019005575e-06, "loss": 0.347, "step": 1457 }, { "epoch": 2.355411954765751, "grad_norm": 3.749804855020616, "learning_rate": 4.35928098922405e-06, "loss": 0.4003, "step": 1458 }, { "epoch": 2.35702746365105, "grad_norm": 3.1046522470362383, "learning_rate": 4.358429794517131e-06, "loss": 0.3197, "step": 1459 }, { "epoch": 2.358642972536349, "grad_norm": 3.7680091525601553, "learning_rate": 4.357578118000482e-06, "loss": 0.3391, "step": 1460 }, { "epoch": 2.360258481421648, "grad_norm": 3.4484105248789567, "learning_rate": 4.356725959894904e-06, "loss": 0.41, "step": 1461 }, { "epoch": 2.361873990306947, "grad_norm": 3.4537363356227027, "learning_rate": 4.355873320421325e-06, "loss": 0.3583, "step": 1462 }, { "epoch": 2.3634894991922457, "grad_norm": 3.6314594498696504, "learning_rate": 4.355020199800796e-06, "loss": 0.3905, "step": 1463 }, { "epoch": 2.3651050080775446, "grad_norm": 3.986949130008098, "learning_rate": 4.354166598254495e-06, "loss": 0.4219, "step": 1464 }, { "epoch": 2.366720516962843, "grad_norm": 3.352170945843849, "learning_rate": 4.353312516003724e-06, "loss": 0.3443, "step": 1465 }, { "epoch": 2.368336025848142, "grad_norm": 3.829077478378056, "learning_rate": 4.3524579532699076e-06, "loss": 0.4732, "step": 1466 }, { "epoch": 2.369951534733441, "grad_norm": 3.885959710190356, "learning_rate": 4.351602910274598e-06, "loss": 0.4126, "step": 1467 }, { "epoch": 2.37156704361874, "grad_norm": 3.597939448907965, "learning_rate": 4.35074738723947e-06, "loss": 0.3688, "step": 1468 }, { "epoch": 2.373182552504039, "grad_norm": 3.404993501274359, "learning_rate": 4.349891384386323e-06, "loss": 0.4214, "step": 1469 }, { "epoch": 2.374798061389338, "grad_norm": 3.640262221002177, "learning_rate": 4.349034901937082e-06, "loss": 0.5005, "step": 1470 }, { "epoch": 2.3764135702746367, "grad_norm": 3.021321340961074, "learning_rate": 4.3481779401137965e-06, "loss": 0.3791, "step": 1471 }, { "epoch": 2.378029079159935, "grad_norm": 4.195497516877711, "learning_rate": 4.347320499138636e-06, "loss": 0.5112, "step": 1472 }, { "epoch": 2.379644588045234, "grad_norm": 3.5559246110437783, "learning_rate": 4.346462579233901e-06, "loss": 0.4024, "step": 1473 }, { "epoch": 2.381260096930533, "grad_norm": 3.8941431159585584, "learning_rate": 4.345604180622011e-06, "loss": 0.3719, "step": 1474 }, { "epoch": 2.382875605815832, "grad_norm": 3.577924234058792, "learning_rate": 4.344745303525512e-06, "loss": 0.3706, "step": 1475 }, { "epoch": 2.384491114701131, "grad_norm": 3.899227919022932, "learning_rate": 4.343885948167072e-06, "loss": 0.4024, "step": 1476 }, { "epoch": 2.38610662358643, "grad_norm": 4.120798624139591, "learning_rate": 4.343026114769486e-06, "loss": 0.3799, "step": 1477 }, { "epoch": 2.387722132471729, "grad_norm": 3.6421162646412384, "learning_rate": 4.342165803555669e-06, "loss": 0.4009, "step": 1478 }, { "epoch": 2.3893376413570273, "grad_norm": 3.8422629308372636, "learning_rate": 4.341305014748664e-06, "loss": 0.4232, "step": 1479 }, { "epoch": 2.3909531502423262, "grad_norm": 3.4142674204895083, "learning_rate": 4.340443748571636e-06, "loss": 0.3517, "step": 1480 }, { "epoch": 2.392568659127625, "grad_norm": 3.5796795873431244, "learning_rate": 4.339582005247873e-06, "loss": 0.3498, "step": 1481 }, { "epoch": 2.394184168012924, "grad_norm": 3.1702092312272536, "learning_rate": 4.338719785000788e-06, "loss": 0.2984, "step": 1482 }, { "epoch": 2.395799676898223, "grad_norm": 3.8363542218885147, "learning_rate": 4.337857088053918e-06, "loss": 0.3775, "step": 1483 }, { "epoch": 2.397415185783522, "grad_norm": 3.4586275459496503, "learning_rate": 4.33699391463092e-06, "loss": 0.3742, "step": 1484 }, { "epoch": 2.399030694668821, "grad_norm": 2.9288941357314924, "learning_rate": 4.336130264955579e-06, "loss": 0.3495, "step": 1485 }, { "epoch": 2.4006462035541194, "grad_norm": 3.6287943618319973, "learning_rate": 4.335266139251801e-06, "loss": 0.3609, "step": 1486 }, { "epoch": 2.4022617124394183, "grad_norm": 3.505201474339452, "learning_rate": 4.3344015377436165e-06, "loss": 0.3525, "step": 1487 }, { "epoch": 2.4038772213247173, "grad_norm": 4.805259988929361, "learning_rate": 4.33353646065518e-06, "loss": 0.4319, "step": 1488 }, { "epoch": 2.405492730210016, "grad_norm": 3.7983910545647235, "learning_rate": 4.3326709082107655e-06, "loss": 0.4126, "step": 1489 }, { "epoch": 2.407108239095315, "grad_norm": 3.8079689333143074, "learning_rate": 4.3318048806347755e-06, "loss": 0.3631, "step": 1490 }, { "epoch": 2.408723747980614, "grad_norm": 3.6795748263464154, "learning_rate": 4.330938378151732e-06, "loss": 0.4263, "step": 1491 }, { "epoch": 2.410339256865913, "grad_norm": 3.5471886114075266, "learning_rate": 4.330071400986281e-06, "loss": 0.4267, "step": 1492 }, { "epoch": 2.4119547657512115, "grad_norm": 3.1109696007011007, "learning_rate": 4.329203949363193e-06, "loss": 0.3753, "step": 1493 }, { "epoch": 2.4135702746365104, "grad_norm": 3.830617647343668, "learning_rate": 4.328336023507359e-06, "loss": 0.3704, "step": 1494 }, { "epoch": 2.4151857835218093, "grad_norm": 3.749424544336995, "learning_rate": 4.3274676236437955e-06, "loss": 0.3719, "step": 1495 }, { "epoch": 2.4168012924071083, "grad_norm": 3.578405356302497, "learning_rate": 4.32659874999764e-06, "loss": 0.3832, "step": 1496 }, { "epoch": 2.418416801292407, "grad_norm": 3.3659079957610807, "learning_rate": 4.325729402794153e-06, "loss": 0.3887, "step": 1497 }, { "epoch": 2.420032310177706, "grad_norm": 3.556730179153217, "learning_rate": 4.32485958225872e-06, "loss": 0.3411, "step": 1498 }, { "epoch": 2.421647819063005, "grad_norm": 3.697755498927193, "learning_rate": 4.323989288616846e-06, "loss": 0.3976, "step": 1499 }, { "epoch": 2.4232633279483036, "grad_norm": 3.137790908149132, "learning_rate": 4.323118522094161e-06, "loss": 0.3439, "step": 1500 }, { "epoch": 2.4248788368336025, "grad_norm": 4.025871655985798, "learning_rate": 4.322247282916415e-06, "loss": 0.419, "step": 1501 }, { "epoch": 2.4264943457189014, "grad_norm": 3.204022590213666, "learning_rate": 4.321375571309485e-06, "loss": 0.407, "step": 1502 }, { "epoch": 2.4281098546042004, "grad_norm": 3.532996670469851, "learning_rate": 4.320503387499367e-06, "loss": 0.3661, "step": 1503 }, { "epoch": 2.4297253634894993, "grad_norm": 3.172379284231723, "learning_rate": 4.3196307317121776e-06, "loss": 0.319, "step": 1504 }, { "epoch": 2.4313408723747982, "grad_norm": 3.594308903987679, "learning_rate": 4.318757604174161e-06, "loss": 0.4321, "step": 1505 }, { "epoch": 2.432956381260097, "grad_norm": 3.9467140196067403, "learning_rate": 4.317884005111681e-06, "loss": 0.403, "step": 1506 }, { "epoch": 2.4345718901453957, "grad_norm": 3.6926310449410393, "learning_rate": 4.317009934751223e-06, "loss": 0.3871, "step": 1507 }, { "epoch": 2.4361873990306946, "grad_norm": 3.83917460292702, "learning_rate": 4.316135393319395e-06, "loss": 0.4116, "step": 1508 }, { "epoch": 2.4378029079159935, "grad_norm": 3.6056555078552517, "learning_rate": 4.315260381042929e-06, "loss": 0.3998, "step": 1509 }, { "epoch": 2.4394184168012925, "grad_norm": 3.6787894163477963, "learning_rate": 4.314384898148674e-06, "loss": 0.406, "step": 1510 }, { "epoch": 2.4410339256865914, "grad_norm": 4.251100163094634, "learning_rate": 4.313508944863609e-06, "loss": 0.399, "step": 1511 }, { "epoch": 2.4426494345718903, "grad_norm": 3.3214692143411733, "learning_rate": 4.312632521414828e-06, "loss": 0.3185, "step": 1512 }, { "epoch": 2.4442649434571893, "grad_norm": 3.879803943156938, "learning_rate": 4.311755628029549e-06, "loss": 0.447, "step": 1513 }, { "epoch": 2.4458804523424877, "grad_norm": 3.738487500942573, "learning_rate": 4.310878264935113e-06, "loss": 0.3535, "step": 1514 }, { "epoch": 2.4474959612277867, "grad_norm": 3.5431985864947104, "learning_rate": 4.310000432358982e-06, "loss": 0.468, "step": 1515 }, { "epoch": 2.4491114701130856, "grad_norm": 3.359114335768391, "learning_rate": 4.309122130528739e-06, "loss": 0.3224, "step": 1516 }, { "epoch": 2.4507269789983845, "grad_norm": 3.651354167235835, "learning_rate": 4.308243359672089e-06, "loss": 0.4096, "step": 1517 }, { "epoch": 2.4523424878836835, "grad_norm": 3.5349250511971277, "learning_rate": 4.307364120016862e-06, "loss": 0.3949, "step": 1518 }, { "epoch": 2.4539579967689824, "grad_norm": 3.7897902401492773, "learning_rate": 4.306484411791003e-06, "loss": 0.3597, "step": 1519 }, { "epoch": 2.4555735056542813, "grad_norm": 3.1841864298435802, "learning_rate": 4.305604235222583e-06, "loss": 0.4019, "step": 1520 }, { "epoch": 2.45718901453958, "grad_norm": 3.884284202670504, "learning_rate": 4.304723590539794e-06, "loss": 0.3934, "step": 1521 }, { "epoch": 2.4588045234248788, "grad_norm": 3.601423588865053, "learning_rate": 4.303842477970948e-06, "loss": 0.3263, "step": 1522 }, { "epoch": 2.4604200323101777, "grad_norm": 3.8698552305415714, "learning_rate": 4.3029608977444785e-06, "loss": 0.3888, "step": 1523 }, { "epoch": 2.4620355411954766, "grad_norm": 3.85754538196711, "learning_rate": 4.302078850088942e-06, "loss": 0.4176, "step": 1524 }, { "epoch": 2.4636510500807756, "grad_norm": 3.336671686329101, "learning_rate": 4.301196335233014e-06, "loss": 0.3685, "step": 1525 }, { "epoch": 2.4652665589660745, "grad_norm": 3.67351253514244, "learning_rate": 4.300313353405493e-06, "loss": 0.3867, "step": 1526 }, { "epoch": 2.4668820678513734, "grad_norm": 3.237437104271533, "learning_rate": 4.299429904835297e-06, "loss": 0.3438, "step": 1527 }, { "epoch": 2.468497576736672, "grad_norm": 3.9466926685945634, "learning_rate": 4.298545989751467e-06, "loss": 0.464, "step": 1528 }, { "epoch": 2.470113085621971, "grad_norm": 3.4870705935740154, "learning_rate": 4.297661608383161e-06, "loss": 0.3443, "step": 1529 }, { "epoch": 2.47172859450727, "grad_norm": 3.995231531092751, "learning_rate": 4.2967767609596626e-06, "loss": 0.4616, "step": 1530 }, { "epoch": 2.4733441033925687, "grad_norm": 3.254070804892749, "learning_rate": 4.295891447710373e-06, "loss": 0.3824, "step": 1531 }, { "epoch": 2.4749596122778676, "grad_norm": 3.4261597001416924, "learning_rate": 4.295005668864817e-06, "loss": 0.368, "step": 1532 }, { "epoch": 2.4765751211631666, "grad_norm": 3.5758506100363334, "learning_rate": 4.294119424652637e-06, "loss": 0.4263, "step": 1533 }, { "epoch": 2.4781906300484655, "grad_norm": 3.8896828982179716, "learning_rate": 4.293232715303596e-06, "loss": 0.4136, "step": 1534 }, { "epoch": 2.479806138933764, "grad_norm": 3.590342728177863, "learning_rate": 4.2923455410475815e-06, "loss": 0.4221, "step": 1535 }, { "epoch": 2.481421647819063, "grad_norm": 3.318002980762548, "learning_rate": 4.291457902114599e-06, "loss": 0.3895, "step": 1536 }, { "epoch": 2.483037156704362, "grad_norm": 3.8812363263270067, "learning_rate": 4.290569798734773e-06, "loss": 0.3609, "step": 1537 }, { "epoch": 2.484652665589661, "grad_norm": 3.4279691541075348, "learning_rate": 4.289681231138349e-06, "loss": 0.4196, "step": 1538 }, { "epoch": 2.4862681744749597, "grad_norm": 4.508880710203301, "learning_rate": 4.288792199555698e-06, "loss": 0.4542, "step": 1539 }, { "epoch": 2.4878836833602587, "grad_norm": 4.190216325848962, "learning_rate": 4.2879027042173035e-06, "loss": 0.4316, "step": 1540 }, { "epoch": 2.489499192245557, "grad_norm": 3.635108068383709, "learning_rate": 4.287012745353774e-06, "loss": 0.4107, "step": 1541 }, { "epoch": 2.491114701130856, "grad_norm": 3.2084537384545, "learning_rate": 4.286122323195837e-06, "loss": 0.3234, "step": 1542 }, { "epoch": 2.492730210016155, "grad_norm": 3.315334220671601, "learning_rate": 4.28523143797434e-06, "loss": 0.3566, "step": 1543 }, { "epoch": 2.494345718901454, "grad_norm": 3.7723948488837538, "learning_rate": 4.28434008992025e-06, "loss": 0.3634, "step": 1544 }, { "epoch": 2.495961227786753, "grad_norm": 3.8081635537818985, "learning_rate": 4.283448279264656e-06, "loss": 0.4483, "step": 1545 }, { "epoch": 2.497576736672052, "grad_norm": 3.0995413134869616, "learning_rate": 4.2825560062387635e-06, "loss": 0.4111, "step": 1546 }, { "epoch": 2.4991922455573508, "grad_norm": 2.954170432974333, "learning_rate": 4.281663271073903e-06, "loss": 0.4007, "step": 1547 }, { "epoch": 2.5008077544426497, "grad_norm": 3.8505182375833678, "learning_rate": 4.28077007400152e-06, "loss": 0.4581, "step": 1548 }, { "epoch": 2.502423263327948, "grad_norm": 4.5685716611143405, "learning_rate": 4.27987641525318e-06, "loss": 0.3976, "step": 1549 }, { "epoch": 2.504038772213247, "grad_norm": 4.625449656259532, "learning_rate": 4.278982295060573e-06, "loss": 0.4429, "step": 1550 }, { "epoch": 2.505654281098546, "grad_norm": 3.2991364930725897, "learning_rate": 4.278087713655502e-06, "loss": 0.4096, "step": 1551 }, { "epoch": 2.507269789983845, "grad_norm": 3.6755358312242583, "learning_rate": 4.277192671269895e-06, "loss": 0.3605, "step": 1552 }, { "epoch": 2.508885298869144, "grad_norm": 3.49010288330492, "learning_rate": 4.276297168135798e-06, "loss": 0.3769, "step": 1553 }, { "epoch": 2.5105008077544424, "grad_norm": 4.0030098093688125, "learning_rate": 4.275401204485374e-06, "loss": 0.4054, "step": 1554 }, { "epoch": 2.5121163166397418, "grad_norm": 4.072998086825831, "learning_rate": 4.274504780550908e-06, "loss": 0.4324, "step": 1555 }, { "epoch": 2.5137318255250403, "grad_norm": 3.200815780340767, "learning_rate": 4.273607896564804e-06, "loss": 0.3356, "step": 1556 }, { "epoch": 2.515347334410339, "grad_norm": 3.4117694610753646, "learning_rate": 4.272710552759584e-06, "loss": 0.4394, "step": 1557 }, { "epoch": 2.516962843295638, "grad_norm": 3.849803713766924, "learning_rate": 4.271812749367891e-06, "loss": 0.3972, "step": 1558 }, { "epoch": 2.518578352180937, "grad_norm": 3.10357578730498, "learning_rate": 4.2709144866224865e-06, "loss": 0.3026, "step": 1559 }, { "epoch": 2.520193861066236, "grad_norm": 3.37388619073066, "learning_rate": 4.2700157647562494e-06, "loss": 0.4004, "step": 1560 }, { "epoch": 2.5218093699515345, "grad_norm": 3.707338150164459, "learning_rate": 4.26911658400218e-06, "loss": 0.4276, "step": 1561 }, { "epoch": 2.523424878836834, "grad_norm": 3.7119554612126926, "learning_rate": 4.268216944593396e-06, "loss": 0.4483, "step": 1562 }, { "epoch": 2.5250403877221324, "grad_norm": 4.020586026104968, "learning_rate": 4.267316846763136e-06, "loss": 0.4423, "step": 1563 }, { "epoch": 2.5266558966074313, "grad_norm": 3.2641270082406635, "learning_rate": 4.2664162907447545e-06, "loss": 0.3887, "step": 1564 }, { "epoch": 2.52827140549273, "grad_norm": 3.530401790176257, "learning_rate": 4.265515276771728e-06, "loss": 0.3878, "step": 1565 }, { "epoch": 2.529886914378029, "grad_norm": 3.528602431895948, "learning_rate": 4.264613805077649e-06, "loss": 0.3608, "step": 1566 }, { "epoch": 2.531502423263328, "grad_norm": 3.626460348582335, "learning_rate": 4.263711875896229e-06, "loss": 0.4202, "step": 1567 }, { "epoch": 2.5331179321486266, "grad_norm": 3.5794751008447507, "learning_rate": 4.262809489461301e-06, "loss": 0.3797, "step": 1568 }, { "epoch": 2.534733441033926, "grad_norm": 3.618451090964425, "learning_rate": 4.261906646006812e-06, "loss": 0.4342, "step": 1569 }, { "epoch": 2.5363489499192244, "grad_norm": 3.4773763086920235, "learning_rate": 4.261003345766832e-06, "loss": 0.3589, "step": 1570 }, { "epoch": 2.5379644588045234, "grad_norm": 3.9138263901203905, "learning_rate": 4.2600995889755455e-06, "loss": 0.4365, "step": 1571 }, { "epoch": 2.5395799676898223, "grad_norm": 4.033238642270841, "learning_rate": 4.25919537586726e-06, "loss": 0.4326, "step": 1572 }, { "epoch": 2.5411954765751212, "grad_norm": 3.8536902747993116, "learning_rate": 4.258290706676394e-06, "loss": 0.4116, "step": 1573 }, { "epoch": 2.54281098546042, "grad_norm": 3.4600958959632355, "learning_rate": 4.257385581637493e-06, "loss": 0.4368, "step": 1574 }, { "epoch": 2.5444264943457187, "grad_norm": 3.800565398795318, "learning_rate": 4.256480000985215e-06, "loss": 0.3966, "step": 1575 }, { "epoch": 2.546042003231018, "grad_norm": 3.491339513876714, "learning_rate": 4.255573964954337e-06, "loss": 0.4122, "step": 1576 }, { "epoch": 2.5476575121163165, "grad_norm": 3.2926188934521274, "learning_rate": 4.254667473779754e-06, "loss": 0.3305, "step": 1577 }, { "epoch": 2.5492730210016155, "grad_norm": 3.478708194173451, "learning_rate": 4.253760527696482e-06, "loss": 0.3817, "step": 1578 }, { "epoch": 2.5508885298869144, "grad_norm": 3.9234884133009724, "learning_rate": 4.25285312693965e-06, "loss": 0.4013, "step": 1579 }, { "epoch": 2.5525040387722133, "grad_norm": 3.308874374724817, "learning_rate": 4.251945271744509e-06, "loss": 0.3607, "step": 1580 }, { "epoch": 2.5541195476575123, "grad_norm": 3.909371277936039, "learning_rate": 4.251036962346425e-06, "loss": 0.384, "step": 1581 }, { "epoch": 2.5557350565428107, "grad_norm": 3.943850144903356, "learning_rate": 4.250128198980885e-06, "loss": 0.5173, "step": 1582 }, { "epoch": 2.55735056542811, "grad_norm": 3.7679288481599755, "learning_rate": 4.2492189818834884e-06, "loss": 0.3715, "step": 1583 }, { "epoch": 2.5589660743134086, "grad_norm": 3.063093801413843, "learning_rate": 4.248309311289959e-06, "loss": 0.3755, "step": 1584 }, { "epoch": 2.5605815831987075, "grad_norm": 3.357299758150985, "learning_rate": 4.247399187436133e-06, "loss": 0.3942, "step": 1585 }, { "epoch": 2.5621970920840065, "grad_norm": 3.0913959177519534, "learning_rate": 4.246488610557965e-06, "loss": 0.3864, "step": 1586 }, { "epoch": 2.5638126009693054, "grad_norm": 3.363683395228247, "learning_rate": 4.245577580891529e-06, "loss": 0.3909, "step": 1587 }, { "epoch": 2.5654281098546043, "grad_norm": 3.758398243076015, "learning_rate": 4.244666098673016e-06, "loss": 0.3636, "step": 1588 }, { "epoch": 2.567043618739903, "grad_norm": 3.488664338432323, "learning_rate": 4.243754164138731e-06, "loss": 0.3783, "step": 1589 }, { "epoch": 2.568659127625202, "grad_norm": 3.4077425121921388, "learning_rate": 4.242841777525102e-06, "loss": 0.3186, "step": 1590 }, { "epoch": 2.5702746365105007, "grad_norm": 3.192666687513683, "learning_rate": 4.241928939068668e-06, "loss": 0.4016, "step": 1591 }, { "epoch": 2.5718901453957996, "grad_norm": 3.397292953979881, "learning_rate": 4.24101564900609e-06, "loss": 0.3624, "step": 1592 }, { "epoch": 2.5735056542810986, "grad_norm": 3.504573035788377, "learning_rate": 4.240101907574145e-06, "loss": 0.4498, "step": 1593 }, { "epoch": 2.5751211631663975, "grad_norm": 4.014662050250214, "learning_rate": 4.239187715009723e-06, "loss": 0.4626, "step": 1594 }, { "epoch": 2.5767366720516964, "grad_norm": 3.53566942330063, "learning_rate": 4.238273071549838e-06, "loss": 0.4393, "step": 1595 }, { "epoch": 2.578352180936995, "grad_norm": 4.033339886333814, "learning_rate": 4.237357977431614e-06, "loss": 0.3763, "step": 1596 }, { "epoch": 2.5799676898222943, "grad_norm": 3.831611477959194, "learning_rate": 4.236442432892297e-06, "loss": 0.3884, "step": 1597 }, { "epoch": 2.581583198707593, "grad_norm": 3.839686921419718, "learning_rate": 4.235526438169247e-06, "loss": 0.3858, "step": 1598 }, { "epoch": 2.5831987075928917, "grad_norm": 4.042780000678055, "learning_rate": 4.234609993499943e-06, "loss": 0.4415, "step": 1599 }, { "epoch": 2.5848142164781907, "grad_norm": 3.0325905927420806, "learning_rate": 4.233693099121976e-06, "loss": 0.3532, "step": 1600 }, { "epoch": 2.5848142164781907, "eval_loss": 1.1023402214050293, "eval_runtime": 2.3479, "eval_samples_per_second": 127.772, "eval_steps_per_second": 2.981, "step": 1600 }, { "epoch": 2.5864297253634896, "grad_norm": 3.5790841144036447, "learning_rate": 4.2327757552730595e-06, "loss": 0.3887, "step": 1601 }, { "epoch": 2.5880452342487885, "grad_norm": 3.557701251463041, "learning_rate": 4.231857962191019e-06, "loss": 0.4074, "step": 1602 }, { "epoch": 2.589660743134087, "grad_norm": 3.6612461688302456, "learning_rate": 4.2309397201137986e-06, "loss": 0.3727, "step": 1603 }, { "epoch": 2.5912762520193864, "grad_norm": 4.039515312490631, "learning_rate": 4.230021029279459e-06, "loss": 0.4176, "step": 1604 }, { "epoch": 2.592891760904685, "grad_norm": 3.1633086026856545, "learning_rate": 4.229101889926176e-06, "loss": 0.3078, "step": 1605 }, { "epoch": 2.594507269789984, "grad_norm": 4.7059999384216225, "learning_rate": 4.228182302292243e-06, "loss": 0.4297, "step": 1606 }, { "epoch": 2.5961227786752827, "grad_norm": 4.131686031330674, "learning_rate": 4.2272622666160685e-06, "loss": 0.422, "step": 1607 }, { "epoch": 2.5977382875605817, "grad_norm": 3.4035794628084686, "learning_rate": 4.226341783136177e-06, "loss": 0.3774, "step": 1608 }, { "epoch": 2.5993537964458806, "grad_norm": 4.144667845778581, "learning_rate": 4.22542085209121e-06, "loss": 0.3934, "step": 1609 }, { "epoch": 2.600969305331179, "grad_norm": 4.001029516764389, "learning_rate": 4.224499473719926e-06, "loss": 0.4012, "step": 1610 }, { "epoch": 2.602584814216478, "grad_norm": 4.095001460370882, "learning_rate": 4.223577648261197e-06, "loss": 0.3732, "step": 1611 }, { "epoch": 2.604200323101777, "grad_norm": 3.9667585274951347, "learning_rate": 4.222655375954011e-06, "loss": 0.3504, "step": 1612 }, { "epoch": 2.605815831987076, "grad_norm": 4.030753939317076, "learning_rate": 4.221732657037476e-06, "loss": 0.4552, "step": 1613 }, { "epoch": 2.607431340872375, "grad_norm": 3.3083589237469466, "learning_rate": 4.22080949175081e-06, "loss": 0.3584, "step": 1614 }, { "epoch": 2.6090468497576738, "grad_norm": 3.489270385074895, "learning_rate": 4.219885880333351e-06, "loss": 0.4107, "step": 1615 }, { "epoch": 2.6106623586429727, "grad_norm": 3.680764317174567, "learning_rate": 4.21896182302455e-06, "loss": 0.428, "step": 1616 }, { "epoch": 2.612277867528271, "grad_norm": 3.5340662625072334, "learning_rate": 4.2180373200639755e-06, "loss": 0.3648, "step": 1617 }, { "epoch": 2.61389337641357, "grad_norm": 3.427930409557275, "learning_rate": 4.217112371691311e-06, "loss": 0.3729, "step": 1618 }, { "epoch": 2.615508885298869, "grad_norm": 3.3462122389159563, "learning_rate": 4.216186978146354e-06, "loss": 0.4258, "step": 1619 }, { "epoch": 2.617124394184168, "grad_norm": 3.6634201092058474, "learning_rate": 4.21526113966902e-06, "loss": 0.4387, "step": 1620 }, { "epoch": 2.618739903069467, "grad_norm": 3.4281427483535394, "learning_rate": 4.2143348564993385e-06, "loss": 0.3489, "step": 1621 }, { "epoch": 2.620355411954766, "grad_norm": 4.293174260947583, "learning_rate": 4.213408128877453e-06, "loss": 0.3652, "step": 1622 }, { "epoch": 2.621970920840065, "grad_norm": 3.0683916535059295, "learning_rate": 4.212480957043626e-06, "loss": 0.3711, "step": 1623 }, { "epoch": 2.6235864297253633, "grad_norm": 3.4696485705026503, "learning_rate": 4.2115533412382285e-06, "loss": 0.4078, "step": 1624 }, { "epoch": 2.625201938610662, "grad_norm": 3.1222286142761333, "learning_rate": 4.210625281701755e-06, "loss": 0.3554, "step": 1625 }, { "epoch": 2.626817447495961, "grad_norm": 3.322899345199203, "learning_rate": 4.209696778674808e-06, "loss": 0.3317, "step": 1626 }, { "epoch": 2.62843295638126, "grad_norm": 3.4605600119878908, "learning_rate": 4.208767832398109e-06, "loss": 0.3353, "step": 1627 }, { "epoch": 2.630048465266559, "grad_norm": 3.8488476079835916, "learning_rate": 4.207838443112494e-06, "loss": 0.3567, "step": 1628 }, { "epoch": 2.631663974151858, "grad_norm": 3.162800178201966, "learning_rate": 4.2069086110589115e-06, "loss": 0.3116, "step": 1629 }, { "epoch": 2.633279483037157, "grad_norm": 3.518650670194589, "learning_rate": 4.205978336478427e-06, "loss": 0.389, "step": 1630 }, { "epoch": 2.6348949919224554, "grad_norm": 3.3757059356787424, "learning_rate": 4.205047619612221e-06, "loss": 0.3815, "step": 1631 }, { "epoch": 2.6365105008077543, "grad_norm": 3.757793932320141, "learning_rate": 4.2041164607015865e-06, "loss": 0.3691, "step": 1632 }, { "epoch": 2.638126009693053, "grad_norm": 4.471132371108438, "learning_rate": 4.203184859987932e-06, "loss": 0.4463, "step": 1633 }, { "epoch": 2.639741518578352, "grad_norm": 3.695355957514038, "learning_rate": 4.202252817712783e-06, "loss": 0.3714, "step": 1634 }, { "epoch": 2.641357027463651, "grad_norm": 3.6077217138189583, "learning_rate": 4.201320334117776e-06, "loss": 0.3448, "step": 1635 }, { "epoch": 2.64297253634895, "grad_norm": 3.5454843290040183, "learning_rate": 4.200387409444663e-06, "loss": 0.4058, "step": 1636 }, { "epoch": 2.644588045234249, "grad_norm": 3.97311104443553, "learning_rate": 4.199454043935311e-06, "loss": 0.3858, "step": 1637 }, { "epoch": 2.6462035541195474, "grad_norm": 3.8088586195478493, "learning_rate": 4.1985202378317e-06, "loss": 0.3813, "step": 1638 }, { "epoch": 2.6478190630048464, "grad_norm": 4.440801516903624, "learning_rate": 4.1975859913759276e-06, "loss": 0.4399, "step": 1639 }, { "epoch": 2.6494345718901453, "grad_norm": 3.6021453944513326, "learning_rate": 4.196651304810202e-06, "loss": 0.3906, "step": 1640 }, { "epoch": 2.6510500807754442, "grad_norm": 3.562686160208374, "learning_rate": 4.195716178376845e-06, "loss": 0.4224, "step": 1641 }, { "epoch": 2.652665589660743, "grad_norm": 3.414359930344821, "learning_rate": 4.194780612318297e-06, "loss": 0.3692, "step": 1642 }, { "epoch": 2.654281098546042, "grad_norm": 3.182112417911165, "learning_rate": 4.193844606877106e-06, "loss": 0.3362, "step": 1643 }, { "epoch": 2.655896607431341, "grad_norm": 3.6447380239488796, "learning_rate": 4.19290816229594e-06, "loss": 0.4184, "step": 1644 }, { "epoch": 2.6575121163166395, "grad_norm": 3.7497165369475303, "learning_rate": 4.1919712788175774e-06, "loss": 0.368, "step": 1645 }, { "epoch": 2.6591276252019385, "grad_norm": 3.5121541641342584, "learning_rate": 4.191033956684911e-06, "loss": 0.3571, "step": 1646 }, { "epoch": 2.6607431340872374, "grad_norm": 3.5452688193957274, "learning_rate": 4.190096196140947e-06, "loss": 0.3973, "step": 1647 }, { "epoch": 2.6623586429725363, "grad_norm": 3.254763830191771, "learning_rate": 4.1891579974288075e-06, "loss": 0.3477, "step": 1648 }, { "epoch": 2.6639741518578353, "grad_norm": 3.8228546764056635, "learning_rate": 4.188219360791726e-06, "loss": 0.431, "step": 1649 }, { "epoch": 2.665589660743134, "grad_norm": 3.2471396207202456, "learning_rate": 4.187280286473048e-06, "loss": 0.3624, "step": 1650 }, { "epoch": 2.667205169628433, "grad_norm": 3.6730701380107083, "learning_rate": 4.186340774716237e-06, "loss": 0.3976, "step": 1651 }, { "epoch": 2.6688206785137316, "grad_norm": 3.615536992027568, "learning_rate": 4.185400825764866e-06, "loss": 0.4558, "step": 1652 }, { "epoch": 2.6704361873990305, "grad_norm": 3.9954855421490856, "learning_rate": 4.184460439862623e-06, "loss": 0.4065, "step": 1653 }, { "epoch": 2.6720516962843295, "grad_norm": 3.420473208601952, "learning_rate": 4.183519617253309e-06, "loss": 0.3698, "step": 1654 }, { "epoch": 2.6736672051696284, "grad_norm": 3.7301601807711466, "learning_rate": 4.182578358180839e-06, "loss": 0.3899, "step": 1655 }, { "epoch": 2.6752827140549273, "grad_norm": 4.169271132307559, "learning_rate": 4.181636662889239e-06, "loss": 0.4267, "step": 1656 }, { "epoch": 2.6768982229402263, "grad_norm": 4.121976486599395, "learning_rate": 4.18069453162265e-06, "loss": 0.4499, "step": 1657 }, { "epoch": 2.678513731825525, "grad_norm": 3.170891666101397, "learning_rate": 4.179751964625326e-06, "loss": 0.3583, "step": 1658 }, { "epoch": 2.6801292407108237, "grad_norm": 3.920203731080901, "learning_rate": 4.178808962141634e-06, "loss": 0.4249, "step": 1659 }, { "epoch": 2.6817447495961226, "grad_norm": 3.963815203461091, "learning_rate": 4.177865524416052e-06, "loss": 0.4138, "step": 1660 }, { "epoch": 2.6833602584814216, "grad_norm": 3.4171673977831207, "learning_rate": 4.1769216516931724e-06, "loss": 0.3662, "step": 1661 }, { "epoch": 2.6849757673667205, "grad_norm": 3.6134514564271223, "learning_rate": 4.175977344217701e-06, "loss": 0.375, "step": 1662 }, { "epoch": 2.6865912762520194, "grad_norm": 3.2322530671775924, "learning_rate": 4.175032602234455e-06, "loss": 0.3784, "step": 1663 }, { "epoch": 2.6882067851373184, "grad_norm": 3.7740728324158006, "learning_rate": 4.1740874259883655e-06, "loss": 0.4006, "step": 1664 }, { "epoch": 2.6898222940226173, "grad_norm": 3.411766848702038, "learning_rate": 4.173141815724474e-06, "loss": 0.3926, "step": 1665 }, { "epoch": 2.691437802907916, "grad_norm": 4.411425787672013, "learning_rate": 4.172195771687937e-06, "loss": 0.4271, "step": 1666 }, { "epoch": 2.6930533117932147, "grad_norm": 4.023551222825741, "learning_rate": 4.171249294124022e-06, "loss": 0.3692, "step": 1667 }, { "epoch": 2.6946688206785137, "grad_norm": 5.094027367555343, "learning_rate": 4.170302383278111e-06, "loss": 0.3318, "step": 1668 }, { "epoch": 2.6962843295638126, "grad_norm": 3.231385980867205, "learning_rate": 4.1693550393956935e-06, "loss": 0.3634, "step": 1669 }, { "epoch": 2.6978998384491115, "grad_norm": 2.9260685834268325, "learning_rate": 4.168407262722377e-06, "loss": 0.3117, "step": 1670 }, { "epoch": 2.6995153473344105, "grad_norm": 4.180978738584895, "learning_rate": 4.167459053503878e-06, "loss": 0.4279, "step": 1671 }, { "epoch": 2.7011308562197094, "grad_norm": 3.37764256225911, "learning_rate": 4.166510411986025e-06, "loss": 0.3955, "step": 1672 }, { "epoch": 2.702746365105008, "grad_norm": 3.3046058812427876, "learning_rate": 4.1655613384147596e-06, "loss": 0.3959, "step": 1673 }, { "epoch": 2.704361873990307, "grad_norm": 3.666700097700293, "learning_rate": 4.164611833036136e-06, "loss": 0.3763, "step": 1674 }, { "epoch": 2.7059773828756057, "grad_norm": 3.890973384121412, "learning_rate": 4.163661896096319e-06, "loss": 0.3885, "step": 1675 }, { "epoch": 2.7075928917609047, "grad_norm": 3.248333357910107, "learning_rate": 4.162711527841586e-06, "loss": 0.3507, "step": 1676 }, { "epoch": 2.7092084006462036, "grad_norm": 3.804721419949619, "learning_rate": 4.161760728518324e-06, "loss": 0.3724, "step": 1677 }, { "epoch": 2.7108239095315025, "grad_norm": 4.39832247444967, "learning_rate": 4.160809498373037e-06, "loss": 0.427, "step": 1678 }, { "epoch": 2.7124394184168015, "grad_norm": 3.8626815938692847, "learning_rate": 4.159857837652336e-06, "loss": 0.3952, "step": 1679 }, { "epoch": 2.7140549273021, "grad_norm": 3.719400158294678, "learning_rate": 4.158905746602944e-06, "loss": 0.3758, "step": 1680 }, { "epoch": 2.715670436187399, "grad_norm": 3.477737479961911, "learning_rate": 4.157953225471699e-06, "loss": 0.3799, "step": 1681 }, { "epoch": 2.717285945072698, "grad_norm": 3.1618959021477213, "learning_rate": 4.157000274505546e-06, "loss": 0.3706, "step": 1682 }, { "epoch": 2.7189014539579968, "grad_norm": 3.410827521761044, "learning_rate": 4.156046893951544e-06, "loss": 0.38, "step": 1683 }, { "epoch": 2.7205169628432957, "grad_norm": 3.35016838945228, "learning_rate": 4.155093084056864e-06, "loss": 0.3554, "step": 1684 }, { "epoch": 2.7221324717285946, "grad_norm": 3.424066688353192, "learning_rate": 4.154138845068787e-06, "loss": 0.3596, "step": 1685 }, { "epoch": 2.7237479806138936, "grad_norm": 3.555062295582287, "learning_rate": 4.153184177234705e-06, "loss": 0.4046, "step": 1686 }, { "epoch": 2.725363489499192, "grad_norm": 3.416787305663776, "learning_rate": 4.152229080802122e-06, "loss": 0.3717, "step": 1687 }, { "epoch": 2.726978998384491, "grad_norm": 3.4802639531881545, "learning_rate": 4.151273556018651e-06, "loss": 0.3935, "step": 1688 }, { "epoch": 2.72859450726979, "grad_norm": 3.958703957759027, "learning_rate": 4.150317603132021e-06, "loss": 0.4218, "step": 1689 }, { "epoch": 2.730210016155089, "grad_norm": 3.5429578685633234, "learning_rate": 4.149361222390068e-06, "loss": 0.3851, "step": 1690 }, { "epoch": 2.731825525040388, "grad_norm": 3.585447300014098, "learning_rate": 4.148404414040739e-06, "loss": 0.3908, "step": 1691 }, { "epoch": 2.7334410339256867, "grad_norm": 3.5501472905600875, "learning_rate": 4.147447178332092e-06, "loss": 0.3759, "step": 1692 }, { "epoch": 2.7350565428109856, "grad_norm": 3.423081811511673, "learning_rate": 4.146489515512298e-06, "loss": 0.3839, "step": 1693 }, { "epoch": 2.736672051696284, "grad_norm": 3.06983827216265, "learning_rate": 4.145531425829636e-06, "loss": 0.3558, "step": 1694 }, { "epoch": 2.738287560581583, "grad_norm": 3.479550811246217, "learning_rate": 4.144572909532497e-06, "loss": 0.3593, "step": 1695 }, { "epoch": 2.739903069466882, "grad_norm": 3.575408002276295, "learning_rate": 4.143613966869384e-06, "loss": 0.3723, "step": 1696 }, { "epoch": 2.741518578352181, "grad_norm": 2.9922441340518793, "learning_rate": 4.142654598088908e-06, "loss": 0.3712, "step": 1697 }, { "epoch": 2.74313408723748, "grad_norm": 3.3915322572396778, "learning_rate": 4.14169480343979e-06, "loss": 0.323, "step": 1698 }, { "epoch": 2.744749596122779, "grad_norm": 3.6992784396597034, "learning_rate": 4.140734583170864e-06, "loss": 0.4127, "step": 1699 }, { "epoch": 2.7463651050080777, "grad_norm": 3.615566627628317, "learning_rate": 4.139773937531074e-06, "loss": 0.3745, "step": 1700 }, { "epoch": 2.7479806138933762, "grad_norm": 4.660831895517865, "learning_rate": 4.138812866769472e-06, "loss": 0.3513, "step": 1701 }, { "epoch": 2.749596122778675, "grad_norm": 3.3916359496353574, "learning_rate": 4.1378513711352226e-06, "loss": 0.3618, "step": 1702 }, { "epoch": 2.751211631663974, "grad_norm": 3.243804998167915, "learning_rate": 4.1368894508776e-06, "loss": 0.3732, "step": 1703 }, { "epoch": 2.752827140549273, "grad_norm": 3.7347351997053093, "learning_rate": 4.135927106245986e-06, "loss": 0.4281, "step": 1704 }, { "epoch": 2.754442649434572, "grad_norm": 3.3364801953189716, "learning_rate": 4.134964337489878e-06, "loss": 0.4177, "step": 1705 }, { "epoch": 2.756058158319871, "grad_norm": 4.157778893757109, "learning_rate": 4.1340011448588775e-06, "loss": 0.4311, "step": 1706 }, { "epoch": 2.75767366720517, "grad_norm": 3.1021013585393793, "learning_rate": 4.1330375286026985e-06, "loss": 0.3819, "step": 1707 }, { "epoch": 2.7592891760904683, "grad_norm": 3.270223302044196, "learning_rate": 4.132073488971166e-06, "loss": 0.4145, "step": 1708 }, { "epoch": 2.7609046849757672, "grad_norm": 4.523826128514779, "learning_rate": 4.131109026214212e-06, "loss": 0.3887, "step": 1709 }, { "epoch": 2.762520193861066, "grad_norm": 3.836486125746584, "learning_rate": 4.1301441405818795e-06, "loss": 0.4047, "step": 1710 }, { "epoch": 2.764135702746365, "grad_norm": 3.9536311432764144, "learning_rate": 4.129178832324322e-06, "loss": 0.4234, "step": 1711 }, { "epoch": 2.765751211631664, "grad_norm": 3.66527454601466, "learning_rate": 4.128213101691801e-06, "loss": 0.4002, "step": 1712 }, { "epoch": 2.767366720516963, "grad_norm": 3.1167914400063803, "learning_rate": 4.12724694893469e-06, "loss": 0.3337, "step": 1713 }, { "epoch": 2.768982229402262, "grad_norm": 4.075559187031591, "learning_rate": 4.126280374303469e-06, "loss": 0.3842, "step": 1714 }, { "epoch": 2.7705977382875604, "grad_norm": 4.095998463979417, "learning_rate": 4.125313378048728e-06, "loss": 0.4067, "step": 1715 }, { "epoch": 2.7722132471728593, "grad_norm": 3.7930961098882015, "learning_rate": 4.124345960421169e-06, "loss": 0.3815, "step": 1716 }, { "epoch": 2.7738287560581583, "grad_norm": 3.224008730711559, "learning_rate": 4.123378121671599e-06, "loss": 0.2859, "step": 1717 }, { "epoch": 2.775444264943457, "grad_norm": 3.3855942778176398, "learning_rate": 4.1224098620509364e-06, "loss": 0.3565, "step": 1718 }, { "epoch": 2.777059773828756, "grad_norm": 3.8698283975275785, "learning_rate": 4.121441181810211e-06, "loss": 0.3628, "step": 1719 }, { "epoch": 2.778675282714055, "grad_norm": 3.5268958673200492, "learning_rate": 4.120472081200556e-06, "loss": 0.4291, "step": 1720 }, { "epoch": 2.780290791599354, "grad_norm": 3.6269592145629383, "learning_rate": 4.11950256047322e-06, "loss": 0.4173, "step": 1721 }, { "epoch": 2.7819063004846525, "grad_norm": 4.077240808910235, "learning_rate": 4.118532619879555e-06, "loss": 0.4244, "step": 1722 }, { "epoch": 2.7835218093699514, "grad_norm": 3.844812552293849, "learning_rate": 4.117562259671026e-06, "loss": 0.3999, "step": 1723 }, { "epoch": 2.7851373182552503, "grad_norm": 3.987856997039492, "learning_rate": 4.116591480099205e-06, "loss": 0.4557, "step": 1724 }, { "epoch": 2.7867528271405493, "grad_norm": 4.061993107672451, "learning_rate": 4.115620281415771e-06, "loss": 0.5199, "step": 1725 }, { "epoch": 2.788368336025848, "grad_norm": 3.1248846165130995, "learning_rate": 4.114648663872514e-06, "loss": 0.3454, "step": 1726 }, { "epoch": 2.789983844911147, "grad_norm": 3.769558028242174, "learning_rate": 4.113676627721333e-06, "loss": 0.4249, "step": 1727 }, { "epoch": 2.791599353796446, "grad_norm": 3.096719473498619, "learning_rate": 4.112704173214234e-06, "loss": 0.3165, "step": 1728 }, { "epoch": 2.7932148626817446, "grad_norm": 3.3011890832245423, "learning_rate": 4.1117313006033335e-06, "loss": 0.3572, "step": 1729 }, { "epoch": 2.7948303715670435, "grad_norm": 3.6875556758224697, "learning_rate": 4.110758010140853e-06, "loss": 0.3838, "step": 1730 }, { "epoch": 2.7964458804523424, "grad_norm": 3.5352510856729564, "learning_rate": 4.109784302079124e-06, "loss": 0.4086, "step": 1731 }, { "epoch": 2.7980613893376414, "grad_norm": 3.418898679035385, "learning_rate": 4.1088101766705875e-06, "loss": 0.3677, "step": 1732 }, { "epoch": 2.7996768982229403, "grad_norm": 3.6129155047422086, "learning_rate": 4.1078356341677915e-06, "loss": 0.3814, "step": 1733 }, { "epoch": 2.8012924071082392, "grad_norm": 3.4390234145070604, "learning_rate": 4.106860674823392e-06, "loss": 0.3358, "step": 1734 }, { "epoch": 2.802907915993538, "grad_norm": 3.2638377059558867, "learning_rate": 4.105885298890154e-06, "loss": 0.3831, "step": 1735 }, { "epoch": 2.8045234248788367, "grad_norm": 3.56828430866315, "learning_rate": 4.1049095066209495e-06, "loss": 0.435, "step": 1736 }, { "epoch": 2.8061389337641356, "grad_norm": 3.1563756306548467, "learning_rate": 4.103933298268759e-06, "loss": 0.2961, "step": 1737 }, { "epoch": 2.8077544426494345, "grad_norm": 3.954338893711832, "learning_rate": 4.1029566740866715e-06, "loss": 0.3584, "step": 1738 }, { "epoch": 2.8093699515347335, "grad_norm": 3.872019055806136, "learning_rate": 4.101979634327881e-06, "loss": 0.3743, "step": 1739 }, { "epoch": 2.8109854604200324, "grad_norm": 3.4169671138264555, "learning_rate": 4.101002179245693e-06, "loss": 0.327, "step": 1740 }, { "epoch": 2.8126009693053313, "grad_norm": 3.5235985348621757, "learning_rate": 4.100024309093518e-06, "loss": 0.2902, "step": 1741 }, { "epoch": 2.8142164781906303, "grad_norm": 3.711084228299185, "learning_rate": 4.099046024124876e-06, "loss": 0.3427, "step": 1742 }, { "epoch": 2.8158319870759287, "grad_norm": 3.724174406131902, "learning_rate": 4.0980673245933926e-06, "loss": 0.3904, "step": 1743 }, { "epoch": 2.8174474959612277, "grad_norm": 3.5434589397403355, "learning_rate": 4.097088210752802e-06, "loss": 0.4104, "step": 1744 }, { "epoch": 2.8190630048465266, "grad_norm": 3.4772258950249952, "learning_rate": 4.096108682856946e-06, "loss": 0.3361, "step": 1745 }, { "epoch": 2.8206785137318255, "grad_norm": 3.485540682436376, "learning_rate": 4.095128741159775e-06, "loss": 0.3593, "step": 1746 }, { "epoch": 2.8222940226171245, "grad_norm": 3.354294831057483, "learning_rate": 4.094148385915342e-06, "loss": 0.3715, "step": 1747 }, { "epoch": 2.8239095315024234, "grad_norm": 3.372601369302996, "learning_rate": 4.093167617377812e-06, "loss": 0.3771, "step": 1748 }, { "epoch": 2.8255250403877223, "grad_norm": 2.7998098681739476, "learning_rate": 4.092186435801455e-06, "loss": 0.3234, "step": 1749 }, { "epoch": 2.827140549273021, "grad_norm": 3.3579019245740205, "learning_rate": 4.09120484144065e-06, "loss": 0.3952, "step": 1750 }, { "epoch": 2.8287560581583198, "grad_norm": 2.9652476792509574, "learning_rate": 4.09022283454988e-06, "loss": 0.2812, "step": 1751 }, { "epoch": 2.8303715670436187, "grad_norm": 3.538571440193516, "learning_rate": 4.089240415383738e-06, "loss": 0.3409, "step": 1752 }, { "epoch": 2.8319870759289176, "grad_norm": 3.6607101987016653, "learning_rate": 4.088257584196919e-06, "loss": 0.4459, "step": 1753 }, { "epoch": 2.8336025848142166, "grad_norm": 4.0064029240935115, "learning_rate": 4.0872743412442324e-06, "loss": 0.4574, "step": 1754 }, { "epoch": 2.8352180936995155, "grad_norm": 4.441288463282269, "learning_rate": 4.086290686780587e-06, "loss": 0.4464, "step": 1755 }, { "epoch": 2.8368336025848144, "grad_norm": 4.061294239306539, "learning_rate": 4.085306621061005e-06, "loss": 0.3761, "step": 1756 }, { "epoch": 2.838449111470113, "grad_norm": 3.9038641119051345, "learning_rate": 4.084322144340609e-06, "loss": 0.3392, "step": 1757 }, { "epoch": 2.840064620355412, "grad_norm": 3.5167147291879113, "learning_rate": 4.08333725687463e-06, "loss": 0.3859, "step": 1758 }, { "epoch": 2.841680129240711, "grad_norm": 3.5903647763759703, "learning_rate": 4.082351958918409e-06, "loss": 0.4374, "step": 1759 }, { "epoch": 2.8432956381260097, "grad_norm": 3.338990841276623, "learning_rate": 4.0813662507273885e-06, "loss": 0.3878, "step": 1760 }, { "epoch": 2.8449111470113086, "grad_norm": 3.3692471050144697, "learning_rate": 4.080380132557121e-06, "loss": 0.38, "step": 1761 }, { "epoch": 2.8465266558966076, "grad_norm": 3.3949930414182057, "learning_rate": 4.0793936046632644e-06, "loss": 0.3675, "step": 1762 }, { "epoch": 2.8481421647819065, "grad_norm": 3.7700824502216874, "learning_rate": 4.0784066673015805e-06, "loss": 0.3956, "step": 1763 }, { "epoch": 2.849757673667205, "grad_norm": 3.6318393637733575, "learning_rate": 4.077419320727941e-06, "loss": 0.4026, "step": 1764 }, { "epoch": 2.851373182552504, "grad_norm": 3.114960766392506, "learning_rate": 4.07643156519832e-06, "loss": 0.3655, "step": 1765 }, { "epoch": 2.852988691437803, "grad_norm": 3.643730787020456, "learning_rate": 4.0754434009688e-06, "loss": 0.3918, "step": 1766 }, { "epoch": 2.854604200323102, "grad_norm": 3.924256766019972, "learning_rate": 4.0744548282955685e-06, "loss": 0.3608, "step": 1767 }, { "epoch": 2.8562197092084007, "grad_norm": 3.7733479097225593, "learning_rate": 4.073465847434921e-06, "loss": 0.4102, "step": 1768 }, { "epoch": 2.8578352180936997, "grad_norm": 3.8665378707555114, "learning_rate": 4.072476458643256e-06, "loss": 0.3777, "step": 1769 }, { "epoch": 2.8594507269789986, "grad_norm": 3.8111236941732627, "learning_rate": 4.071486662177077e-06, "loss": 0.3618, "step": 1770 }, { "epoch": 2.861066235864297, "grad_norm": 3.018947239727078, "learning_rate": 4.070496458292999e-06, "loss": 0.3163, "step": 1771 }, { "epoch": 2.862681744749596, "grad_norm": 3.310577224224053, "learning_rate": 4.0695058472477354e-06, "loss": 0.3885, "step": 1772 }, { "epoch": 2.864297253634895, "grad_norm": 3.58211197235379, "learning_rate": 4.0685148292981094e-06, "loss": 0.3508, "step": 1773 }, { "epoch": 2.865912762520194, "grad_norm": 3.6084464320331158, "learning_rate": 4.067523404701048e-06, "loss": 0.3886, "step": 1774 }, { "epoch": 2.867528271405493, "grad_norm": 4.04283167107187, "learning_rate": 4.0665315737135855e-06, "loss": 0.3877, "step": 1775 }, { "epoch": 2.8691437802907918, "grad_norm": 3.7595363783698463, "learning_rate": 4.06553933659286e-06, "loss": 0.4868, "step": 1776 }, { "epoch": 2.8707592891760907, "grad_norm": 3.680215068600754, "learning_rate": 4.064546693596115e-06, "loss": 0.3657, "step": 1777 }, { "epoch": 2.872374798061389, "grad_norm": 3.627675695067229, "learning_rate": 4.0635536449807e-06, "loss": 0.4273, "step": 1778 }, { "epoch": 2.873990306946688, "grad_norm": 3.6660192218301577, "learning_rate": 4.062560191004068e-06, "loss": 0.4424, "step": 1779 }, { "epoch": 2.875605815831987, "grad_norm": 3.597795566533819, "learning_rate": 4.061566331923779e-06, "loss": 0.3874, "step": 1780 }, { "epoch": 2.877221324717286, "grad_norm": 3.40185743835083, "learning_rate": 4.060572067997499e-06, "loss": 0.3454, "step": 1781 }, { "epoch": 2.878836833602585, "grad_norm": 3.4984513940893045, "learning_rate": 4.059577399482994e-06, "loss": 0.3605, "step": 1782 }, { "epoch": 2.880452342487884, "grad_norm": 3.13850706670711, "learning_rate": 4.058582326638141e-06, "loss": 0.2989, "step": 1783 }, { "epoch": 2.8820678513731828, "grad_norm": 4.0813989384676015, "learning_rate": 4.057586849720917e-06, "loss": 0.341, "step": 1784 }, { "epoch": 2.8836833602584813, "grad_norm": 3.5618012838417075, "learning_rate": 4.056590968989406e-06, "loss": 0.4096, "step": 1785 }, { "epoch": 2.88529886914378, "grad_norm": 3.4330268417431187, "learning_rate": 4.055594684701798e-06, "loss": 0.3464, "step": 1786 }, { "epoch": 2.886914378029079, "grad_norm": 3.537230438181447, "learning_rate": 4.054597997116384e-06, "loss": 0.3708, "step": 1787 }, { "epoch": 2.888529886914378, "grad_norm": 3.3126213433783094, "learning_rate": 4.053600906491561e-06, "loss": 0.414, "step": 1788 }, { "epoch": 2.890145395799677, "grad_norm": 3.36757032177516, "learning_rate": 4.052603413085835e-06, "loss": 0.3419, "step": 1789 }, { "epoch": 2.891760904684976, "grad_norm": 3.821299737243405, "learning_rate": 4.05160551715781e-06, "loss": 0.4125, "step": 1790 }, { "epoch": 2.893376413570275, "grad_norm": 3.3901147882701106, "learning_rate": 4.050607218966195e-06, "loss": 0.3937, "step": 1791 }, { "epoch": 2.8949919224555734, "grad_norm": 3.7591822301571387, "learning_rate": 4.049608518769808e-06, "loss": 0.4366, "step": 1792 }, { "epoch": 2.8966074313408723, "grad_norm": 3.3987860933025837, "learning_rate": 4.048609416827567e-06, "loss": 0.3642, "step": 1793 }, { "epoch": 2.898222940226171, "grad_norm": 3.831551351036389, "learning_rate": 4.047609913398497e-06, "loss": 0.3953, "step": 1794 }, { "epoch": 2.89983844911147, "grad_norm": 4.019359175511672, "learning_rate": 4.046610008741723e-06, "loss": 0.402, "step": 1795 }, { "epoch": 2.901453957996769, "grad_norm": 3.126004467302887, "learning_rate": 4.045609703116479e-06, "loss": 0.3749, "step": 1796 }, { "epoch": 2.903069466882068, "grad_norm": 3.8610285146812737, "learning_rate": 4.0446089967821005e-06, "loss": 0.3725, "step": 1797 }, { "epoch": 2.904684975767367, "grad_norm": 4.584281604411092, "learning_rate": 4.043607889998025e-06, "loss": 0.4392, "step": 1798 }, { "epoch": 2.9063004846526654, "grad_norm": 3.2739218538767814, "learning_rate": 4.042606383023797e-06, "loss": 0.373, "step": 1799 }, { "epoch": 2.9079159935379644, "grad_norm": 3.5659539589235156, "learning_rate": 4.041604476119064e-06, "loss": 0.3278, "step": 1800 }, { "epoch": 2.9079159935379644, "eval_loss": 1.1062554121017456, "eval_runtime": 2.349, "eval_samples_per_second": 127.713, "eval_steps_per_second": 2.98, "step": 1800 }, { "epoch": 2.9095315024232633, "grad_norm": 4.316726635419034, "learning_rate": 4.040602169543576e-06, "loss": 0.3692, "step": 1801 }, { "epoch": 2.9111470113085622, "grad_norm": 3.513809671757262, "learning_rate": 4.039599463557188e-06, "loss": 0.3382, "step": 1802 }, { "epoch": 2.912762520193861, "grad_norm": 3.878447876870299, "learning_rate": 4.0385963584198584e-06, "loss": 0.3297, "step": 1803 }, { "epoch": 2.9143780290791597, "grad_norm": 3.3860729321758867, "learning_rate": 4.037592854391647e-06, "loss": 0.3841, "step": 1804 }, { "epoch": 2.915993537964459, "grad_norm": 3.9509536128883185, "learning_rate": 4.036588951732718e-06, "loss": 0.4202, "step": 1805 }, { "epoch": 2.9176090468497575, "grad_norm": 3.5423850527828806, "learning_rate": 4.035584650703342e-06, "loss": 0.4024, "step": 1806 }, { "epoch": 2.9192245557350565, "grad_norm": 3.3717146537662024, "learning_rate": 4.034579951563889e-06, "loss": 0.4003, "step": 1807 }, { "epoch": 2.9208400646203554, "grad_norm": 4.422084627786211, "learning_rate": 4.033574854574836e-06, "loss": 0.4241, "step": 1808 }, { "epoch": 2.9224555735056543, "grad_norm": 3.7900867073883955, "learning_rate": 4.032569359996757e-06, "loss": 0.4585, "step": 1809 }, { "epoch": 2.9240710823909533, "grad_norm": 3.4218119583049966, "learning_rate": 4.031563468090334e-06, "loss": 0.4211, "step": 1810 }, { "epoch": 2.9256865912762517, "grad_norm": 3.5322406328178144, "learning_rate": 4.030557179116351e-06, "loss": 0.4413, "step": 1811 }, { "epoch": 2.927302100161551, "grad_norm": 3.3402568718551677, "learning_rate": 4.029550493335696e-06, "loss": 0.3986, "step": 1812 }, { "epoch": 2.9289176090468496, "grad_norm": 3.191016240617844, "learning_rate": 4.0285434110093585e-06, "loss": 0.3529, "step": 1813 }, { "epoch": 2.9305331179321485, "grad_norm": 3.7094300011469734, "learning_rate": 4.02753593239843e-06, "loss": 0.3883, "step": 1814 }, { "epoch": 2.9321486268174475, "grad_norm": 3.935979249863305, "learning_rate": 4.026528057764107e-06, "loss": 0.4506, "step": 1815 }, { "epoch": 2.9337641357027464, "grad_norm": 3.3748423000939862, "learning_rate": 4.025519787367685e-06, "loss": 0.3288, "step": 1816 }, { "epoch": 2.9353796445880453, "grad_norm": 3.475833337256282, "learning_rate": 4.024511121470567e-06, "loss": 0.3834, "step": 1817 }, { "epoch": 2.936995153473344, "grad_norm": 3.8994747086800805, "learning_rate": 4.023502060334254e-06, "loss": 0.4147, "step": 1818 }, { "epoch": 2.938610662358643, "grad_norm": 3.48307052309643, "learning_rate": 4.022492604220354e-06, "loss": 0.3744, "step": 1819 }, { "epoch": 2.9402261712439417, "grad_norm": 3.205171821850966, "learning_rate": 4.021482753390573e-06, "loss": 0.3463, "step": 1820 }, { "epoch": 2.9418416801292406, "grad_norm": 3.8931777413283566, "learning_rate": 4.020472508106722e-06, "loss": 0.3965, "step": 1821 }, { "epoch": 2.9434571890145396, "grad_norm": 3.7875034225636464, "learning_rate": 4.019461868630713e-06, "loss": 0.4223, "step": 1822 }, { "epoch": 2.9450726978998385, "grad_norm": 3.447764752624036, "learning_rate": 4.0184508352245605e-06, "loss": 0.3551, "step": 1823 }, { "epoch": 2.9466882067851374, "grad_norm": 4.145195802077717, "learning_rate": 4.017439408150382e-06, "loss": 0.4357, "step": 1824 }, { "epoch": 2.948303715670436, "grad_norm": 3.787686634854691, "learning_rate": 4.0164275876703965e-06, "loss": 0.3315, "step": 1825 }, { "epoch": 2.9499192245557353, "grad_norm": 3.2036414198518255, "learning_rate": 4.0154153740469246e-06, "loss": 0.323, "step": 1826 }, { "epoch": 2.951534733441034, "grad_norm": 4.37100207787249, "learning_rate": 4.0144027675423895e-06, "loss": 0.4219, "step": 1827 }, { "epoch": 2.9531502423263327, "grad_norm": 3.8591464277731733, "learning_rate": 4.013389768419315e-06, "loss": 0.3673, "step": 1828 }, { "epoch": 2.9547657512116317, "grad_norm": 3.9750395624850907, "learning_rate": 4.012376376940328e-06, "loss": 0.4196, "step": 1829 }, { "epoch": 2.9563812600969306, "grad_norm": 3.648357433078913, "learning_rate": 4.0113625933681565e-06, "loss": 0.4001, "step": 1830 }, { "epoch": 2.9579967689822295, "grad_norm": 4.0958726832275945, "learning_rate": 4.01034841796563e-06, "loss": 0.4174, "step": 1831 }, { "epoch": 2.959612277867528, "grad_norm": 3.395218186364348, "learning_rate": 4.009333850995682e-06, "loss": 0.4079, "step": 1832 }, { "epoch": 2.9612277867528274, "grad_norm": 3.4357682449827105, "learning_rate": 4.008318892721343e-06, "loss": 0.323, "step": 1833 }, { "epoch": 2.962843295638126, "grad_norm": 3.2844020562717775, "learning_rate": 4.007303543405749e-06, "loss": 0.368, "step": 1834 }, { "epoch": 2.964458804523425, "grad_norm": 3.9815953151605163, "learning_rate": 4.006287803312134e-06, "loss": 0.35, "step": 1835 }, { "epoch": 2.9660743134087237, "grad_norm": 3.614865700307112, "learning_rate": 4.005271672703836e-06, "loss": 0.4129, "step": 1836 }, { "epoch": 2.9676898222940227, "grad_norm": 3.427751191545116, "learning_rate": 4.004255151844294e-06, "loss": 0.3462, "step": 1837 }, { "epoch": 2.9693053311793216, "grad_norm": 3.855827926191369, "learning_rate": 4.0032382409970474e-06, "loss": 0.3939, "step": 1838 }, { "epoch": 2.97092084006462, "grad_norm": 3.794069460270105, "learning_rate": 4.002220940425735e-06, "loss": 0.4062, "step": 1839 }, { "epoch": 2.9725363489499195, "grad_norm": 3.3516827418116346, "learning_rate": 4.001203250394101e-06, "loss": 0.3608, "step": 1840 }, { "epoch": 2.974151857835218, "grad_norm": 3.3949959080604484, "learning_rate": 4.000185171165987e-06, "loss": 0.3521, "step": 1841 }, { "epoch": 2.975767366720517, "grad_norm": 3.84055656933237, "learning_rate": 3.999166703005335e-06, "loss": 0.3962, "step": 1842 }, { "epoch": 2.977382875605816, "grad_norm": 3.336575231112788, "learning_rate": 3.998147846176191e-06, "loss": 0.3775, "step": 1843 }, { "epoch": 2.9789983844911148, "grad_norm": 3.5766840605823633, "learning_rate": 3.9971286009427e-06, "loss": 0.3994, "step": 1844 }, { "epoch": 2.9806138933764137, "grad_norm": 3.6960096947461833, "learning_rate": 3.996108967569107e-06, "loss": 0.3741, "step": 1845 }, { "epoch": 2.982229402261712, "grad_norm": 3.402741325700353, "learning_rate": 3.99508894631976e-06, "loss": 0.3711, "step": 1846 }, { "epoch": 2.9838449111470116, "grad_norm": 3.807167479481642, "learning_rate": 3.994068537459104e-06, "loss": 0.4316, "step": 1847 }, { "epoch": 2.98546042003231, "grad_norm": 4.070291804779621, "learning_rate": 3.993047741251687e-06, "loss": 0.3629, "step": 1848 }, { "epoch": 2.987075928917609, "grad_norm": 3.879431538685708, "learning_rate": 3.992026557962159e-06, "loss": 0.4393, "step": 1849 }, { "epoch": 2.988691437802908, "grad_norm": 3.661116198083276, "learning_rate": 3.991004987855264e-06, "loss": 0.3931, "step": 1850 }, { "epoch": 2.990306946688207, "grad_norm": 3.389323038237829, "learning_rate": 3.989983031195855e-06, "loss": 0.3504, "step": 1851 }, { "epoch": 2.991922455573506, "grad_norm": 3.675253216739536, "learning_rate": 3.988960688248879e-06, "loss": 0.399, "step": 1852 }, { "epoch": 2.9935379644588043, "grad_norm": 3.297781008335146, "learning_rate": 3.987937959279384e-06, "loss": 0.3161, "step": 1853 }, { "epoch": 2.9951534733441036, "grad_norm": 3.6213325724727428, "learning_rate": 3.98691484455252e-06, "loss": 0.4558, "step": 1854 }, { "epoch": 2.996768982229402, "grad_norm": 3.754807516504538, "learning_rate": 3.9858913443335355e-06, "loss": 0.4351, "step": 1855 }, { "epoch": 2.998384491114701, "grad_norm": 3.1200636041419623, "learning_rate": 3.98486745888778e-06, "loss": 0.3681, "step": 1856 }, { "epoch": 3.0, "grad_norm": 3.25467814752527, "learning_rate": 3.9838431884807025e-06, "loss": 0.3862, "step": 1857 }, { "epoch": 3.001615508885299, "grad_norm": 4.022187442083879, "learning_rate": 3.98281853337785e-06, "loss": 0.2481, "step": 1858 }, { "epoch": 3.003231017770598, "grad_norm": 3.4010251992468414, "learning_rate": 3.981793493844873e-06, "loss": 0.1973, "step": 1859 }, { "epoch": 3.004846526655897, "grad_norm": 3.212611772972496, "learning_rate": 3.980768070147518e-06, "loss": 0.1761, "step": 1860 }, { "epoch": 3.0064620355411953, "grad_norm": 2.8991375403537893, "learning_rate": 3.979742262551632e-06, "loss": 0.1338, "step": 1861 }, { "epoch": 3.008077544426494, "grad_norm": 3.6816461152128506, "learning_rate": 3.978716071323164e-06, "loss": 0.1705, "step": 1862 }, { "epoch": 3.009693053311793, "grad_norm": 4.127221088863613, "learning_rate": 3.977689496728161e-06, "loss": 0.1734, "step": 1863 }, { "epoch": 3.011308562197092, "grad_norm": 4.167211140701715, "learning_rate": 3.976662539032766e-06, "loss": 0.1839, "step": 1864 }, { "epoch": 3.012924071082391, "grad_norm": 4.9867061208377965, "learning_rate": 3.975635198503227e-06, "loss": 0.1548, "step": 1865 }, { "epoch": 3.01453957996769, "grad_norm": 4.2907913012740995, "learning_rate": 3.974607475405888e-06, "loss": 0.1662, "step": 1866 }, { "epoch": 3.016155088852989, "grad_norm": 3.529709558247358, "learning_rate": 3.973579370007191e-06, "loss": 0.1672, "step": 1867 }, { "epoch": 3.0177705977382874, "grad_norm": 3.7627008423854384, "learning_rate": 3.972550882573681e-06, "loss": 0.1745, "step": 1868 }, { "epoch": 3.0193861066235863, "grad_norm": 3.5733957150906814, "learning_rate": 3.971522013372e-06, "loss": 0.2164, "step": 1869 }, { "epoch": 3.0210016155088852, "grad_norm": 3.070872699165485, "learning_rate": 3.970492762668887e-06, "loss": 0.1724, "step": 1870 }, { "epoch": 3.022617124394184, "grad_norm": 3.8406046868947685, "learning_rate": 3.969463130731183e-06, "loss": 0.2044, "step": 1871 }, { "epoch": 3.024232633279483, "grad_norm": 3.059465145086743, "learning_rate": 3.968433117825826e-06, "loss": 0.162, "step": 1872 }, { "epoch": 3.025848142164782, "grad_norm": 3.2111284479942355, "learning_rate": 3.967402724219855e-06, "loss": 0.1602, "step": 1873 }, { "epoch": 3.027463651050081, "grad_norm": 3.343811044104482, "learning_rate": 3.966371950180404e-06, "loss": 0.1993, "step": 1874 }, { "epoch": 3.0290791599353795, "grad_norm": 2.5218061325040844, "learning_rate": 3.9653407959747094e-06, "loss": 0.1452, "step": 1875 }, { "epoch": 3.0306946688206784, "grad_norm": 3.09750841465876, "learning_rate": 3.964309261870103e-06, "loss": 0.1759, "step": 1876 }, { "epoch": 3.0323101777059773, "grad_norm": 2.8856367436780572, "learning_rate": 3.963277348134018e-06, "loss": 0.1415, "step": 1877 }, { "epoch": 3.0339256865912763, "grad_norm": 3.24134396574376, "learning_rate": 3.9622450550339825e-06, "loss": 0.155, "step": 1878 }, { "epoch": 3.035541195476575, "grad_norm": 3.139873717036236, "learning_rate": 3.961212382837627e-06, "loss": 0.1647, "step": 1879 }, { "epoch": 3.037156704361874, "grad_norm": 3.8798168905201083, "learning_rate": 3.960179331812678e-06, "loss": 0.2238, "step": 1880 }, { "epoch": 3.038772213247173, "grad_norm": 3.602695481395378, "learning_rate": 3.9591459022269606e-06, "loss": 0.1558, "step": 1881 }, { "epoch": 3.0403877221324715, "grad_norm": 2.7090219785990133, "learning_rate": 3.958112094348398e-06, "loss": 0.1385, "step": 1882 }, { "epoch": 3.0420032310177705, "grad_norm": 3.4618449600963697, "learning_rate": 3.95707790844501e-06, "loss": 0.167, "step": 1883 }, { "epoch": 3.0436187399030694, "grad_norm": 3.755449069909825, "learning_rate": 3.956043344784917e-06, "loss": 0.1555, "step": 1884 }, { "epoch": 3.0452342487883683, "grad_norm": 2.9534062227678595, "learning_rate": 3.955008403636336e-06, "loss": 0.1486, "step": 1885 }, { "epoch": 3.0468497576736673, "grad_norm": 3.302005458424012, "learning_rate": 3.953973085267584e-06, "loss": 0.1591, "step": 1886 }, { "epoch": 3.048465266558966, "grad_norm": 3.3003438437255825, "learning_rate": 3.95293738994707e-06, "loss": 0.1435, "step": 1887 }, { "epoch": 3.050080775444265, "grad_norm": 3.2145165493953742, "learning_rate": 3.951901317943308e-06, "loss": 0.1341, "step": 1888 }, { "epoch": 3.0516962843295636, "grad_norm": 3.0851696903024655, "learning_rate": 3.9508648695249056e-06, "loss": 0.1893, "step": 1889 }, { "epoch": 3.0533117932148626, "grad_norm": 3.0335309910653265, "learning_rate": 3.949828044960566e-06, "loss": 0.159, "step": 1890 }, { "epoch": 3.0549273021001615, "grad_norm": 3.7776749694766347, "learning_rate": 3.948790844519096e-06, "loss": 0.2094, "step": 1891 }, { "epoch": 3.0565428109854604, "grad_norm": 3.9374353856357094, "learning_rate": 3.947753268469395e-06, "loss": 0.1905, "step": 1892 }, { "epoch": 3.0581583198707594, "grad_norm": 3.397489731585982, "learning_rate": 3.946715317080461e-06, "loss": 0.1764, "step": 1893 }, { "epoch": 3.0597738287560583, "grad_norm": 3.484065311042714, "learning_rate": 3.945676990621388e-06, "loss": 0.1581, "step": 1894 }, { "epoch": 3.0613893376413572, "grad_norm": 3.243942157190105, "learning_rate": 3.944638289361374e-06, "loss": 0.1345, "step": 1895 }, { "epoch": 3.0630048465266557, "grad_norm": 3.5944645170070175, "learning_rate": 3.943599213569702e-06, "loss": 0.168, "step": 1896 }, { "epoch": 3.0646203554119547, "grad_norm": 3.5974047161695277, "learning_rate": 3.9425597635157634e-06, "loss": 0.1794, "step": 1897 }, { "epoch": 3.0662358642972536, "grad_norm": 3.4755839951969625, "learning_rate": 3.941519939469041e-06, "loss": 0.1644, "step": 1898 }, { "epoch": 3.0678513731825525, "grad_norm": 3.3622523472807817, "learning_rate": 3.9404797416991155e-06, "loss": 0.1838, "step": 1899 }, { "epoch": 3.0694668820678515, "grad_norm": 3.880707020085532, "learning_rate": 3.939439170475666e-06, "loss": 0.2173, "step": 1900 }, { "epoch": 3.0710823909531504, "grad_norm": 4.381868520363921, "learning_rate": 3.938398226068466e-06, "loss": 0.1755, "step": 1901 }, { "epoch": 3.0726978998384493, "grad_norm": 3.083005231793832, "learning_rate": 3.937356908747387e-06, "loss": 0.1819, "step": 1902 }, { "epoch": 3.074313408723748, "grad_norm": 3.3453102165913036, "learning_rate": 3.936315218782398e-06, "loss": 0.1482, "step": 1903 }, { "epoch": 3.0759289176090467, "grad_norm": 3.041129979550403, "learning_rate": 3.935273156443563e-06, "loss": 0.1416, "step": 1904 }, { "epoch": 3.0775444264943457, "grad_norm": 3.139467454349726, "learning_rate": 3.934230722001043e-06, "loss": 0.1299, "step": 1905 }, { "epoch": 3.0791599353796446, "grad_norm": 2.5630365346312574, "learning_rate": 3.933187915725098e-06, "loss": 0.1465, "step": 1906 }, { "epoch": 3.0807754442649435, "grad_norm": 3.6436086979649134, "learning_rate": 3.932144737886079e-06, "loss": 0.1998, "step": 1907 }, { "epoch": 3.0823909531502425, "grad_norm": 3.4935751730781095, "learning_rate": 3.931101188754437e-06, "loss": 0.1631, "step": 1908 }, { "epoch": 3.0840064620355414, "grad_norm": 3.216157751410817, "learning_rate": 3.930057268600721e-06, "loss": 0.1548, "step": 1909 }, { "epoch": 3.08562197092084, "grad_norm": 3.147173181201143, "learning_rate": 3.929012977695572e-06, "loss": 0.1446, "step": 1910 }, { "epoch": 3.087237479806139, "grad_norm": 3.464000866716035, "learning_rate": 3.92796831630973e-06, "loss": 0.1661, "step": 1911 }, { "epoch": 3.0888529886914378, "grad_norm": 3.280002327320069, "learning_rate": 3.926923284714029e-06, "loss": 0.1404, "step": 1912 }, { "epoch": 3.0904684975767367, "grad_norm": 4.09096998387834, "learning_rate": 3.925877883179401e-06, "loss": 0.1835, "step": 1913 }, { "epoch": 3.0920840064620356, "grad_norm": 3.489490688414725, "learning_rate": 3.924832111976873e-06, "loss": 0.1712, "step": 1914 }, { "epoch": 3.0936995153473346, "grad_norm": 3.457065443239995, "learning_rate": 3.923785971377568e-06, "loss": 0.1501, "step": 1915 }, { "epoch": 3.0953150242326335, "grad_norm": 3.3955821846378695, "learning_rate": 3.922739461652704e-06, "loss": 0.1752, "step": 1916 }, { "epoch": 3.096930533117932, "grad_norm": 4.994194886096933, "learning_rate": 3.921692583073594e-06, "loss": 0.1833, "step": 1917 }, { "epoch": 3.098546042003231, "grad_norm": 3.5162599207176513, "learning_rate": 3.920645335911651e-06, "loss": 0.1785, "step": 1918 }, { "epoch": 3.10016155088853, "grad_norm": 3.1765031020893253, "learning_rate": 3.919597720438378e-06, "loss": 0.1707, "step": 1919 }, { "epoch": 3.101777059773829, "grad_norm": 2.943793827363341, "learning_rate": 3.918549736925377e-06, "loss": 0.1616, "step": 1920 }, { "epoch": 3.1033925686591277, "grad_norm": 3.5499253063810525, "learning_rate": 3.917501385644345e-06, "loss": 0.1824, "step": 1921 }, { "epoch": 3.1050080775444266, "grad_norm": 3.1677198840957312, "learning_rate": 3.9164526668670735e-06, "loss": 0.1635, "step": 1922 }, { "epoch": 3.106623586429725, "grad_norm": 2.8866260892345355, "learning_rate": 3.915403580865448e-06, "loss": 0.1233, "step": 1923 }, { "epoch": 3.108239095315024, "grad_norm": 3.209090829282254, "learning_rate": 3.914354127911452e-06, "loss": 0.156, "step": 1924 }, { "epoch": 3.109854604200323, "grad_norm": 3.567101680019465, "learning_rate": 3.913304308277161e-06, "loss": 0.172, "step": 1925 }, { "epoch": 3.111470113085622, "grad_norm": 3.1373011331655936, "learning_rate": 3.912254122234751e-06, "loss": 0.1571, "step": 1926 }, { "epoch": 3.113085621970921, "grad_norm": 3.3488201113988407, "learning_rate": 3.911203570056487e-06, "loss": 0.1676, "step": 1927 }, { "epoch": 3.11470113085622, "grad_norm": 3.194203459197046, "learning_rate": 3.910152652014731e-06, "loss": 0.177, "step": 1928 }, { "epoch": 3.1163166397415187, "grad_norm": 3.4414600904956965, "learning_rate": 3.909101368381941e-06, "loss": 0.1654, "step": 1929 }, { "epoch": 3.1179321486268172, "grad_norm": 3.1203050348106363, "learning_rate": 3.908049719430669e-06, "loss": 0.169, "step": 1930 }, { "epoch": 3.119547657512116, "grad_norm": 3.8718299416520967, "learning_rate": 3.906997705433561e-06, "loss": 0.2007, "step": 1931 }, { "epoch": 3.121163166397415, "grad_norm": 4.138535494884287, "learning_rate": 3.905945326663359e-06, "loss": 0.2023, "step": 1932 }, { "epoch": 3.122778675282714, "grad_norm": 3.3163382588025017, "learning_rate": 3.904892583392899e-06, "loss": 0.1532, "step": 1933 }, { "epoch": 3.124394184168013, "grad_norm": 3.3203691416225385, "learning_rate": 3.903839475895111e-06, "loss": 0.2038, "step": 1934 }, { "epoch": 3.126009693053312, "grad_norm": 2.9602401278363017, "learning_rate": 3.90278600444302e-06, "loss": 0.1423, "step": 1935 }, { "epoch": 3.127625201938611, "grad_norm": 3.907950066677988, "learning_rate": 3.901732169309745e-06, "loss": 0.1851, "step": 1936 }, { "epoch": 3.1292407108239093, "grad_norm": 2.8670555144100374, "learning_rate": 3.9006779707684996e-06, "loss": 0.2103, "step": 1937 }, { "epoch": 3.1308562197092082, "grad_norm": 4.069701645108188, "learning_rate": 3.8996234090925904e-06, "loss": 0.1793, "step": 1938 }, { "epoch": 3.132471728594507, "grad_norm": 3.0886739023931433, "learning_rate": 3.898568484555421e-06, "loss": 0.1518, "step": 1939 }, { "epoch": 3.134087237479806, "grad_norm": 3.1829138904787553, "learning_rate": 3.8975131974304866e-06, "loss": 0.1645, "step": 1940 }, { "epoch": 3.135702746365105, "grad_norm": 3.218105150860798, "learning_rate": 3.896457547991377e-06, "loss": 0.1946, "step": 1941 }, { "epoch": 3.137318255250404, "grad_norm": 3.094707487398131, "learning_rate": 3.895401536511775e-06, "loss": 0.1663, "step": 1942 }, { "epoch": 3.138933764135703, "grad_norm": 3.1491631488567897, "learning_rate": 3.89434516326546e-06, "loss": 0.169, "step": 1943 }, { "epoch": 3.1405492730210014, "grad_norm": 3.0327231889213855, "learning_rate": 3.893288428526303e-06, "loss": 0.1536, "step": 1944 }, { "epoch": 3.1421647819063003, "grad_norm": 3.09189740521498, "learning_rate": 3.892231332568268e-06, "loss": 0.1578, "step": 1945 }, { "epoch": 3.1437802907915993, "grad_norm": 3.2327061845784337, "learning_rate": 3.891173875665416e-06, "loss": 0.1413, "step": 1946 }, { "epoch": 3.145395799676898, "grad_norm": 4.353321093141304, "learning_rate": 3.890116058091897e-06, "loss": 0.2027, "step": 1947 }, { "epoch": 3.147011308562197, "grad_norm": 3.2579375559616226, "learning_rate": 3.8890578801219595e-06, "loss": 0.151, "step": 1948 }, { "epoch": 3.148626817447496, "grad_norm": 25.984665185533643, "learning_rate": 3.88799934202994e-06, "loss": 0.1822, "step": 1949 }, { "epoch": 3.150242326332795, "grad_norm": 3.287229938846169, "learning_rate": 3.886940444090274e-06, "loss": 0.1511, "step": 1950 }, { "epoch": 3.1518578352180935, "grad_norm": 2.6695663237111384, "learning_rate": 3.885881186577485e-06, "loss": 0.1274, "step": 1951 }, { "epoch": 3.1534733441033924, "grad_norm": 3.2453890990138428, "learning_rate": 3.884821569766195e-06, "loss": 0.1607, "step": 1952 }, { "epoch": 3.1550888529886914, "grad_norm": 2.7553726843159354, "learning_rate": 3.8837615939311136e-06, "loss": 0.1631, "step": 1953 }, { "epoch": 3.1567043618739903, "grad_norm": 3.2952351642808324, "learning_rate": 3.882701259347048e-06, "loss": 0.1524, "step": 1954 }, { "epoch": 3.158319870759289, "grad_norm": 3.5915874885904735, "learning_rate": 3.881640566288895e-06, "loss": 0.1699, "step": 1955 }, { "epoch": 3.159935379644588, "grad_norm": 3.2071356364125396, "learning_rate": 3.8805795150316495e-06, "loss": 0.152, "step": 1956 }, { "epoch": 3.161550888529887, "grad_norm": 2.828248189915256, "learning_rate": 3.879518105850392e-06, "loss": 0.1501, "step": 1957 }, { "epoch": 3.1631663974151856, "grad_norm": 3.3764049561261396, "learning_rate": 3.878456339020302e-06, "loss": 0.151, "step": 1958 }, { "epoch": 3.1647819063004845, "grad_norm": 4.242863227919734, "learning_rate": 3.877394214816648e-06, "loss": 0.2139, "step": 1959 }, { "epoch": 3.1663974151857834, "grad_norm": 3.4011527036744944, "learning_rate": 3.876331733514792e-06, "loss": 0.1619, "step": 1960 }, { "epoch": 3.1680129240710824, "grad_norm": 3.764237446384895, "learning_rate": 3.87526889539019e-06, "loss": 0.201, "step": 1961 }, { "epoch": 3.1696284329563813, "grad_norm": 3.4460887356844516, "learning_rate": 3.87420570071839e-06, "loss": 0.1765, "step": 1962 }, { "epoch": 3.1712439418416802, "grad_norm": 3.0650543324039465, "learning_rate": 3.873142149775031e-06, "loss": 0.158, "step": 1963 }, { "epoch": 3.172859450726979, "grad_norm": 3.559011972990184, "learning_rate": 3.872078242835846e-06, "loss": 0.2052, "step": 1964 }, { "epoch": 3.1744749596122777, "grad_norm": 3.4527740782809966, "learning_rate": 3.871013980176659e-06, "loss": 0.1657, "step": 1965 }, { "epoch": 3.1760904684975766, "grad_norm": 3.342935565067194, "learning_rate": 3.869949362073388e-06, "loss": 0.1945, "step": 1966 }, { "epoch": 3.1777059773828755, "grad_norm": 3.1117293658182157, "learning_rate": 3.868884388802041e-06, "loss": 0.1433, "step": 1967 }, { "epoch": 3.1793214862681745, "grad_norm": 3.620951729826309, "learning_rate": 3.8678190606387185e-06, "loss": 0.1622, "step": 1968 }, { "epoch": 3.1809369951534734, "grad_norm": 3.261994881639392, "learning_rate": 3.8667533778596145e-06, "loss": 0.1683, "step": 1969 }, { "epoch": 3.1825525040387723, "grad_norm": 3.520933060901431, "learning_rate": 3.865687340741014e-06, "loss": 0.1661, "step": 1970 }, { "epoch": 3.1841680129240713, "grad_norm": 3.1090427981731437, "learning_rate": 3.864620949559295e-06, "loss": 0.1699, "step": 1971 }, { "epoch": 3.1857835218093697, "grad_norm": 3.8207191294271663, "learning_rate": 3.863554204590923e-06, "loss": 0.173, "step": 1972 }, { "epoch": 3.1873990306946687, "grad_norm": 3.121596207788622, "learning_rate": 3.862487106112462e-06, "loss": 0.1671, "step": 1973 }, { "epoch": 3.1890145395799676, "grad_norm": 3.8495840769088274, "learning_rate": 3.861419654400561e-06, "loss": 0.169, "step": 1974 }, { "epoch": 3.1906300484652665, "grad_norm": 3.1289457483965664, "learning_rate": 3.8603518497319665e-06, "loss": 0.1423, "step": 1975 }, { "epoch": 3.1922455573505655, "grad_norm": 3.9012320345771756, "learning_rate": 3.859283692383512e-06, "loss": 0.1963, "step": 1976 }, { "epoch": 3.1938610662358644, "grad_norm": 3.2454208889337117, "learning_rate": 3.858215182632124e-06, "loss": 0.1761, "step": 1977 }, { "epoch": 3.1954765751211633, "grad_norm": 3.5047060346161465, "learning_rate": 3.85714632075482e-06, "loss": 0.1718, "step": 1978 }, { "epoch": 3.197092084006462, "grad_norm": 3.521506503577529, "learning_rate": 3.8560771070287105e-06, "loss": 0.1724, "step": 1979 }, { "epoch": 3.1987075928917608, "grad_norm": 4.0239125471304735, "learning_rate": 3.855007541730996e-06, "loss": 0.1542, "step": 1980 }, { "epoch": 3.2003231017770597, "grad_norm": 3.712649486387994, "learning_rate": 3.853937625138967e-06, "loss": 0.2243, "step": 1981 }, { "epoch": 3.2019386106623586, "grad_norm": 3.5531626272742494, "learning_rate": 3.852867357530007e-06, "loss": 0.1915, "step": 1982 }, { "epoch": 3.2035541195476576, "grad_norm": 3.257131483354185, "learning_rate": 3.8517967391815906e-06, "loss": 0.1538, "step": 1983 }, { "epoch": 3.2051696284329565, "grad_norm": 3.4995988070304698, "learning_rate": 3.850725770371279e-06, "loss": 0.1629, "step": 1984 }, { "epoch": 3.2067851373182554, "grad_norm": 3.242006621547336, "learning_rate": 3.849654451376731e-06, "loss": 0.1169, "step": 1985 }, { "epoch": 3.208400646203554, "grad_norm": 3.386099912160688, "learning_rate": 3.8485827824756915e-06, "loss": 0.1611, "step": 1986 }, { "epoch": 3.210016155088853, "grad_norm": 2.8507680489494653, "learning_rate": 3.847510763945997e-06, "loss": 0.1743, "step": 1987 }, { "epoch": 3.211631663974152, "grad_norm": 3.26029578175282, "learning_rate": 3.846438396065577e-06, "loss": 0.1933, "step": 1988 }, { "epoch": 3.2132471728594507, "grad_norm": 3.086909558817628, "learning_rate": 3.845365679112449e-06, "loss": 0.1785, "step": 1989 }, { "epoch": 3.2148626817447497, "grad_norm": 3.1438171133864383, "learning_rate": 3.84429261336472e-06, "loss": 0.1593, "step": 1990 }, { "epoch": 3.2164781906300486, "grad_norm": 3.0192313921416924, "learning_rate": 3.843219199100591e-06, "loss": 0.1503, "step": 1991 }, { "epoch": 3.2180936995153475, "grad_norm": 3.0553852130662524, "learning_rate": 3.842145436598349e-06, "loss": 0.1494, "step": 1992 }, { "epoch": 3.219709208400646, "grad_norm": 3.37732354809403, "learning_rate": 3.841071326136377e-06, "loss": 0.1852, "step": 1993 }, { "epoch": 3.221324717285945, "grad_norm": 2.952289589135533, "learning_rate": 3.839996867993145e-06, "loss": 0.1361, "step": 1994 }, { "epoch": 3.222940226171244, "grad_norm": 3.7841851202529995, "learning_rate": 3.838922062447209e-06, "loss": 0.1557, "step": 1995 }, { "epoch": 3.224555735056543, "grad_norm": 3.1050015153632646, "learning_rate": 3.837846909777223e-06, "loss": 0.1934, "step": 1996 }, { "epoch": 3.2261712439418417, "grad_norm": 3.47892604586167, "learning_rate": 3.836771410261926e-06, "loss": 0.1849, "step": 1997 }, { "epoch": 3.2277867528271407, "grad_norm": 3.461590016663954, "learning_rate": 3.835695564180147e-06, "loss": 0.1767, "step": 1998 }, { "epoch": 3.2294022617124396, "grad_norm": 4.238751161023222, "learning_rate": 3.834619371810808e-06, "loss": 0.2067, "step": 1999 }, { "epoch": 3.231017770597738, "grad_norm": 3.722803376081496, "learning_rate": 3.8335428334329165e-06, "loss": 0.1959, "step": 2000 }, { "epoch": 3.231017770597738, "eval_loss": 1.2499394416809082, "eval_runtime": 2.3493, "eval_samples_per_second": 127.697, "eval_steps_per_second": 2.98, "step": 2000 }, { "epoch": 3.232633279483037, "grad_norm": 3.466963179040905, "learning_rate": 3.832465949325574e-06, "loss": 0.159, "step": 2001 }, { "epoch": 3.234248788368336, "grad_norm": 3.1785377751768857, "learning_rate": 3.831388719767968e-06, "loss": 0.1573, "step": 2002 }, { "epoch": 3.235864297253635, "grad_norm": 3.2344260602328596, "learning_rate": 3.830311145039378e-06, "loss": 0.1937, "step": 2003 }, { "epoch": 3.237479806138934, "grad_norm": 3.631445651571307, "learning_rate": 3.829233225419172e-06, "loss": 0.1806, "step": 2004 }, { "epoch": 3.2390953150242328, "grad_norm": 3.015828189299519, "learning_rate": 3.828154961186808e-06, "loss": 0.1447, "step": 2005 }, { "epoch": 3.2407108239095317, "grad_norm": 4.281430861945742, "learning_rate": 3.8270763526218305e-06, "loss": 0.2169, "step": 2006 }, { "epoch": 3.24232633279483, "grad_norm": 3.501999723414982, "learning_rate": 3.825997400003878e-06, "loss": 0.1575, "step": 2007 }, { "epoch": 3.243941841680129, "grad_norm": 3.1057838849275696, "learning_rate": 3.824918103612675e-06, "loss": 0.1667, "step": 2008 }, { "epoch": 3.245557350565428, "grad_norm": 3.316374076382555, "learning_rate": 3.823838463728036e-06, "loss": 0.1844, "step": 2009 }, { "epoch": 3.247172859450727, "grad_norm": 3.0918396093793583, "learning_rate": 3.822758480629865e-06, "loss": 0.1495, "step": 2010 }, { "epoch": 3.248788368336026, "grad_norm": 2.982326594873758, "learning_rate": 3.821678154598153e-06, "loss": 0.1752, "step": 2011 }, { "epoch": 3.250403877221325, "grad_norm": 3.206254332062991, "learning_rate": 3.820597485912983e-06, "loss": 0.1775, "step": 2012 }, { "epoch": 3.2520193861066238, "grad_norm": 3.304326166526247, "learning_rate": 3.819516474854526e-06, "loss": 0.1578, "step": 2013 }, { "epoch": 3.2536348949919223, "grad_norm": 3.466049140038643, "learning_rate": 3.818435121703037e-06, "loss": 0.1999, "step": 2014 }, { "epoch": 3.255250403877221, "grad_norm": 3.339388680674845, "learning_rate": 3.817353426738868e-06, "loss": 0.1462, "step": 2015 }, { "epoch": 3.25686591276252, "grad_norm": 3.257991776073122, "learning_rate": 3.816271390242453e-06, "loss": 0.1682, "step": 2016 }, { "epoch": 3.258481421647819, "grad_norm": 3.4438096822291877, "learning_rate": 3.815189012494318e-06, "loss": 0.1443, "step": 2017 }, { "epoch": 3.260096930533118, "grad_norm": 4.5395898640284855, "learning_rate": 3.8141062937750757e-06, "loss": 0.1816, "step": 2018 }, { "epoch": 3.261712439418417, "grad_norm": 3.585639218363843, "learning_rate": 3.8130232343654273e-06, "loss": 0.178, "step": 2019 }, { "epoch": 3.263327948303716, "grad_norm": 3.3690356191774082, "learning_rate": 3.811939834546163e-06, "loss": 0.1664, "step": 2020 }, { "epoch": 3.2649434571890144, "grad_norm": 3.398400162022011, "learning_rate": 3.8108560945981632e-06, "loss": 0.1761, "step": 2021 }, { "epoch": 3.2665589660743133, "grad_norm": 3.2725658108206104, "learning_rate": 3.8097720148023908e-06, "loss": 0.1715, "step": 2022 }, { "epoch": 3.268174474959612, "grad_norm": 3.777730383024192, "learning_rate": 3.808687595439903e-06, "loss": 0.1844, "step": 2023 }, { "epoch": 3.269789983844911, "grad_norm": 4.426230569338995, "learning_rate": 3.8076028367918414e-06, "loss": 0.1741, "step": 2024 }, { "epoch": 3.27140549273021, "grad_norm": 3.193291359630436, "learning_rate": 3.806517739139437e-06, "loss": 0.1885, "step": 2025 }, { "epoch": 3.273021001615509, "grad_norm": 3.2743303205232612, "learning_rate": 3.805432302764008e-06, "loss": 0.1587, "step": 2026 }, { "epoch": 3.274636510500808, "grad_norm": 3.43787762973275, "learning_rate": 3.804346527946961e-06, "loss": 0.1868, "step": 2027 }, { "epoch": 3.2762520193861064, "grad_norm": 3.629392357278631, "learning_rate": 3.8032604149697895e-06, "loss": 0.1651, "step": 2028 }, { "epoch": 3.2778675282714054, "grad_norm": 3.353008049622137, "learning_rate": 3.8021739641140752e-06, "loss": 0.1789, "step": 2029 }, { "epoch": 3.2794830371567043, "grad_norm": 3.0948672022067725, "learning_rate": 3.8010871756614875e-06, "loss": 0.2051, "step": 2030 }, { "epoch": 3.2810985460420032, "grad_norm": 3.679366039841227, "learning_rate": 3.8000000498937838e-06, "loss": 0.1768, "step": 2031 }, { "epoch": 3.282714054927302, "grad_norm": 3.2381631933239468, "learning_rate": 3.798912587092808e-06, "loss": 0.1651, "step": 2032 }, { "epoch": 3.284329563812601, "grad_norm": 3.0637057821785247, "learning_rate": 3.7978247875404906e-06, "loss": 0.1561, "step": 2033 }, { "epoch": 3.2859450726979, "grad_norm": 3.539108326059919, "learning_rate": 3.796736651518852e-06, "loss": 0.1696, "step": 2034 }, { "epoch": 3.2875605815831985, "grad_norm": 4.323354740422715, "learning_rate": 3.7956481793099985e-06, "loss": 0.2206, "step": 2035 }, { "epoch": 3.2891760904684975, "grad_norm": 3.6660858247446586, "learning_rate": 3.7945593711961226e-06, "loss": 0.1857, "step": 2036 }, { "epoch": 3.2907915993537964, "grad_norm": 2.7091279045325187, "learning_rate": 3.7934702274595047e-06, "loss": 0.1588, "step": 2037 }, { "epoch": 3.2924071082390953, "grad_norm": 3.517583778795657, "learning_rate": 3.7923807483825132e-06, "loss": 0.1572, "step": 2038 }, { "epoch": 3.2940226171243943, "grad_norm": 4.005167097740463, "learning_rate": 3.7912909342476012e-06, "loss": 0.1626, "step": 2039 }, { "epoch": 3.295638126009693, "grad_norm": 3.874711627201315, "learning_rate": 3.790200785337311e-06, "loss": 0.1843, "step": 2040 }, { "epoch": 3.297253634894992, "grad_norm": 3.827423619806271, "learning_rate": 3.7891103019342696e-06, "loss": 0.1758, "step": 2041 }, { "epoch": 3.2988691437802906, "grad_norm": 2.5961094608939357, "learning_rate": 3.7880194843211927e-06, "loss": 0.1237, "step": 2042 }, { "epoch": 3.3004846526655895, "grad_norm": 3.0836804117464838, "learning_rate": 3.786928332780881e-06, "loss": 0.1681, "step": 2043 }, { "epoch": 3.3021001615508885, "grad_norm": 3.3022488653585924, "learning_rate": 3.7858368475962224e-06, "loss": 0.1632, "step": 2044 }, { "epoch": 3.3037156704361874, "grad_norm": 3.834440822591183, "learning_rate": 3.784745029050193e-06, "loss": 0.1627, "step": 2045 }, { "epoch": 3.3053311793214863, "grad_norm": 4.197113300189305, "learning_rate": 3.7836528774258517e-06, "loss": 0.1582, "step": 2046 }, { "epoch": 3.3069466882067853, "grad_norm": 3.061694785160916, "learning_rate": 3.782560393006347e-06, "loss": 0.1655, "step": 2047 }, { "epoch": 3.308562197092084, "grad_norm": 3.365584696724671, "learning_rate": 3.7814675760749114e-06, "loss": 0.1923, "step": 2048 }, { "epoch": 3.3101777059773827, "grad_norm": 3.284192114024375, "learning_rate": 3.780374426914865e-06, "loss": 0.1524, "step": 2049 }, { "epoch": 3.3117932148626816, "grad_norm": 3.3590047677433903, "learning_rate": 3.7792809458096146e-06, "loss": 0.1477, "step": 2050 }, { "epoch": 3.3134087237479806, "grad_norm": 3.4333382164482535, "learning_rate": 3.778187133042651e-06, "loss": 0.1567, "step": 2051 }, { "epoch": 3.3150242326332795, "grad_norm": 3.2379106667437516, "learning_rate": 3.7770929888975523e-06, "loss": 0.1647, "step": 2052 }, { "epoch": 3.3166397415185784, "grad_norm": 4.0972716324304, "learning_rate": 3.7759985136579823e-06, "loss": 0.1852, "step": 2053 }, { "epoch": 3.3182552504038774, "grad_norm": 4.489415312044499, "learning_rate": 3.7749037076076924e-06, "loss": 0.2003, "step": 2054 }, { "epoch": 3.3198707592891763, "grad_norm": 2.855231506649599, "learning_rate": 3.7738085710305144e-06, "loss": 0.1568, "step": 2055 }, { "epoch": 3.321486268174475, "grad_norm": 3.7808878795974805, "learning_rate": 3.7727131042103725e-06, "loss": 0.2231, "step": 2056 }, { "epoch": 3.3231017770597737, "grad_norm": 3.9788336101897332, "learning_rate": 3.771617307431272e-06, "loss": 0.1728, "step": 2057 }, { "epoch": 3.3247172859450727, "grad_norm": 3.3329649215615142, "learning_rate": 3.7705211809773057e-06, "loss": 0.1521, "step": 2058 }, { "epoch": 3.3263327948303716, "grad_norm": 3.962968833130297, "learning_rate": 3.7694247251326514e-06, "loss": 0.1928, "step": 2059 }, { "epoch": 3.3279483037156705, "grad_norm": 3.6926623155895126, "learning_rate": 3.768327940181572e-06, "loss": 0.1919, "step": 2060 }, { "epoch": 3.3295638126009695, "grad_norm": 3.7698571722073786, "learning_rate": 3.7672308264084163e-06, "loss": 0.2457, "step": 2061 }, { "epoch": 3.3311793214862684, "grad_norm": 3.5104938861013797, "learning_rate": 3.766133384097618e-06, "loss": 0.1793, "step": 2062 }, { "epoch": 3.332794830371567, "grad_norm": 3.7321314677792965, "learning_rate": 3.7650356135336946e-06, "loss": 0.2087, "step": 2063 }, { "epoch": 3.334410339256866, "grad_norm": 3.6524943317248595, "learning_rate": 3.7639375150012527e-06, "loss": 0.2209, "step": 2064 }, { "epoch": 3.3360258481421647, "grad_norm": 3.8496873050675053, "learning_rate": 3.7628390887849786e-06, "loss": 0.2057, "step": 2065 }, { "epoch": 3.3376413570274637, "grad_norm": 3.405612695491496, "learning_rate": 3.761740335169648e-06, "loss": 0.2131, "step": 2066 }, { "epoch": 3.3392568659127626, "grad_norm": 3.022129957847099, "learning_rate": 3.7606412544401196e-06, "loss": 0.1625, "step": 2067 }, { "epoch": 3.3408723747980615, "grad_norm": 3.500928629646821, "learning_rate": 3.759541846881337e-06, "loss": 0.1424, "step": 2068 }, { "epoch": 3.3424878836833605, "grad_norm": 3.4552413360682994, "learning_rate": 3.7584421127783277e-06, "loss": 0.1466, "step": 2069 }, { "epoch": 3.344103392568659, "grad_norm": 3.232544934907141, "learning_rate": 3.7573420524162054e-06, "loss": 0.1779, "step": 2070 }, { "epoch": 3.345718901453958, "grad_norm": 3.713704190922176, "learning_rate": 3.7562416660801677e-06, "loss": 0.1792, "step": 2071 }, { "epoch": 3.347334410339257, "grad_norm": 3.1571124414945047, "learning_rate": 3.7551409540554967e-06, "loss": 0.1452, "step": 2072 }, { "epoch": 3.3489499192245558, "grad_norm": 3.799186291369904, "learning_rate": 3.75403991662756e-06, "loss": 0.214, "step": 2073 }, { "epoch": 3.3505654281098547, "grad_norm": 3.446794463707235, "learning_rate": 3.7529385540818064e-06, "loss": 0.1786, "step": 2074 }, { "epoch": 3.3521809369951536, "grad_norm": 3.6747642987600653, "learning_rate": 3.751836866703773e-06, "loss": 0.1677, "step": 2075 }, { "epoch": 3.3537964458804526, "grad_norm": 3.795014217303121, "learning_rate": 3.750734854779079e-06, "loss": 0.1827, "step": 2076 }, { "epoch": 3.355411954765751, "grad_norm": 3.2574329283518417, "learning_rate": 3.7496325185934267e-06, "loss": 0.1805, "step": 2077 }, { "epoch": 3.35702746365105, "grad_norm": 3.490562207524755, "learning_rate": 3.748529858432606e-06, "loss": 0.163, "step": 2078 }, { "epoch": 3.358642972536349, "grad_norm": 2.843633142509095, "learning_rate": 3.747426874582486e-06, "loss": 0.1274, "step": 2079 }, { "epoch": 3.360258481421648, "grad_norm": 3.267832695539971, "learning_rate": 3.7463235673290245e-06, "loss": 0.1665, "step": 2080 }, { "epoch": 3.361873990306947, "grad_norm": 3.498245664942454, "learning_rate": 3.7452199369582594e-06, "loss": 0.1741, "step": 2081 }, { "epoch": 3.3634894991922457, "grad_norm": 3.747479911118375, "learning_rate": 3.744115983756314e-06, "loss": 0.163, "step": 2082 }, { "epoch": 3.3651050080775446, "grad_norm": 4.070850371207589, "learning_rate": 3.7430117080093962e-06, "loss": 0.2153, "step": 2083 }, { "epoch": 3.366720516962843, "grad_norm": 3.2914454434335774, "learning_rate": 3.741907110003796e-06, "loss": 0.1586, "step": 2084 }, { "epoch": 3.368336025848142, "grad_norm": 3.42744751388509, "learning_rate": 3.740802190025886e-06, "loss": 0.1804, "step": 2085 }, { "epoch": 3.369951534733441, "grad_norm": 3.0508482720360295, "learning_rate": 3.7396969483621264e-06, "loss": 0.1657, "step": 2086 }, { "epoch": 3.37156704361874, "grad_norm": 3.4568107247370268, "learning_rate": 3.738591385299057e-06, "loss": 0.1807, "step": 2087 }, { "epoch": 3.373182552504039, "grad_norm": 4.149779198440348, "learning_rate": 3.737485501123301e-06, "loss": 0.1635, "step": 2088 }, { "epoch": 3.374798061389338, "grad_norm": 3.127797874230412, "learning_rate": 3.736379296121567e-06, "loss": 0.1616, "step": 2089 }, { "epoch": 3.3764135702746367, "grad_norm": 3.968889990496269, "learning_rate": 3.7352727705806446e-06, "loss": 0.183, "step": 2090 }, { "epoch": 3.378029079159935, "grad_norm": 3.127804208770206, "learning_rate": 3.7341659247874097e-06, "loss": 0.165, "step": 2091 }, { "epoch": 3.379644588045234, "grad_norm": 3.318373119937564, "learning_rate": 3.7330587590288176e-06, "loss": 0.1751, "step": 2092 }, { "epoch": 3.381260096930533, "grad_norm": 3.9400552342518913, "learning_rate": 3.7319512735919072e-06, "loss": 0.2091, "step": 2093 }, { "epoch": 3.382875605815832, "grad_norm": 3.7632812979836734, "learning_rate": 3.7308434687638027e-06, "loss": 0.1691, "step": 2094 }, { "epoch": 3.384491114701131, "grad_norm": 3.1819178184095125, "learning_rate": 3.7297353448317096e-06, "loss": 0.1628, "step": 2095 }, { "epoch": 3.38610662358643, "grad_norm": 2.8926463906874735, "learning_rate": 3.7286269020829145e-06, "loss": 0.1363, "step": 2096 }, { "epoch": 3.387722132471729, "grad_norm": 3.550879705076649, "learning_rate": 3.72751814080479e-06, "loss": 0.1677, "step": 2097 }, { "epoch": 3.3893376413570273, "grad_norm": 3.315459772886651, "learning_rate": 3.726409061284789e-06, "loss": 0.1775, "step": 2098 }, { "epoch": 3.3909531502423262, "grad_norm": 3.3165787564985894, "learning_rate": 3.7252996638104465e-06, "loss": 0.1523, "step": 2099 }, { "epoch": 3.392568659127625, "grad_norm": 3.3248493202850855, "learning_rate": 3.7241899486693816e-06, "loss": 0.1593, "step": 2100 }, { "epoch": 3.394184168012924, "grad_norm": 4.196843001264028, "learning_rate": 3.7230799161492946e-06, "loss": 0.1967, "step": 2101 }, { "epoch": 3.395799676898223, "grad_norm": 3.1910765300461406, "learning_rate": 3.7219695665379684e-06, "loss": 0.1514, "step": 2102 }, { "epoch": 3.397415185783522, "grad_norm": 3.585079723498381, "learning_rate": 3.7208589001232694e-06, "loss": 0.1614, "step": 2103 }, { "epoch": 3.399030694668821, "grad_norm": 3.751242738207349, "learning_rate": 3.7197479171931432e-06, "loss": 0.162, "step": 2104 }, { "epoch": 3.4006462035541194, "grad_norm": 2.946861663810531, "learning_rate": 3.7186366180356215e-06, "loss": 0.1362, "step": 2105 }, { "epoch": 3.4022617124394183, "grad_norm": 3.9285398021151603, "learning_rate": 3.717525002938813e-06, "loss": 0.2125, "step": 2106 }, { "epoch": 3.4038772213247173, "grad_norm": 3.353480093917449, "learning_rate": 3.7164130721909124e-06, "loss": 0.1569, "step": 2107 }, { "epoch": 3.405492730210016, "grad_norm": 3.4719583223973247, "learning_rate": 3.715300826080195e-06, "loss": 0.1757, "step": 2108 }, { "epoch": 3.407108239095315, "grad_norm": 3.6948745525150732, "learning_rate": 3.7141882648950167e-06, "loss": 0.171, "step": 2109 }, { "epoch": 3.408723747980614, "grad_norm": 3.926203878159657, "learning_rate": 3.713075388923817e-06, "loss": 0.1595, "step": 2110 }, { "epoch": 3.410339256865913, "grad_norm": 3.073824551318246, "learning_rate": 3.7119621984551155e-06, "loss": 0.1322, "step": 2111 }, { "epoch": 3.4119547657512115, "grad_norm": 3.4797455373124646, "learning_rate": 3.7108486937775144e-06, "loss": 0.1817, "step": 2112 }, { "epoch": 3.4135702746365104, "grad_norm": 3.3834344144207598, "learning_rate": 3.709734875179697e-06, "loss": 0.1741, "step": 2113 }, { "epoch": 3.4151857835218093, "grad_norm": 3.309696949738976, "learning_rate": 3.708620742950427e-06, "loss": 0.1349, "step": 2114 }, { "epoch": 3.4168012924071083, "grad_norm": 2.97179417829899, "learning_rate": 3.7075062973785504e-06, "loss": 0.1229, "step": 2115 }, { "epoch": 3.418416801292407, "grad_norm": 3.4800962971467935, "learning_rate": 3.7063915387529947e-06, "loss": 0.1713, "step": 2116 }, { "epoch": 3.420032310177706, "grad_norm": 2.717175817793524, "learning_rate": 3.705276467362768e-06, "loss": 0.1657, "step": 2117 }, { "epoch": 3.421647819063005, "grad_norm": 3.278907776118608, "learning_rate": 3.7041610834969593e-06, "loss": 0.1743, "step": 2118 }, { "epoch": 3.4232633279483036, "grad_norm": 3.024396051755872, "learning_rate": 3.7030453874447397e-06, "loss": 0.1413, "step": 2119 }, { "epoch": 3.4248788368336025, "grad_norm": 3.416514032812191, "learning_rate": 3.7019293794953597e-06, "loss": 0.1908, "step": 2120 }, { "epoch": 3.4264943457189014, "grad_norm": 3.7590774585258, "learning_rate": 3.7008130599381523e-06, "loss": 0.1784, "step": 2121 }, { "epoch": 3.4281098546042004, "grad_norm": 3.344951204635803, "learning_rate": 3.6996964290625305e-06, "loss": 0.1694, "step": 2122 }, { "epoch": 3.4297253634894993, "grad_norm": 3.286510618310588, "learning_rate": 3.6985794871579867e-06, "loss": 0.1659, "step": 2123 }, { "epoch": 3.4313408723747982, "grad_norm": 3.4122285857763943, "learning_rate": 3.6974622345140967e-06, "loss": 0.168, "step": 2124 }, { "epoch": 3.432956381260097, "grad_norm": 3.053027302887575, "learning_rate": 3.6963446714205138e-06, "loss": 0.1828, "step": 2125 }, { "epoch": 3.4345718901453957, "grad_norm": 3.420316141397638, "learning_rate": 3.6952267981669737e-06, "loss": 0.1853, "step": 2126 }, { "epoch": 3.4361873990306946, "grad_norm": 3.0042985276495875, "learning_rate": 3.6941086150432935e-06, "loss": 0.1413, "step": 2127 }, { "epoch": 3.4378029079159935, "grad_norm": 3.209670003916799, "learning_rate": 3.6929901223393676e-06, "loss": 0.1655, "step": 2128 }, { "epoch": 3.4394184168012925, "grad_norm": 3.7910479094716885, "learning_rate": 3.691871320345174e-06, "loss": 0.2079, "step": 2129 }, { "epoch": 3.4410339256865914, "grad_norm": 3.2203295438960673, "learning_rate": 3.6907522093507674e-06, "loss": 0.1807, "step": 2130 }, { "epoch": 3.4426494345718903, "grad_norm": 3.599375166788358, "learning_rate": 3.6896327896462858e-06, "loss": 0.1468, "step": 2131 }, { "epoch": 3.4442649434571893, "grad_norm": 3.2107140375464773, "learning_rate": 3.6885130615219444e-06, "loss": 0.1701, "step": 2132 }, { "epoch": 3.4458804523424877, "grad_norm": 3.7056157556701135, "learning_rate": 3.6873930252680424e-06, "loss": 0.1594, "step": 2133 }, { "epoch": 3.4474959612277867, "grad_norm": 3.006155868622333, "learning_rate": 3.6862726811749535e-06, "loss": 0.1606, "step": 2134 }, { "epoch": 3.4491114701130856, "grad_norm": 2.9701128881467267, "learning_rate": 3.685152029533136e-06, "loss": 0.173, "step": 2135 }, { "epoch": 3.4507269789983845, "grad_norm": 3.908917180348476, "learning_rate": 3.684031070633125e-06, "loss": 0.1704, "step": 2136 }, { "epoch": 3.4523424878836835, "grad_norm": 3.1987319748765004, "learning_rate": 3.6829098047655364e-06, "loss": 0.1707, "step": 2137 }, { "epoch": 3.4539579967689824, "grad_norm": 3.8663455566401996, "learning_rate": 3.6817882322210656e-06, "loss": 0.202, "step": 2138 }, { "epoch": 3.4555735056542813, "grad_norm": 2.8213253992909535, "learning_rate": 3.6806663532904874e-06, "loss": 0.1484, "step": 2139 }, { "epoch": 3.45718901453958, "grad_norm": 3.503633679822531, "learning_rate": 3.679544168264656e-06, "loss": 0.1894, "step": 2140 }, { "epoch": 3.4588045234248788, "grad_norm": 3.5585716497155886, "learning_rate": 3.678421677434505e-06, "loss": 0.1912, "step": 2141 }, { "epoch": 3.4604200323101777, "grad_norm": 3.6920710853908427, "learning_rate": 3.677298881091047e-06, "loss": 0.2083, "step": 2142 }, { "epoch": 3.4620355411954766, "grad_norm": 3.3461739807264053, "learning_rate": 3.676175779525375e-06, "loss": 0.1852, "step": 2143 }, { "epoch": 3.4636510500807756, "grad_norm": 3.3068842135487304, "learning_rate": 3.675052373028659e-06, "loss": 0.1476, "step": 2144 }, { "epoch": 3.4652665589660745, "grad_norm": 3.2149464176554226, "learning_rate": 3.67392866189215e-06, "loss": 0.175, "step": 2145 }, { "epoch": 3.4668820678513734, "grad_norm": 3.4909178990707623, "learning_rate": 3.672804646407177e-06, "loss": 0.1782, "step": 2146 }, { "epoch": 3.468497576736672, "grad_norm": 3.0703897864903436, "learning_rate": 3.6716803268651484e-06, "loss": 0.1406, "step": 2147 }, { "epoch": 3.470113085621971, "grad_norm": 3.7103331579713275, "learning_rate": 3.6705557035575505e-06, "loss": 0.1796, "step": 2148 }, { "epoch": 3.47172859450727, "grad_norm": 2.9022976489906966, "learning_rate": 3.66943077677595e-06, "loss": 0.1369, "step": 2149 }, { "epoch": 3.4733441033925687, "grad_norm": 3.9072435608667746, "learning_rate": 3.668305546811991e-06, "loss": 0.1839, "step": 2150 }, { "epoch": 3.4749596122778676, "grad_norm": 3.6096275578304433, "learning_rate": 3.6671800139573967e-06, "loss": 0.1549, "step": 2151 }, { "epoch": 3.4765751211631666, "grad_norm": 4.0343216075306145, "learning_rate": 3.6660541785039676e-06, "loss": 0.1821, "step": 2152 }, { "epoch": 3.4781906300484655, "grad_norm": 3.74906165005085, "learning_rate": 3.664928040743585e-06, "loss": 0.1641, "step": 2153 }, { "epoch": 3.479806138933764, "grad_norm": 3.589563860473302, "learning_rate": 3.6638016009682064e-06, "loss": 0.1538, "step": 2154 }, { "epoch": 3.481421647819063, "grad_norm": 3.3979345397435288, "learning_rate": 3.662674859469869e-06, "loss": 0.1691, "step": 2155 }, { "epoch": 3.483037156704362, "grad_norm": 3.4824712557989685, "learning_rate": 3.661547816540687e-06, "loss": 0.1753, "step": 2156 }, { "epoch": 3.484652665589661, "grad_norm": 3.7833072923888484, "learning_rate": 3.6604204724728543e-06, "loss": 0.2283, "step": 2157 }, { "epoch": 3.4862681744749597, "grad_norm": 3.789387852282767, "learning_rate": 3.6592928275586413e-06, "loss": 0.1745, "step": 2158 }, { "epoch": 3.4878836833602587, "grad_norm": 2.914434519461706, "learning_rate": 3.658164882090398e-06, "loss": 0.1568, "step": 2159 }, { "epoch": 3.489499192245557, "grad_norm": 3.355030293129559, "learning_rate": 3.6570366363605504e-06, "loss": 0.153, "step": 2160 }, { "epoch": 3.491114701130856, "grad_norm": 3.3290591781659464, "learning_rate": 3.655908090661604e-06, "loss": 0.2106, "step": 2161 }, { "epoch": 3.492730210016155, "grad_norm": 3.1990773694568286, "learning_rate": 3.654779245286141e-06, "loss": 0.1662, "step": 2162 }, { "epoch": 3.494345718901454, "grad_norm": 3.834868648038421, "learning_rate": 3.653650100526822e-06, "loss": 0.1909, "step": 2163 }, { "epoch": 3.495961227786753, "grad_norm": 2.905226557120111, "learning_rate": 3.6525206566763856e-06, "loss": 0.1735, "step": 2164 }, { "epoch": 3.497576736672052, "grad_norm": 3.2207855050926817, "learning_rate": 3.651390914027646e-06, "loss": 0.1569, "step": 2165 }, { "epoch": 3.4991922455573508, "grad_norm": 3.8906660033599962, "learning_rate": 3.650260872873498e-06, "loss": 0.1785, "step": 2166 }, { "epoch": 3.5008077544426497, "grad_norm": 3.606304987262493, "learning_rate": 3.6491305335069094e-06, "loss": 0.1841, "step": 2167 }, { "epoch": 3.502423263327948, "grad_norm": 3.6169762470018676, "learning_rate": 3.6479998962209296e-06, "loss": 0.162, "step": 2168 }, { "epoch": 3.504038772213247, "grad_norm": 3.485991776666163, "learning_rate": 3.6468689613086827e-06, "loss": 0.179, "step": 2169 }, { "epoch": 3.505654281098546, "grad_norm": 3.853291409603338, "learning_rate": 3.645737729063372e-06, "loss": 0.1903, "step": 2170 }, { "epoch": 3.507269789983845, "grad_norm": 3.4639698840517985, "learning_rate": 3.6446061997782754e-06, "loss": 0.1556, "step": 2171 }, { "epoch": 3.508885298869144, "grad_norm": 3.0194442880783963, "learning_rate": 3.6434743737467498e-06, "loss": 0.165, "step": 2172 }, { "epoch": 3.5105008077544424, "grad_norm": 3.361773676316651, "learning_rate": 3.6423422512622273e-06, "loss": 0.1544, "step": 2173 }, { "epoch": 3.5121163166397418, "grad_norm": 3.466693478834449, "learning_rate": 3.6412098326182198e-06, "loss": 0.1505, "step": 2174 }, { "epoch": 3.5137318255250403, "grad_norm": 3.7441607299157846, "learning_rate": 3.6400771181083117e-06, "loss": 0.1636, "step": 2175 }, { "epoch": 3.515347334410339, "grad_norm": 3.0311881423874376, "learning_rate": 3.6389441080261677e-06, "loss": 0.2082, "step": 2176 }, { "epoch": 3.516962843295638, "grad_norm": 3.644367922506805, "learning_rate": 3.637810802665528e-06, "loss": 0.1419, "step": 2177 }, { "epoch": 3.518578352180937, "grad_norm": 3.285668136041679, "learning_rate": 3.6366772023202085e-06, "loss": 0.1764, "step": 2178 }, { "epoch": 3.520193861066236, "grad_norm": 3.9120503076903783, "learning_rate": 3.6355433072841027e-06, "loss": 0.1675, "step": 2179 }, { "epoch": 3.5218093699515345, "grad_norm": 3.1944849879104606, "learning_rate": 3.63440911785118e-06, "loss": 0.1647, "step": 2180 }, { "epoch": 3.523424878836834, "grad_norm": 3.5522868978157387, "learning_rate": 3.6332746343154864e-06, "loss": 0.1681, "step": 2181 }, { "epoch": 3.5250403877221324, "grad_norm": 3.3934170303912827, "learning_rate": 3.632139856971144e-06, "loss": 0.1817, "step": 2182 }, { "epoch": 3.5266558966074313, "grad_norm": 4.05937805852781, "learning_rate": 3.6310047861123503e-06, "loss": 0.2048, "step": 2183 }, { "epoch": 3.52827140549273, "grad_norm": 3.779005624181952, "learning_rate": 3.6298694220333807e-06, "loss": 0.2098, "step": 2184 }, { "epoch": 3.529886914378029, "grad_norm": 3.671594848914875, "learning_rate": 3.6287337650285853e-06, "loss": 0.1901, "step": 2185 }, { "epoch": 3.531502423263328, "grad_norm": 4.269756612386602, "learning_rate": 3.627597815392389e-06, "loss": 0.1625, "step": 2186 }, { "epoch": 3.5331179321486266, "grad_norm": 3.4922485973169524, "learning_rate": 3.6264615734192963e-06, "loss": 0.1431, "step": 2187 }, { "epoch": 3.534733441033926, "grad_norm": 2.589647287250347, "learning_rate": 3.6253250394038832e-06, "loss": 0.129, "step": 2188 }, { "epoch": 3.5363489499192244, "grad_norm": 3.3475986690815445, "learning_rate": 3.6241882136408044e-06, "loss": 0.2006, "step": 2189 }, { "epoch": 3.5379644588045234, "grad_norm": 3.776743055092912, "learning_rate": 3.623051096424789e-06, "loss": 0.1794, "step": 2190 }, { "epoch": 3.5395799676898223, "grad_norm": 3.607072527034199, "learning_rate": 3.6219136880506417e-06, "loss": 0.1647, "step": 2191 }, { "epoch": 3.5411954765751212, "grad_norm": 3.972784567010926, "learning_rate": 3.6207759888132425e-06, "loss": 0.1659, "step": 2192 }, { "epoch": 3.54281098546042, "grad_norm": 2.9508195157051498, "learning_rate": 3.6196379990075483e-06, "loss": 0.1444, "step": 2193 }, { "epoch": 3.5444264943457187, "grad_norm": 3.476518219801498, "learning_rate": 3.6184997189285884e-06, "loss": 0.1703, "step": 2194 }, { "epoch": 3.546042003231018, "grad_norm": 3.172909562293003, "learning_rate": 3.6173611488714715e-06, "loss": 0.162, "step": 2195 }, { "epoch": 3.5476575121163165, "grad_norm": 3.26598287451548, "learning_rate": 3.6162222891313774e-06, "loss": 0.1803, "step": 2196 }, { "epoch": 3.5492730210016155, "grad_norm": 3.27588984507208, "learning_rate": 3.6150831400035628e-06, "loss": 0.1836, "step": 2197 }, { "epoch": 3.5508885298869144, "grad_norm": 3.2923312230772814, "learning_rate": 3.61394370178336e-06, "loss": 0.1449, "step": 2198 }, { "epoch": 3.5525040387722133, "grad_norm": 3.3479154535137976, "learning_rate": 3.6128039747661754e-06, "loss": 0.1687, "step": 2199 }, { "epoch": 3.5541195476575123, "grad_norm": 3.3655257687251563, "learning_rate": 3.6116639592474902e-06, "loss": 0.1842, "step": 2200 }, { "epoch": 3.5541195476575123, "eval_loss": 1.2643392086029053, "eval_runtime": 2.3473, "eval_samples_per_second": 127.806, "eval_steps_per_second": 2.982, "step": 2200 }, { "epoch": 3.5557350565428107, "grad_norm": 3.174712760237604, "learning_rate": 3.610523655522861e-06, "loss": 0.1625, "step": 2201 }, { "epoch": 3.55735056542811, "grad_norm": 4.333170386420995, "learning_rate": 3.6093830638879186e-06, "loss": 0.1877, "step": 2202 }, { "epoch": 3.5589660743134086, "grad_norm": 3.6127313351416745, "learning_rate": 3.6082421846383687e-06, "loss": 0.1711, "step": 2203 }, { "epoch": 3.5605815831987075, "grad_norm": 3.102551691798011, "learning_rate": 3.607101018069991e-06, "loss": 0.1524, "step": 2204 }, { "epoch": 3.5621970920840065, "grad_norm": 3.413036052626699, "learning_rate": 3.605959564478641e-06, "loss": 0.1806, "step": 2205 }, { "epoch": 3.5638126009693054, "grad_norm": 3.5398975277359317, "learning_rate": 3.6048178241602473e-06, "loss": 0.2051, "step": 2206 }, { "epoch": 3.5654281098546043, "grad_norm": 3.679269992376448, "learning_rate": 3.603675797410814e-06, "loss": 0.156, "step": 2207 }, { "epoch": 3.567043618739903, "grad_norm": 3.13186441256956, "learning_rate": 3.602533484526417e-06, "loss": 0.1498, "step": 2208 }, { "epoch": 3.568659127625202, "grad_norm": 3.2662793273483346, "learning_rate": 3.601390885803211e-06, "loss": 0.1542, "step": 2209 }, { "epoch": 3.5702746365105007, "grad_norm": 2.729326466766825, "learning_rate": 3.6002480015374185e-06, "loss": 0.1501, "step": 2210 }, { "epoch": 3.5718901453957996, "grad_norm": 4.037090161695907, "learning_rate": 3.599104832025343e-06, "loss": 0.1789, "step": 2211 }, { "epoch": 3.5735056542810986, "grad_norm": 3.278212280732828, "learning_rate": 3.5979613775633566e-06, "loss": 0.2086, "step": 2212 }, { "epoch": 3.5751211631663975, "grad_norm": 3.229434779221099, "learning_rate": 3.596817638447907e-06, "loss": 0.1727, "step": 2213 }, { "epoch": 3.5767366720516964, "grad_norm": 3.462545025808373, "learning_rate": 3.5956736149755168e-06, "loss": 0.1645, "step": 2214 }, { "epoch": 3.578352180936995, "grad_norm": 3.714082065181985, "learning_rate": 3.59452930744278e-06, "loss": 0.1735, "step": 2215 }, { "epoch": 3.5799676898222943, "grad_norm": 3.308426308551786, "learning_rate": 3.5933847161463662e-06, "loss": 0.1676, "step": 2216 }, { "epoch": 3.581583198707593, "grad_norm": 3.2669370174122667, "learning_rate": 3.5922398413830195e-06, "loss": 0.1773, "step": 2217 }, { "epoch": 3.5831987075928917, "grad_norm": 3.2701460335450783, "learning_rate": 3.591094683449553e-06, "loss": 0.1608, "step": 2218 }, { "epoch": 3.5848142164781907, "grad_norm": 4.488792564664496, "learning_rate": 3.589949242642859e-06, "loss": 0.1954, "step": 2219 }, { "epoch": 3.5864297253634896, "grad_norm": 3.2893454457240607, "learning_rate": 3.588803519259898e-06, "loss": 0.1542, "step": 2220 }, { "epoch": 3.5880452342487885, "grad_norm": 2.9532829993668113, "learning_rate": 3.5876575135977083e-06, "loss": 0.1552, "step": 2221 }, { "epoch": 3.589660743134087, "grad_norm": 3.5327873279207602, "learning_rate": 3.5865112259533973e-06, "loss": 0.1944, "step": 2222 }, { "epoch": 3.5912762520193864, "grad_norm": 3.2809291043136017, "learning_rate": 3.5853646566241486e-06, "loss": 0.1199, "step": 2223 }, { "epoch": 3.592891760904685, "grad_norm": 2.9183566638154437, "learning_rate": 3.584217805907216e-06, "loss": 0.163, "step": 2224 }, { "epoch": 3.594507269789984, "grad_norm": 2.982757770912894, "learning_rate": 3.5830706740999297e-06, "loss": 0.151, "step": 2225 }, { "epoch": 3.5961227786752827, "grad_norm": 3.0536855808934806, "learning_rate": 3.58192326149969e-06, "loss": 0.1768, "step": 2226 }, { "epoch": 3.5977382875605817, "grad_norm": 3.159960399796372, "learning_rate": 3.5807755684039707e-06, "loss": 0.1387, "step": 2227 }, { "epoch": 3.5993537964458806, "grad_norm": 3.5743310011318212, "learning_rate": 3.5796275951103187e-06, "loss": 0.1576, "step": 2228 }, { "epoch": 3.600969305331179, "grad_norm": 3.1951176967054207, "learning_rate": 3.5784793419163537e-06, "loss": 0.1699, "step": 2229 }, { "epoch": 3.602584814216478, "grad_norm": 3.5168474620129606, "learning_rate": 3.577330809119768e-06, "loss": 0.1632, "step": 2230 }, { "epoch": 3.604200323101777, "grad_norm": 3.816246234578784, "learning_rate": 3.5761819970183247e-06, "loss": 0.1967, "step": 2231 }, { "epoch": 3.605815831987076, "grad_norm": 3.1925254536535355, "learning_rate": 3.5750329059098615e-06, "loss": 0.1752, "step": 2232 }, { "epoch": 3.607431340872375, "grad_norm": 2.905628004394664, "learning_rate": 3.5738835360922874e-06, "loss": 0.1524, "step": 2233 }, { "epoch": 3.6090468497576738, "grad_norm": 3.612226734524439, "learning_rate": 3.5727338878635843e-06, "loss": 0.1603, "step": 2234 }, { "epoch": 3.6106623586429727, "grad_norm": 3.4879582326299094, "learning_rate": 3.571583961521805e-06, "loss": 0.181, "step": 2235 }, { "epoch": 3.612277867528271, "grad_norm": 3.738458090032055, "learning_rate": 3.570433757365076e-06, "loss": 0.2032, "step": 2236 }, { "epoch": 3.61389337641357, "grad_norm": 3.400649313061845, "learning_rate": 3.5692832756915942e-06, "loss": 0.1904, "step": 2237 }, { "epoch": 3.615508885298869, "grad_norm": 3.326775488197859, "learning_rate": 3.5681325167996296e-06, "loss": 0.1828, "step": 2238 }, { "epoch": 3.617124394184168, "grad_norm": 3.943173917114514, "learning_rate": 3.566981480987524e-06, "loss": 0.2197, "step": 2239 }, { "epoch": 3.618739903069467, "grad_norm": 3.109038013803076, "learning_rate": 3.5658301685536906e-06, "loss": 0.1616, "step": 2240 }, { "epoch": 3.620355411954766, "grad_norm": 3.198138821092137, "learning_rate": 3.5646785797966144e-06, "loss": 0.1671, "step": 2241 }, { "epoch": 3.621970920840065, "grad_norm": 3.339930474110946, "learning_rate": 3.5635267150148526e-06, "loss": 0.1801, "step": 2242 }, { "epoch": 3.6235864297253633, "grad_norm": 3.395674185714543, "learning_rate": 3.5623745745070325e-06, "loss": 0.1598, "step": 2243 }, { "epoch": 3.625201938610662, "grad_norm": 3.04994455409612, "learning_rate": 3.561222158571855e-06, "loss": 0.151, "step": 2244 }, { "epoch": 3.626817447495961, "grad_norm": 3.1662735436529172, "learning_rate": 3.5600694675080905e-06, "loss": 0.1579, "step": 2245 }, { "epoch": 3.62843295638126, "grad_norm": 2.993706824461059, "learning_rate": 3.558916501614581e-06, "loss": 0.1473, "step": 2246 }, { "epoch": 3.630048465266559, "grad_norm": 3.2926203578815114, "learning_rate": 3.557763261190241e-06, "loss": 0.152, "step": 2247 }, { "epoch": 3.631663974151858, "grad_norm": 2.889548233307333, "learning_rate": 3.5566097465340555e-06, "loss": 0.1417, "step": 2248 }, { "epoch": 3.633279483037157, "grad_norm": 3.3944500390830306, "learning_rate": 3.5554559579450803e-06, "loss": 0.1471, "step": 2249 }, { "epoch": 3.6348949919224554, "grad_norm": 3.1902249438386945, "learning_rate": 3.5543018957224422e-06, "loss": 0.1721, "step": 2250 }, { "epoch": 3.6365105008077543, "grad_norm": 3.461832038667998, "learning_rate": 3.5531475601653396e-06, "loss": 0.1618, "step": 2251 }, { "epoch": 3.638126009693053, "grad_norm": 3.4342858236427913, "learning_rate": 3.5519929515730406e-06, "loss": 0.1335, "step": 2252 }, { "epoch": 3.639741518578352, "grad_norm": 3.666643833151781, "learning_rate": 3.550838070244886e-06, "loss": 0.1419, "step": 2253 }, { "epoch": 3.641357027463651, "grad_norm": 3.22012687894018, "learning_rate": 3.549682916480285e-06, "loss": 0.1582, "step": 2254 }, { "epoch": 3.64297253634895, "grad_norm": 3.2010600970448793, "learning_rate": 3.5485274905787187e-06, "loss": 0.1623, "step": 2255 }, { "epoch": 3.644588045234249, "grad_norm": 3.5044628984154285, "learning_rate": 3.5473717928397394e-06, "loss": 0.16, "step": 2256 }, { "epoch": 3.6462035541195474, "grad_norm": 3.747272568691585, "learning_rate": 3.5462158235629685e-06, "loss": 0.1733, "step": 2257 }, { "epoch": 3.6478190630048464, "grad_norm": 3.2516628091655595, "learning_rate": 3.545059583048098e-06, "loss": 0.1585, "step": 2258 }, { "epoch": 3.6494345718901453, "grad_norm": 5.438285587125824, "learning_rate": 3.5439030715948906e-06, "loss": 0.2081, "step": 2259 }, { "epoch": 3.6510500807754442, "grad_norm": 3.689829222750308, "learning_rate": 3.5427462895031813e-06, "loss": 0.1928, "step": 2260 }, { "epoch": 3.652665589660743, "grad_norm": 3.310158796038799, "learning_rate": 3.5415892370728706e-06, "loss": 0.1434, "step": 2261 }, { "epoch": 3.654281098546042, "grad_norm": 3.0995925470351304, "learning_rate": 3.5404319146039323e-06, "loss": 0.1336, "step": 2262 }, { "epoch": 3.655896607431341, "grad_norm": 3.876188941151889, "learning_rate": 3.5392743223964105e-06, "loss": 0.1715, "step": 2263 }, { "epoch": 3.6575121163166395, "grad_norm": 3.3863817416820567, "learning_rate": 3.5381164607504183e-06, "loss": 0.1669, "step": 2264 }, { "epoch": 3.6591276252019385, "grad_norm": 3.1846155832573286, "learning_rate": 3.536958329966137e-06, "loss": 0.1568, "step": 2265 }, { "epoch": 3.6607431340872374, "grad_norm": 4.01860814483179, "learning_rate": 3.5357999303438213e-06, "loss": 0.2206, "step": 2266 }, { "epoch": 3.6623586429725363, "grad_norm": 3.0464168046107503, "learning_rate": 3.534641262183792e-06, "loss": 0.1633, "step": 2267 }, { "epoch": 3.6639741518578353, "grad_norm": 4.852821867648468, "learning_rate": 3.5334823257864433e-06, "loss": 0.1916, "step": 2268 }, { "epoch": 3.665589660743134, "grad_norm": 2.8478711644251598, "learning_rate": 3.532323121452235e-06, "loss": 0.1281, "step": 2269 }, { "epoch": 3.667205169628433, "grad_norm": 3.506166491988056, "learning_rate": 3.5311636494816987e-06, "loss": 0.1871, "step": 2270 }, { "epoch": 3.6688206785137316, "grad_norm": 3.938567935915337, "learning_rate": 3.530003910175436e-06, "loss": 0.1825, "step": 2271 }, { "epoch": 3.6704361873990305, "grad_norm": 3.421787341821498, "learning_rate": 3.528843903834115e-06, "loss": 0.1531, "step": 2272 }, { "epoch": 3.6720516962843295, "grad_norm": 4.305868798138057, "learning_rate": 3.527683630758475e-06, "loss": 0.1902, "step": 2273 }, { "epoch": 3.6736672051696284, "grad_norm": 3.487136643871255, "learning_rate": 3.5265230912493246e-06, "loss": 0.1992, "step": 2274 }, { "epoch": 3.6752827140549273, "grad_norm": 3.0085563412806, "learning_rate": 3.5253622856075416e-06, "loss": 0.1534, "step": 2275 }, { "epoch": 3.6768982229402263, "grad_norm": 4.050243919934574, "learning_rate": 3.524201214134071e-06, "loss": 0.1935, "step": 2276 }, { "epoch": 3.678513731825525, "grad_norm": 3.170874212387338, "learning_rate": 3.523039877129929e-06, "loss": 0.1496, "step": 2277 }, { "epoch": 3.6801292407108237, "grad_norm": 3.3642057789756694, "learning_rate": 3.521878274896199e-06, "loss": 0.1939, "step": 2278 }, { "epoch": 3.6817447495961226, "grad_norm": 3.3628731637867917, "learning_rate": 3.520716407734034e-06, "loss": 0.1821, "step": 2279 }, { "epoch": 3.6833602584814216, "grad_norm": 3.423457016549564, "learning_rate": 3.5195542759446547e-06, "loss": 0.1854, "step": 2280 }, { "epoch": 3.6849757673667205, "grad_norm": 3.2382804919856283, "learning_rate": 3.5183918798293528e-06, "loss": 0.1656, "step": 2281 }, { "epoch": 3.6865912762520194, "grad_norm": 3.0525488589595113, "learning_rate": 3.5172292196894854e-06, "loss": 0.1207, "step": 2282 }, { "epoch": 3.6882067851373184, "grad_norm": 3.3838128573651063, "learning_rate": 3.51606629582648e-06, "loss": 0.1568, "step": 2283 }, { "epoch": 3.6898222940226173, "grad_norm": 3.6352667043117033, "learning_rate": 3.5149031085418324e-06, "loss": 0.2182, "step": 2284 }, { "epoch": 3.691437802907916, "grad_norm": 3.4079378986359408, "learning_rate": 3.5137396581371063e-06, "loss": 0.1268, "step": 2285 }, { "epoch": 3.6930533117932147, "grad_norm": 3.69277870581677, "learning_rate": 3.512575944913933e-06, "loss": 0.1554, "step": 2286 }, { "epoch": 3.6946688206785137, "grad_norm": 3.4297790932149366, "learning_rate": 3.5114119691740124e-06, "loss": 0.1751, "step": 2287 }, { "epoch": 3.6962843295638126, "grad_norm": 3.858526471591439, "learning_rate": 3.510247731219113e-06, "loss": 0.1772, "step": 2288 }, { "epoch": 3.6978998384491115, "grad_norm": 3.7418892403890385, "learning_rate": 3.509083231351071e-06, "loss": 0.1588, "step": 2289 }, { "epoch": 3.6995153473344105, "grad_norm": 3.786005541634276, "learning_rate": 3.5079184698717903e-06, "loss": 0.1782, "step": 2290 }, { "epoch": 3.7011308562197094, "grad_norm": 3.9396433915265074, "learning_rate": 3.5067534470832424e-06, "loss": 0.1649, "step": 2291 }, { "epoch": 3.702746365105008, "grad_norm": 2.987652663377894, "learning_rate": 3.5055881632874676e-06, "loss": 0.1496, "step": 2292 }, { "epoch": 3.704361873990307, "grad_norm": 4.106900395818699, "learning_rate": 3.5044226187865725e-06, "loss": 0.1987, "step": 2293 }, { "epoch": 3.7059773828756057, "grad_norm": 3.7798806396918176, "learning_rate": 3.5032568138827323e-06, "loss": 0.1581, "step": 2294 }, { "epoch": 3.7075928917609047, "grad_norm": 3.1195954203206626, "learning_rate": 3.5020907488781887e-06, "loss": 0.2085, "step": 2295 }, { "epoch": 3.7092084006462036, "grad_norm": 3.1690395026400817, "learning_rate": 3.500924424075252e-06, "loss": 0.1764, "step": 2296 }, { "epoch": 3.7108239095315025, "grad_norm": 3.28061319700633, "learning_rate": 3.4997578397762993e-06, "loss": 0.1915, "step": 2297 }, { "epoch": 3.7124394184168015, "grad_norm": 3.3377541164726225, "learning_rate": 3.498590996283775e-06, "loss": 0.1568, "step": 2298 }, { "epoch": 3.7140549273021, "grad_norm": 3.1740809918991024, "learning_rate": 3.497423893900191e-06, "loss": 0.1553, "step": 2299 }, { "epoch": 3.715670436187399, "grad_norm": 3.5963450229930345, "learning_rate": 3.4962565329281247e-06, "loss": 0.1783, "step": 2300 }, { "epoch": 3.717285945072698, "grad_norm": 3.4493124010590046, "learning_rate": 3.4950889136702226e-06, "loss": 0.1736, "step": 2301 }, { "epoch": 3.7189014539579968, "grad_norm": 3.4162284969466175, "learning_rate": 3.4939210364291986e-06, "loss": 0.1757, "step": 2302 }, { "epoch": 3.7205169628432957, "grad_norm": 4.08991846236741, "learning_rate": 3.4927529015078305e-06, "loss": 0.2203, "step": 2303 }, { "epoch": 3.7221324717285946, "grad_norm": 3.4865299278057393, "learning_rate": 3.4915845092089658e-06, "loss": 0.1914, "step": 2304 }, { "epoch": 3.7237479806138936, "grad_norm": 4.0650014036915545, "learning_rate": 3.490415859835517e-06, "loss": 0.1901, "step": 2305 }, { "epoch": 3.725363489499192, "grad_norm": 3.6023579026152253, "learning_rate": 3.489246953690464e-06, "loss": 0.1692, "step": 2306 }, { "epoch": 3.726978998384491, "grad_norm": 3.2750235835784287, "learning_rate": 3.4880777910768535e-06, "loss": 0.1532, "step": 2307 }, { "epoch": 3.72859450726979, "grad_norm": 3.886226300485572, "learning_rate": 3.4869083722977976e-06, "loss": 0.2264, "step": 2308 }, { "epoch": 3.730210016155089, "grad_norm": 3.2836399091542803, "learning_rate": 3.485738697656477e-06, "loss": 0.1816, "step": 2309 }, { "epoch": 3.731825525040388, "grad_norm": 3.2818413805692956, "learning_rate": 3.4845687674561352e-06, "loss": 0.2087, "step": 2310 }, { "epoch": 3.7334410339256867, "grad_norm": 3.5886028693456247, "learning_rate": 3.483398582000086e-06, "loss": 0.1354, "step": 2311 }, { "epoch": 3.7350565428109856, "grad_norm": 3.223197444951783, "learning_rate": 3.4822281415917065e-06, "loss": 0.1919, "step": 2312 }, { "epoch": 3.736672051696284, "grad_norm": 3.015412900343749, "learning_rate": 3.481057446534441e-06, "loss": 0.1447, "step": 2313 }, { "epoch": 3.738287560581583, "grad_norm": 3.0308019955202, "learning_rate": 3.479886497131799e-06, "loss": 0.1996, "step": 2314 }, { "epoch": 3.739903069466882, "grad_norm": 3.4641657727716053, "learning_rate": 3.4787152936873576e-06, "loss": 0.1903, "step": 2315 }, { "epoch": 3.741518578352181, "grad_norm": 3.8870103640392717, "learning_rate": 3.4775438365047587e-06, "loss": 0.1705, "step": 2316 }, { "epoch": 3.74313408723748, "grad_norm": 3.446399363120236, "learning_rate": 3.476372125887708e-06, "loss": 0.1813, "step": 2317 }, { "epoch": 3.744749596122779, "grad_norm": 4.59693590718971, "learning_rate": 3.475200162139982e-06, "loss": 0.1779, "step": 2318 }, { "epoch": 3.7463651050080777, "grad_norm": 3.4321597149675824, "learning_rate": 3.4740279455654175e-06, "loss": 0.1853, "step": 2319 }, { "epoch": 3.7479806138933762, "grad_norm": 3.2867607596044217, "learning_rate": 3.47285547646792e-06, "loss": 0.1568, "step": 2320 }, { "epoch": 3.749596122778675, "grad_norm": 3.95471728150191, "learning_rate": 3.4716827551514596e-06, "loss": 0.1605, "step": 2321 }, { "epoch": 3.751211631663974, "grad_norm": 3.4069267029512535, "learning_rate": 3.4705097819200716e-06, "loss": 0.1743, "step": 2322 }, { "epoch": 3.752827140549273, "grad_norm": 3.851088525297605, "learning_rate": 3.4693365570778566e-06, "loss": 0.1629, "step": 2323 }, { "epoch": 3.754442649434572, "grad_norm": 3.4939713915857626, "learning_rate": 3.468163080928981e-06, "loss": 0.1849, "step": 2324 }, { "epoch": 3.756058158319871, "grad_norm": 3.530544984258287, "learning_rate": 3.4669893537776743e-06, "loss": 0.1809, "step": 2325 }, { "epoch": 3.75767366720517, "grad_norm": 3.5207404109738833, "learning_rate": 3.4658153759282353e-06, "loss": 0.1751, "step": 2326 }, { "epoch": 3.7592891760904683, "grad_norm": 4.155966701496785, "learning_rate": 3.464641147685024e-06, "loss": 0.2043, "step": 2327 }, { "epoch": 3.7609046849757672, "grad_norm": 3.6582804742227277, "learning_rate": 3.463466669352466e-06, "loss": 0.1418, "step": 2328 }, { "epoch": 3.762520193861066, "grad_norm": 3.159836687664768, "learning_rate": 3.462291941235053e-06, "loss": 0.1679, "step": 2329 }, { "epoch": 3.764135702746365, "grad_norm": 3.9803886610334347, "learning_rate": 3.46111696363734e-06, "loss": 0.2033, "step": 2330 }, { "epoch": 3.765751211631664, "grad_norm": 3.5724667036738977, "learning_rate": 3.4599417368639494e-06, "loss": 0.1777, "step": 2331 }, { "epoch": 3.767366720516963, "grad_norm": 3.4063616579991733, "learning_rate": 3.4587662612195638e-06, "loss": 0.1612, "step": 2332 }, { "epoch": 3.768982229402262, "grad_norm": 3.2009978853629506, "learning_rate": 3.457590537008933e-06, "loss": 0.1627, "step": 2333 }, { "epoch": 3.7705977382875604, "grad_norm": 3.413661768277048, "learning_rate": 3.456414564536873e-06, "loss": 0.1672, "step": 2334 }, { "epoch": 3.7722132471728593, "grad_norm": 3.781825970045012, "learning_rate": 3.45523834410826e-06, "loss": 0.181, "step": 2335 }, { "epoch": 3.7738287560581583, "grad_norm": 3.61568370886633, "learning_rate": 3.454061876028037e-06, "loss": 0.1764, "step": 2336 }, { "epoch": 3.775444264943457, "grad_norm": 2.601308473848653, "learning_rate": 3.4528851606012117e-06, "loss": 0.1296, "step": 2337 }, { "epoch": 3.777059773828756, "grad_norm": 3.899875126648377, "learning_rate": 3.4517081981328544e-06, "loss": 0.2071, "step": 2338 }, { "epoch": 3.778675282714055, "grad_norm": 4.6367545797535925, "learning_rate": 3.4505309889281004e-06, "loss": 0.1671, "step": 2339 }, { "epoch": 3.780290791599354, "grad_norm": 3.276315778424701, "learning_rate": 3.449353533292148e-06, "loss": 0.1864, "step": 2340 }, { "epoch": 3.7819063004846525, "grad_norm": 3.4649817806472827, "learning_rate": 3.4481758315302612e-06, "loss": 0.1937, "step": 2341 }, { "epoch": 3.7835218093699514, "grad_norm": 3.248945235762814, "learning_rate": 3.4469978839477653e-06, "loss": 0.1955, "step": 2342 }, { "epoch": 3.7851373182552503, "grad_norm": 3.5544463553425274, "learning_rate": 3.4458196908500524e-06, "loss": 0.1943, "step": 2343 }, { "epoch": 3.7867528271405493, "grad_norm": 2.966576525140422, "learning_rate": 3.4446412525425747e-06, "loss": 0.1642, "step": 2344 }, { "epoch": 3.788368336025848, "grad_norm": 3.5421531344342347, "learning_rate": 3.4434625693308517e-06, "loss": 0.1819, "step": 2345 }, { "epoch": 3.789983844911147, "grad_norm": 3.761055976466393, "learning_rate": 3.442283641520463e-06, "loss": 0.1625, "step": 2346 }, { "epoch": 3.791599353796446, "grad_norm": 3.197743439459131, "learning_rate": 3.441104469417053e-06, "loss": 0.1584, "step": 2347 }, { "epoch": 3.7932148626817446, "grad_norm": 3.1754922311136577, "learning_rate": 3.4399250533263313e-06, "loss": 0.1911, "step": 2348 }, { "epoch": 3.7948303715670435, "grad_norm": 4.07432217805168, "learning_rate": 3.438745393554067e-06, "loss": 0.2061, "step": 2349 }, { "epoch": 3.7964458804523424, "grad_norm": 4.13563674470729, "learning_rate": 3.437565490406096e-06, "loss": 0.2147, "step": 2350 }, { "epoch": 3.7980613893376414, "grad_norm": 3.5299986391736407, "learning_rate": 3.4363853441883145e-06, "loss": 0.1824, "step": 2351 }, { "epoch": 3.7996768982229403, "grad_norm": 3.4700859451631803, "learning_rate": 3.4352049552066837e-06, "loss": 0.1789, "step": 2352 }, { "epoch": 3.8012924071082392, "grad_norm": 3.295885921775218, "learning_rate": 3.4340243237672262e-06, "loss": 0.1743, "step": 2353 }, { "epoch": 3.802907915993538, "grad_norm": 3.5166329362697586, "learning_rate": 3.432843450176029e-06, "loss": 0.1563, "step": 2354 }, { "epoch": 3.8045234248788367, "grad_norm": 3.722522569955789, "learning_rate": 3.4316623347392404e-06, "loss": 0.1926, "step": 2355 }, { "epoch": 3.8061389337641356, "grad_norm": 3.31016637470691, "learning_rate": 3.4304809777630724e-06, "loss": 0.1655, "step": 2356 }, { "epoch": 3.8077544426494345, "grad_norm": 3.285224898800796, "learning_rate": 3.4292993795537998e-06, "loss": 0.2083, "step": 2357 }, { "epoch": 3.8093699515347335, "grad_norm": 3.674624850402775, "learning_rate": 3.4281175404177587e-06, "loss": 0.1566, "step": 2358 }, { "epoch": 3.8109854604200324, "grad_norm": 3.4742153368224926, "learning_rate": 3.4269354606613486e-06, "loss": 0.1813, "step": 2359 }, { "epoch": 3.8126009693053313, "grad_norm": 2.846978994132507, "learning_rate": 3.425753140591031e-06, "loss": 0.1681, "step": 2360 }, { "epoch": 3.8142164781906303, "grad_norm": 3.6950137309229683, "learning_rate": 3.4245705805133306e-06, "loss": 0.1899, "step": 2361 }, { "epoch": 3.8158319870759287, "grad_norm": 3.305012009019456, "learning_rate": 3.423387780734833e-06, "loss": 0.1931, "step": 2362 }, { "epoch": 3.8174474959612277, "grad_norm": 3.875248200724316, "learning_rate": 3.422204741562186e-06, "loss": 0.2205, "step": 2363 }, { "epoch": 3.8190630048465266, "grad_norm": 3.7633238901056925, "learning_rate": 3.4210214633021023e-06, "loss": 0.1822, "step": 2364 }, { "epoch": 3.8206785137318255, "grad_norm": 2.8345672793878975, "learning_rate": 3.4198379462613517e-06, "loss": 0.1528, "step": 2365 }, { "epoch": 3.8222940226171245, "grad_norm": 3.7276154937301325, "learning_rate": 3.4186541907467693e-06, "loss": 0.2037, "step": 2366 }, { "epoch": 3.8239095315024234, "grad_norm": 3.3350140071634677, "learning_rate": 3.4174701970652517e-06, "loss": 0.1744, "step": 2367 }, { "epoch": 3.8255250403877223, "grad_norm": 3.446037954015599, "learning_rate": 3.416285965523756e-06, "loss": 0.2024, "step": 2368 }, { "epoch": 3.827140549273021, "grad_norm": 3.1213485317020435, "learning_rate": 3.4151014964293035e-06, "loss": 0.1525, "step": 2369 }, { "epoch": 3.8287560581583198, "grad_norm": 3.341551260512109, "learning_rate": 3.413916790088973e-06, "loss": 0.1831, "step": 2370 }, { "epoch": 3.8303715670436187, "grad_norm": 3.1254108448438824, "learning_rate": 3.4127318468099093e-06, "loss": 0.1431, "step": 2371 }, { "epoch": 3.8319870759289176, "grad_norm": 3.4134557514154125, "learning_rate": 3.4115466668993147e-06, "loss": 0.1715, "step": 2372 }, { "epoch": 3.8336025848142166, "grad_norm": 3.399298965119023, "learning_rate": 3.4103612506644557e-06, "loss": 0.2155, "step": 2373 }, { "epoch": 3.8352180936995155, "grad_norm": 3.8445564151765566, "learning_rate": 3.4091755984126585e-06, "loss": 0.1874, "step": 2374 }, { "epoch": 3.8368336025848144, "grad_norm": 3.1220272261217366, "learning_rate": 3.407989710451311e-06, "loss": 0.1651, "step": 2375 }, { "epoch": 3.838449111470113, "grad_norm": 3.4664646403468873, "learning_rate": 3.4068035870878624e-06, "loss": 0.158, "step": 2376 }, { "epoch": 3.840064620355412, "grad_norm": 4.619461606341802, "learning_rate": 3.4056172286298227e-06, "loss": 0.1841, "step": 2377 }, { "epoch": 3.841680129240711, "grad_norm": 4.583968557375734, "learning_rate": 3.4044306353847633e-06, "loss": 0.2216, "step": 2378 }, { "epoch": 3.8432956381260097, "grad_norm": 3.4631399116735593, "learning_rate": 3.403243807660315e-06, "loss": 0.1524, "step": 2379 }, { "epoch": 3.8449111470113086, "grad_norm": 3.2591469135420574, "learning_rate": 3.4020567457641716e-06, "loss": 0.1515, "step": 2380 }, { "epoch": 3.8465266558966076, "grad_norm": 4.018472240225124, "learning_rate": 3.4008694500040855e-06, "loss": 0.1953, "step": 2381 }, { "epoch": 3.8481421647819065, "grad_norm": 3.2891737487023507, "learning_rate": 3.399681920687871e-06, "loss": 0.1849, "step": 2382 }, { "epoch": 3.849757673667205, "grad_norm": 3.6128674495076156, "learning_rate": 3.3984941581234025e-06, "loss": 0.1544, "step": 2383 }, { "epoch": 3.851373182552504, "grad_norm": 3.215033024706533, "learning_rate": 3.3973061626186155e-06, "loss": 0.1458, "step": 2384 }, { "epoch": 3.852988691437803, "grad_norm": 3.2393802952141857, "learning_rate": 3.3961179344815044e-06, "loss": 0.1871, "step": 2385 }, { "epoch": 3.854604200323102, "grad_norm": 3.5810407242611983, "learning_rate": 3.3949294740201262e-06, "loss": 0.1741, "step": 2386 }, { "epoch": 3.8562197092084007, "grad_norm": 3.4284263441977543, "learning_rate": 3.3937407815425956e-06, "loss": 0.1607, "step": 2387 }, { "epoch": 3.8578352180936997, "grad_norm": 4.039235340352999, "learning_rate": 3.3925518573570893e-06, "loss": 0.1947, "step": 2388 }, { "epoch": 3.8594507269789986, "grad_norm": 3.8758038621749176, "learning_rate": 3.3913627017718435e-06, "loss": 0.21, "step": 2389 }, { "epoch": 3.861066235864297, "grad_norm": 3.185402801851314, "learning_rate": 3.3901733150951536e-06, "loss": 0.1793, "step": 2390 }, { "epoch": 3.862681744749596, "grad_norm": 3.5741189963062183, "learning_rate": 3.388983697635377e-06, "loss": 0.1943, "step": 2391 }, { "epoch": 3.864297253634895, "grad_norm": 3.765654248475129, "learning_rate": 3.3877938497009287e-06, "loss": 0.1472, "step": 2392 }, { "epoch": 3.865912762520194, "grad_norm": 3.3160627523681243, "learning_rate": 3.3866037716002843e-06, "loss": 0.1577, "step": 2393 }, { "epoch": 3.867528271405493, "grad_norm": 3.5492138088057223, "learning_rate": 3.385413463641979e-06, "loss": 0.1864, "step": 2394 }, { "epoch": 3.8691437802907918, "grad_norm": 3.2156861980386715, "learning_rate": 3.3842229261346084e-06, "loss": 0.1442, "step": 2395 }, { "epoch": 3.8707592891760907, "grad_norm": 3.818458408509696, "learning_rate": 3.383032159386826e-06, "loss": 0.1815, "step": 2396 }, { "epoch": 3.872374798061389, "grad_norm": 3.255325689009682, "learning_rate": 3.381841163707346e-06, "loss": 0.2022, "step": 2397 }, { "epoch": 3.873990306946688, "grad_norm": 3.0190869567739456, "learning_rate": 3.3806499394049418e-06, "loss": 0.1788, "step": 2398 }, { "epoch": 3.875605815831987, "grad_norm": 3.39459555084932, "learning_rate": 3.379458486788446e-06, "loss": 0.216, "step": 2399 }, { "epoch": 3.877221324717286, "grad_norm": 3.342872872243635, "learning_rate": 3.37826680616675e-06, "loss": 0.135, "step": 2400 }, { "epoch": 3.877221324717286, "eval_loss": 1.2557854652404785, "eval_runtime": 2.3479, "eval_samples_per_second": 127.774, "eval_steps_per_second": 2.981, "step": 2400 }, { "epoch": 3.878836833602585, "grad_norm": 3.41467858328468, "learning_rate": 3.377074897848805e-06, "loss": 0.2051, "step": 2401 }, { "epoch": 3.880452342487884, "grad_norm": 3.4969687133659964, "learning_rate": 3.3758827621436196e-06, "loss": 0.1851, "step": 2402 }, { "epoch": 3.8820678513731828, "grad_norm": 2.5262566371678923, "learning_rate": 3.3746903993602644e-06, "loss": 0.1235, "step": 2403 }, { "epoch": 3.8836833602584813, "grad_norm": 3.165267667402732, "learning_rate": 3.3734978098078653e-06, "loss": 0.1491, "step": 2404 }, { "epoch": 3.88529886914378, "grad_norm": 3.782793935360498, "learning_rate": 3.3723049937956097e-06, "loss": 0.2348, "step": 2405 }, { "epoch": 3.886914378029079, "grad_norm": 2.9545376230180196, "learning_rate": 3.3711119516327417e-06, "loss": 0.1332, "step": 2406 }, { "epoch": 3.888529886914378, "grad_norm": 4.130157915178056, "learning_rate": 3.369918683628567e-06, "loss": 0.1684, "step": 2407 }, { "epoch": 3.890145395799677, "grad_norm": 3.255480219321519, "learning_rate": 3.368725190092446e-06, "loss": 0.1745, "step": 2408 }, { "epoch": 3.891760904684976, "grad_norm": 3.711660923563174, "learning_rate": 3.3675314713338003e-06, "loss": 0.1768, "step": 2409 }, { "epoch": 3.893376413570275, "grad_norm": 3.3331731243321037, "learning_rate": 3.3663375276621093e-06, "loss": 0.1683, "step": 2410 }, { "epoch": 3.8949919224555734, "grad_norm": 3.1834948072422025, "learning_rate": 3.36514335938691e-06, "loss": 0.1513, "step": 2411 }, { "epoch": 3.8966074313408723, "grad_norm": 3.265465488929365, "learning_rate": 3.3639489668177977e-06, "loss": 0.1662, "step": 2412 }, { "epoch": 3.898222940226171, "grad_norm": 3.410664800650834, "learning_rate": 3.3627543502644276e-06, "loss": 0.1583, "step": 2413 }, { "epoch": 3.89983844911147, "grad_norm": 3.6808672960439797, "learning_rate": 3.3615595100365105e-06, "loss": 0.1815, "step": 2414 }, { "epoch": 3.901453957996769, "grad_norm": 3.4378805096120058, "learning_rate": 3.360364446443817e-06, "loss": 0.1749, "step": 2415 }, { "epoch": 3.903069466882068, "grad_norm": 3.608457701740676, "learning_rate": 3.3591691597961745e-06, "loss": 0.179, "step": 2416 }, { "epoch": 3.904684975767367, "grad_norm": 4.132068614905824, "learning_rate": 3.3579736504034683e-06, "loss": 0.1424, "step": 2417 }, { "epoch": 3.9063004846526654, "grad_norm": 3.0520974207975944, "learning_rate": 3.3567779185756427e-06, "loss": 0.1746, "step": 2418 }, { "epoch": 3.9079159935379644, "grad_norm": 3.485124901599522, "learning_rate": 3.3555819646226983e-06, "loss": 0.1813, "step": 2419 }, { "epoch": 3.9095315024232633, "grad_norm": 3.5480234982107395, "learning_rate": 3.3543857888546936e-06, "loss": 0.2231, "step": 2420 }, { "epoch": 3.9111470113085622, "grad_norm": 3.9492876900997995, "learning_rate": 3.3531893915817453e-06, "loss": 0.1835, "step": 2421 }, { "epoch": 3.912762520193861, "grad_norm": 3.7502746643842833, "learning_rate": 3.3519927731140265e-06, "loss": 0.1638, "step": 2422 }, { "epoch": 3.9143780290791597, "grad_norm": 3.2267923889049794, "learning_rate": 3.3507959337617678e-06, "loss": 0.1782, "step": 2423 }, { "epoch": 3.915993537964459, "grad_norm": 4.48149943366299, "learning_rate": 3.3495988738352593e-06, "loss": 0.194, "step": 2424 }, { "epoch": 3.9176090468497575, "grad_norm": 3.2576871636167257, "learning_rate": 3.3484015936448442e-06, "loss": 0.1777, "step": 2425 }, { "epoch": 3.9192245557350565, "grad_norm": 3.7078747589194383, "learning_rate": 3.3472040935009254e-06, "loss": 0.2152, "step": 2426 }, { "epoch": 3.9208400646203554, "grad_norm": 3.6918930870941575, "learning_rate": 3.3460063737139636e-06, "loss": 0.1704, "step": 2427 }, { "epoch": 3.9224555735056543, "grad_norm": 4.205468055405657, "learning_rate": 3.3448084345944736e-06, "loss": 0.2147, "step": 2428 }, { "epoch": 3.9240710823909533, "grad_norm": 3.5468316214355147, "learning_rate": 3.3436102764530306e-06, "loss": 0.1845, "step": 2429 }, { "epoch": 3.9256865912762517, "grad_norm": 3.354479591035662, "learning_rate": 3.3424118996002626e-06, "loss": 0.191, "step": 2430 }, { "epoch": 3.927302100161551, "grad_norm": 3.313199230351379, "learning_rate": 3.341213304346858e-06, "loss": 0.1609, "step": 2431 }, { "epoch": 3.9289176090468496, "grad_norm": 3.1305713139650124, "learning_rate": 3.3400144910035596e-06, "loss": 0.1475, "step": 2432 }, { "epoch": 3.9305331179321485, "grad_norm": 3.6846385493437372, "learning_rate": 3.3388154598811683e-06, "loss": 0.1962, "step": 2433 }, { "epoch": 3.9321486268174475, "grad_norm": 3.6624360555452316, "learning_rate": 3.337616211290539e-06, "loss": 0.1825, "step": 2434 }, { "epoch": 3.9337641357027464, "grad_norm": 3.04963256244289, "learning_rate": 3.336416745542586e-06, "loss": 0.1555, "step": 2435 }, { "epoch": 3.9353796445880453, "grad_norm": 3.8741654240035133, "learning_rate": 3.3352170629482772e-06, "loss": 0.1733, "step": 2436 }, { "epoch": 3.936995153473344, "grad_norm": 4.216106403297842, "learning_rate": 3.3340171638186386e-06, "loss": 0.1695, "step": 2437 }, { "epoch": 3.938610662358643, "grad_norm": 3.196552249510656, "learning_rate": 3.3328170484647514e-06, "loss": 0.1552, "step": 2438 }, { "epoch": 3.9402261712439417, "grad_norm": 3.5536400836309383, "learning_rate": 3.331616717197754e-06, "loss": 0.2044, "step": 2439 }, { "epoch": 3.9418416801292406, "grad_norm": 3.1592437146666486, "learning_rate": 3.3304161703288384e-06, "loss": 0.1621, "step": 2440 }, { "epoch": 3.9434571890145396, "grad_norm": 3.52926840283299, "learning_rate": 3.329215408169255e-06, "loss": 0.1599, "step": 2441 }, { "epoch": 3.9450726978998385, "grad_norm": 3.490653961481053, "learning_rate": 3.3280144310303086e-06, "loss": 0.1659, "step": 2442 }, { "epoch": 3.9466882067851374, "grad_norm": 2.9928330229296645, "learning_rate": 3.3268132392233604e-06, "loss": 0.1429, "step": 2443 }, { "epoch": 3.948303715670436, "grad_norm": 3.1182677426310383, "learning_rate": 3.3256118330598276e-06, "loss": 0.2109, "step": 2444 }, { "epoch": 3.9499192245557353, "grad_norm": 3.6928113083182312, "learning_rate": 3.3244102128511805e-06, "loss": 0.1897, "step": 2445 }, { "epoch": 3.951534733441034, "grad_norm": 3.6183492347752355, "learning_rate": 3.323208378908949e-06, "loss": 0.189, "step": 2446 }, { "epoch": 3.9531502423263327, "grad_norm": 3.1747675845580585, "learning_rate": 3.3220063315447145e-06, "loss": 0.2047, "step": 2447 }, { "epoch": 3.9547657512116317, "grad_norm": 3.700067937705417, "learning_rate": 3.3208040710701166e-06, "loss": 0.1879, "step": 2448 }, { "epoch": 3.9563812600969306, "grad_norm": 3.3742049212044964, "learning_rate": 3.319601597796848e-06, "loss": 0.1769, "step": 2449 }, { "epoch": 3.9579967689822295, "grad_norm": 3.1785765181122563, "learning_rate": 3.3183989120366585e-06, "loss": 0.1993, "step": 2450 }, { "epoch": 3.959612277867528, "grad_norm": 3.102309548866693, "learning_rate": 3.3171960141013516e-06, "loss": 0.1517, "step": 2451 }, { "epoch": 3.9612277867528274, "grad_norm": 3.7228860294488855, "learning_rate": 3.315992904302786e-06, "loss": 0.1741, "step": 2452 }, { "epoch": 3.962843295638126, "grad_norm": 3.188236473123566, "learning_rate": 3.3147895829528753e-06, "loss": 0.1668, "step": 2453 }, { "epoch": 3.964458804523425, "grad_norm": 3.52372238645286, "learning_rate": 3.3135860503635892e-06, "loss": 0.1839, "step": 2454 }, { "epoch": 3.9660743134087237, "grad_norm": 3.4657038043243777, "learning_rate": 3.31238230684695e-06, "loss": 0.1859, "step": 2455 }, { "epoch": 3.9676898222940227, "grad_norm": 3.6632818486067933, "learning_rate": 3.3111783527150364e-06, "loss": 0.1811, "step": 2456 }, { "epoch": 3.9693053311793216, "grad_norm": 3.3319867004673593, "learning_rate": 3.3099741882799817e-06, "loss": 0.1444, "step": 2457 }, { "epoch": 3.97092084006462, "grad_norm": 3.0273498026719223, "learning_rate": 3.3087698138539728e-06, "loss": 0.1797, "step": 2458 }, { "epoch": 3.9725363489499195, "grad_norm": 4.588329383732215, "learning_rate": 3.307565229749251e-06, "loss": 0.1934, "step": 2459 }, { "epoch": 3.974151857835218, "grad_norm": 3.349892683628265, "learning_rate": 3.3063604362781125e-06, "loss": 0.162, "step": 2460 }, { "epoch": 3.975767366720517, "grad_norm": 3.1357827040700474, "learning_rate": 3.305155433752908e-06, "loss": 0.1483, "step": 2461 }, { "epoch": 3.977382875605816, "grad_norm": 3.5477450605562577, "learning_rate": 3.303950222486042e-06, "loss": 0.1685, "step": 2462 }, { "epoch": 3.9789983844911148, "grad_norm": 3.487303764536278, "learning_rate": 3.3027448027899733e-06, "loss": 0.1497, "step": 2463 }, { "epoch": 3.9806138933764137, "grad_norm": 3.5399120149095906, "learning_rate": 3.301539174977214e-06, "loss": 0.164, "step": 2464 }, { "epoch": 3.982229402261712, "grad_norm": 3.4546416659182824, "learning_rate": 3.3003333393603315e-06, "loss": 0.1792, "step": 2465 }, { "epoch": 3.9838449111470116, "grad_norm": 4.1361037697542296, "learning_rate": 3.2991272962519455e-06, "loss": 0.1926, "step": 2466 }, { "epoch": 3.98546042003231, "grad_norm": 2.8974821798598343, "learning_rate": 3.2979210459647314e-06, "loss": 0.1827, "step": 2467 }, { "epoch": 3.987075928917609, "grad_norm": 3.346898185624, "learning_rate": 3.2967145888114166e-06, "loss": 0.1722, "step": 2468 }, { "epoch": 3.988691437802908, "grad_norm": 4.241892081864668, "learning_rate": 3.295507925104783e-06, "loss": 0.187, "step": 2469 }, { "epoch": 3.990306946688207, "grad_norm": 3.421309062616269, "learning_rate": 3.2943010551576655e-06, "loss": 0.1769, "step": 2470 }, { "epoch": 3.991922455573506, "grad_norm": 3.4938443438144215, "learning_rate": 3.293093979282953e-06, "loss": 0.16, "step": 2471 }, { "epoch": 3.9935379644588043, "grad_norm": 3.2522537516234533, "learning_rate": 3.2918866977935874e-06, "loss": 0.1584, "step": 2472 }, { "epoch": 3.9951534733441036, "grad_norm": 3.8388632332836234, "learning_rate": 3.2906792110025654e-06, "loss": 0.1746, "step": 2473 }, { "epoch": 3.996768982229402, "grad_norm": 3.498483281102831, "learning_rate": 3.2894715192229337e-06, "loss": 0.1907, "step": 2474 }, { "epoch": 3.998384491114701, "grad_norm": 3.6749563039635325, "learning_rate": 3.2882636227677946e-06, "loss": 0.2065, "step": 2475 }, { "epoch": 4.0, "grad_norm": 3.3334239314982526, "learning_rate": 3.287055521950304e-06, "loss": 0.1551, "step": 2476 }, { "epoch": 4.0016155088852985, "grad_norm": 2.4425151686126494, "learning_rate": 3.2858472170836682e-06, "loss": 0.0714, "step": 2477 }, { "epoch": 4.003231017770598, "grad_norm": 2.647491306498951, "learning_rate": 3.2846387084811487e-06, "loss": 0.0931, "step": 2478 }, { "epoch": 4.004846526655896, "grad_norm": 2.378793488305172, "learning_rate": 3.2834299964560596e-06, "loss": 0.0679, "step": 2479 }, { "epoch": 4.006462035541196, "grad_norm": 2.6357277602880353, "learning_rate": 3.282221081321766e-06, "loss": 0.0749, "step": 2480 }, { "epoch": 4.008077544426494, "grad_norm": 2.272260579465781, "learning_rate": 3.2810119633916874e-06, "loss": 0.0679, "step": 2481 }, { "epoch": 4.009693053311794, "grad_norm": 2.5700716821179794, "learning_rate": 3.2798026429792963e-06, "loss": 0.0638, "step": 2482 }, { "epoch": 4.011308562197092, "grad_norm": 2.7604319444515113, "learning_rate": 3.2785931203981142e-06, "loss": 0.0641, "step": 2483 }, { "epoch": 4.012924071082391, "grad_norm": 3.069488316286541, "learning_rate": 3.2773833959617197e-06, "loss": 0.0655, "step": 2484 }, { "epoch": 4.01453957996769, "grad_norm": 2.794927210861604, "learning_rate": 3.276173469983741e-06, "loss": 0.062, "step": 2485 }, { "epoch": 4.016155088852988, "grad_norm": 2.976312681788636, "learning_rate": 3.274963342777859e-06, "loss": 0.0638, "step": 2486 }, { "epoch": 4.017770597738288, "grad_norm": 2.810516922350264, "learning_rate": 3.2737530146578066e-06, "loss": 0.0584, "step": 2487 }, { "epoch": 4.019386106623586, "grad_norm": 2.655853352117199, "learning_rate": 3.272542485937369e-06, "loss": 0.0577, "step": 2488 }, { "epoch": 4.021001615508886, "grad_norm": 3.175186890619602, "learning_rate": 3.2713317569303843e-06, "loss": 0.0654, "step": 2489 }, { "epoch": 4.022617124394184, "grad_norm": 2.843129596978916, "learning_rate": 3.27012082795074e-06, "loss": 0.0768, "step": 2490 }, { "epoch": 4.024232633279483, "grad_norm": 2.859447572097875, "learning_rate": 3.2689096993123797e-06, "loss": 0.0637, "step": 2491 }, { "epoch": 4.025848142164782, "grad_norm": 3.026337270470383, "learning_rate": 3.2676983713292936e-06, "loss": 0.0727, "step": 2492 }, { "epoch": 4.0274636510500805, "grad_norm": 2.7298056230658814, "learning_rate": 3.2664868443155284e-06, "loss": 0.0554, "step": 2493 }, { "epoch": 4.02907915993538, "grad_norm": 2.192274521884983, "learning_rate": 3.2652751185851783e-06, "loss": 0.0615, "step": 2494 }, { "epoch": 4.030694668820678, "grad_norm": 1.9810552622701285, "learning_rate": 3.264063194452392e-06, "loss": 0.0507, "step": 2495 }, { "epoch": 4.032310177705978, "grad_norm": 2.846200673228709, "learning_rate": 3.2628510722313676e-06, "loss": 0.0759, "step": 2496 }, { "epoch": 4.033925686591276, "grad_norm": 2.234607679419766, "learning_rate": 3.2616387522363567e-06, "loss": 0.0689, "step": 2497 }, { "epoch": 4.035541195476575, "grad_norm": 2.271917573382841, "learning_rate": 3.2604262347816605e-06, "loss": 0.063, "step": 2498 }, { "epoch": 4.037156704361874, "grad_norm": 2.907170027180095, "learning_rate": 3.259213520181632e-06, "loss": 0.084, "step": 2499 }, { "epoch": 4.038772213247173, "grad_norm": 2.416322756576027, "learning_rate": 3.2580006087506745e-06, "loss": 0.0637, "step": 2500 }, { "epoch": 4.040387722132472, "grad_norm": 2.362587147059952, "learning_rate": 3.2567875008032433e-06, "loss": 0.0694, "step": 2501 }, { "epoch": 4.0420032310177705, "grad_norm": 2.115290557788558, "learning_rate": 3.255574196653844e-06, "loss": 0.046, "step": 2502 }, { "epoch": 4.04361873990307, "grad_norm": 3.1985552343985044, "learning_rate": 3.2543606966170346e-06, "loss": 0.0575, "step": 2503 }, { "epoch": 4.045234248788368, "grad_norm": 2.619686022295049, "learning_rate": 3.2531470010074217e-06, "loss": 0.0648, "step": 2504 }, { "epoch": 4.046849757673667, "grad_norm": 2.660167864336397, "learning_rate": 3.2519331101396632e-06, "loss": 0.0609, "step": 2505 }, { "epoch": 4.048465266558966, "grad_norm": 2.7511436124302415, "learning_rate": 3.2507190243284697e-06, "loss": 0.0563, "step": 2506 }, { "epoch": 4.050080775444265, "grad_norm": 2.872783339494083, "learning_rate": 3.2495047438885985e-06, "loss": 0.0725, "step": 2507 }, { "epoch": 4.051696284329564, "grad_norm": 2.9862587359013237, "learning_rate": 3.2482902691348617e-06, "loss": 0.0689, "step": 2508 }, { "epoch": 4.053311793214863, "grad_norm": 2.9929867757923283, "learning_rate": 3.247075600382118e-06, "loss": 0.0745, "step": 2509 }, { "epoch": 4.054927302100162, "grad_norm": 3.4031439925404396, "learning_rate": 3.2458607379452785e-06, "loss": 0.088, "step": 2510 }, { "epoch": 4.05654281098546, "grad_norm": 2.437279326763499, "learning_rate": 3.244645682139304e-06, "loss": 0.0595, "step": 2511 }, { "epoch": 4.058158319870759, "grad_norm": 2.534049505472988, "learning_rate": 3.243430433279206e-06, "loss": 0.0566, "step": 2512 }, { "epoch": 4.059773828756058, "grad_norm": 3.049745069654658, "learning_rate": 3.2422149916800437e-06, "loss": 0.0703, "step": 2513 }, { "epoch": 4.061389337641357, "grad_norm": 2.366589280603187, "learning_rate": 3.2409993576569305e-06, "loss": 0.0728, "step": 2514 }, { "epoch": 4.063004846526656, "grad_norm": 2.5867849164048935, "learning_rate": 3.2397835315250253e-06, "loss": 0.0568, "step": 2515 }, { "epoch": 4.064620355411955, "grad_norm": 2.574057982030481, "learning_rate": 3.2385675135995397e-06, "loss": 0.0471, "step": 2516 }, { "epoch": 4.066235864297254, "grad_norm": 2.884455661762185, "learning_rate": 3.2373513041957346e-06, "loss": 0.0659, "step": 2517 }, { "epoch": 4.0678513731825525, "grad_norm": 3.152111606406369, "learning_rate": 3.236134903628918e-06, "loss": 0.0766, "step": 2518 }, { "epoch": 4.069466882067851, "grad_norm": 2.740444450802073, "learning_rate": 3.234918312214452e-06, "loss": 0.0683, "step": 2519 }, { "epoch": 4.07108239095315, "grad_norm": 2.537456644840308, "learning_rate": 3.233701530267743e-06, "loss": 0.0587, "step": 2520 }, { "epoch": 4.072697899838449, "grad_norm": 2.962311446881191, "learning_rate": 3.232484558104252e-06, "loss": 0.0802, "step": 2521 }, { "epoch": 4.074313408723748, "grad_norm": 2.469227652326323, "learning_rate": 3.2312673960394846e-06, "loss": 0.0951, "step": 2522 }, { "epoch": 4.075928917609047, "grad_norm": 2.4876232857632212, "learning_rate": 3.2300500443890006e-06, "loss": 0.0612, "step": 2523 }, { "epoch": 4.077544426494346, "grad_norm": 3.016485436920349, "learning_rate": 3.2288325034684026e-06, "loss": 0.0829, "step": 2524 }, { "epoch": 4.079159935379645, "grad_norm": 3.0677215093096866, "learning_rate": 3.227614773593348e-06, "loss": 0.0639, "step": 2525 }, { "epoch": 4.080775444264943, "grad_norm": 2.727029240213888, "learning_rate": 3.226396855079541e-06, "loss": 0.0673, "step": 2526 }, { "epoch": 4.0823909531502425, "grad_norm": 2.511665080080406, "learning_rate": 3.2251787482427343e-06, "loss": 0.0621, "step": 2527 }, { "epoch": 4.084006462035541, "grad_norm": 2.8107579938429152, "learning_rate": 3.22396045339873e-06, "loss": 0.0661, "step": 2528 }, { "epoch": 4.08562197092084, "grad_norm": 2.8307352685505194, "learning_rate": 3.222741970863378e-06, "loss": 0.0729, "step": 2529 }, { "epoch": 4.087237479806139, "grad_norm": 2.2942100380792394, "learning_rate": 3.2215233009525786e-06, "loss": 0.0584, "step": 2530 }, { "epoch": 4.088852988691438, "grad_norm": 3.132761705197964, "learning_rate": 3.2203044439822796e-06, "loss": 0.0667, "step": 2531 }, { "epoch": 4.090468497576737, "grad_norm": 2.7497866205122925, "learning_rate": 3.2190854002684766e-06, "loss": 0.0586, "step": 2532 }, { "epoch": 4.092084006462035, "grad_norm": 3.003429186056713, "learning_rate": 3.2178661701272155e-06, "loss": 0.058, "step": 2533 }, { "epoch": 4.093699515347335, "grad_norm": 2.3618306619686287, "learning_rate": 3.2166467538745894e-06, "loss": 0.0509, "step": 2534 }, { "epoch": 4.095315024232633, "grad_norm": 2.851320717336857, "learning_rate": 3.215427151826738e-06, "loss": 0.0763, "step": 2535 }, { "epoch": 4.096930533117932, "grad_norm": 2.939875516806743, "learning_rate": 3.214207364299853e-06, "loss": 0.0915, "step": 2536 }, { "epoch": 4.098546042003231, "grad_norm": 2.8779863687681586, "learning_rate": 3.2129873916101713e-06, "loss": 0.0634, "step": 2537 }, { "epoch": 4.10016155088853, "grad_norm": 2.8729861418966753, "learning_rate": 3.211767234073978e-06, "loss": 0.0738, "step": 2538 }, { "epoch": 4.101777059773829, "grad_norm": 2.717508762291897, "learning_rate": 3.2105468920076083e-06, "loss": 0.0816, "step": 2539 }, { "epoch": 4.103392568659127, "grad_norm": 2.524234240486319, "learning_rate": 3.209326365727441e-06, "loss": 0.0697, "step": 2540 }, { "epoch": 4.105008077544427, "grad_norm": 1.8656223219572552, "learning_rate": 3.208105655549908e-06, "loss": 0.0372, "step": 2541 }, { "epoch": 4.106623586429725, "grad_norm": 3.739472993481801, "learning_rate": 3.206884761791485e-06, "loss": 0.0996, "step": 2542 }, { "epoch": 4.1082390953150245, "grad_norm": 2.4046519704189593, "learning_rate": 3.2056636847686956e-06, "loss": 0.0499, "step": 2543 }, { "epoch": 4.109854604200323, "grad_norm": 2.480460420918869, "learning_rate": 3.2044424247981133e-06, "loss": 0.0618, "step": 2544 }, { "epoch": 4.111470113085622, "grad_norm": 2.6450780780936842, "learning_rate": 3.203220982196357e-06, "loss": 0.0795, "step": 2545 }, { "epoch": 4.113085621970921, "grad_norm": 2.879175780345021, "learning_rate": 3.2019993572800927e-06, "loss": 0.0676, "step": 2546 }, { "epoch": 4.114701130856219, "grad_norm": 2.5243366188365335, "learning_rate": 3.2007775503660356e-06, "loss": 0.0635, "step": 2547 }, { "epoch": 4.116316639741519, "grad_norm": 2.1173846411388806, "learning_rate": 3.1995555617709455e-06, "loss": 0.0525, "step": 2548 }, { "epoch": 4.117932148626817, "grad_norm": 2.7381544146433923, "learning_rate": 3.198333391811632e-06, "loss": 0.0732, "step": 2549 }, { "epoch": 4.119547657512117, "grad_norm": 2.7420962246409903, "learning_rate": 3.19711104080495e-06, "loss": 0.0624, "step": 2550 }, { "epoch": 4.121163166397415, "grad_norm": 2.7605022665966485, "learning_rate": 3.1958885090678015e-06, "loss": 0.0753, "step": 2551 }, { "epoch": 4.1227786752827145, "grad_norm": 2.727345831656217, "learning_rate": 3.1946657969171356e-06, "loss": 0.074, "step": 2552 }, { "epoch": 4.124394184168013, "grad_norm": 2.7487211334087136, "learning_rate": 3.1934429046699493e-06, "loss": 0.0867, "step": 2553 }, { "epoch": 4.1260096930533114, "grad_norm": 2.408354250442203, "learning_rate": 3.192219832643284e-06, "loss": 0.0593, "step": 2554 }, { "epoch": 4.127625201938611, "grad_norm": 2.716079584164438, "learning_rate": 3.1909965811542293e-06, "loss": 0.0671, "step": 2555 }, { "epoch": 4.129240710823909, "grad_norm": 2.3218291830591093, "learning_rate": 3.1897731505199204e-06, "loss": 0.0611, "step": 2556 }, { "epoch": 4.130856219709209, "grad_norm": 3.197572754531476, "learning_rate": 3.1885495410575403e-06, "loss": 0.0787, "step": 2557 }, { "epoch": 4.132471728594507, "grad_norm": 2.3654337097390252, "learning_rate": 3.187325753084317e-06, "loss": 0.0583, "step": 2558 }, { "epoch": 4.1340872374798066, "grad_norm": 2.5663423713390245, "learning_rate": 3.186101786917525e-06, "loss": 0.0489, "step": 2559 }, { "epoch": 4.135702746365105, "grad_norm": 2.4101758651310807, "learning_rate": 3.1848776428744863e-06, "loss": 0.0533, "step": 2560 }, { "epoch": 4.1373182552504035, "grad_norm": 3.0629711209713633, "learning_rate": 3.1836533212725672e-06, "loss": 0.0747, "step": 2561 }, { "epoch": 4.138933764135703, "grad_norm": 3.1153676459972224, "learning_rate": 3.182428822429181e-06, "loss": 0.0718, "step": 2562 }, { "epoch": 4.140549273021001, "grad_norm": 2.27661744103325, "learning_rate": 3.181204146661788e-06, "loss": 0.0595, "step": 2563 }, { "epoch": 4.142164781906301, "grad_norm": 2.768610084151022, "learning_rate": 3.179979294287891e-06, "loss": 0.0622, "step": 2564 }, { "epoch": 4.143780290791599, "grad_norm": 2.6099246953869004, "learning_rate": 3.1787542656250412e-06, "loss": 0.0567, "step": 2565 }, { "epoch": 4.145395799676899, "grad_norm": 2.4417286616094644, "learning_rate": 3.177529060990837e-06, "loss": 0.0702, "step": 2566 }, { "epoch": 4.147011308562197, "grad_norm": 2.912875956224723, "learning_rate": 3.1763036807029185e-06, "loss": 0.0904, "step": 2567 }, { "epoch": 4.148626817447496, "grad_norm": 2.493397643899944, "learning_rate": 3.175078125078975e-06, "loss": 0.0561, "step": 2568 }, { "epoch": 4.150242326332795, "grad_norm": 2.8965663995623436, "learning_rate": 3.173852394436738e-06, "loss": 0.0699, "step": 2569 }, { "epoch": 4.1518578352180935, "grad_norm": 2.491272279335036, "learning_rate": 3.172626489093986e-06, "loss": 0.0668, "step": 2570 }, { "epoch": 4.153473344103393, "grad_norm": 2.9778309398483227, "learning_rate": 3.1714004093685435e-06, "loss": 0.0593, "step": 2571 }, { "epoch": 4.155088852988691, "grad_norm": 2.4387806777055254, "learning_rate": 3.17017415557828e-06, "loss": 0.0651, "step": 2572 }, { "epoch": 4.156704361873991, "grad_norm": 2.6168359134421695, "learning_rate": 3.1689477280411084e-06, "loss": 0.0634, "step": 2573 }, { "epoch": 4.158319870759289, "grad_norm": 2.8832621654571136, "learning_rate": 3.1677211270749885e-06, "loss": 0.0753, "step": 2574 }, { "epoch": 4.159935379644588, "grad_norm": 3.0980563338325355, "learning_rate": 3.166494352997923e-06, "loss": 0.0694, "step": 2575 }, { "epoch": 4.161550888529887, "grad_norm": 2.1539631991743238, "learning_rate": 3.1652674061279635e-06, "loss": 0.0601, "step": 2576 }, { "epoch": 4.163166397415186, "grad_norm": 2.6725135273945004, "learning_rate": 3.164040286783202e-06, "loss": 0.0787, "step": 2577 }, { "epoch": 4.164781906300485, "grad_norm": 3.1196608793550396, "learning_rate": 3.162812995281777e-06, "loss": 0.0652, "step": 2578 }, { "epoch": 4.166397415185783, "grad_norm": 2.875491406326832, "learning_rate": 3.161585531941872e-06, "loss": 0.0623, "step": 2579 }, { "epoch": 4.168012924071083, "grad_norm": 2.702090488577417, "learning_rate": 3.1603578970817144e-06, "loss": 0.0592, "step": 2580 }, { "epoch": 4.169628432956381, "grad_norm": 2.207432483022482, "learning_rate": 3.159130091019577e-06, "loss": 0.0541, "step": 2581 }, { "epoch": 4.17124394184168, "grad_norm": 2.995669295476563, "learning_rate": 3.1579021140737754e-06, "loss": 0.0719, "step": 2582 }, { "epoch": 4.172859450726979, "grad_norm": 3.1322707574215527, "learning_rate": 3.1566739665626713e-06, "loss": 0.0865, "step": 2583 }, { "epoch": 4.174474959612278, "grad_norm": 2.4237714146977862, "learning_rate": 3.1554456488046693e-06, "loss": 0.064, "step": 2584 }, { "epoch": 4.176090468497577, "grad_norm": 2.771180975475435, "learning_rate": 3.1542171611182184e-06, "loss": 0.0898, "step": 2585 }, { "epoch": 4.1777059773828755, "grad_norm": 2.742563304343025, "learning_rate": 3.1529885038218117e-06, "loss": 0.0721, "step": 2586 }, { "epoch": 4.179321486268175, "grad_norm": 3.122450700387742, "learning_rate": 3.1517596772339876e-06, "loss": 0.0859, "step": 2587 }, { "epoch": 4.180936995153473, "grad_norm": 2.85986059712255, "learning_rate": 3.150530681673326e-06, "loss": 0.0865, "step": 2588 }, { "epoch": 4.182552504038772, "grad_norm": 3.2403697987309523, "learning_rate": 3.149301517458452e-06, "loss": 0.0899, "step": 2589 }, { "epoch": 4.184168012924071, "grad_norm": 2.8414489390633446, "learning_rate": 3.1480721849080344e-06, "loss": 0.0671, "step": 2590 }, { "epoch": 4.18578352180937, "grad_norm": 2.5355333250139602, "learning_rate": 3.146842684340786e-06, "loss": 0.0799, "step": 2591 }, { "epoch": 4.187399030694669, "grad_norm": 2.6183461745921948, "learning_rate": 3.145613016075461e-06, "loss": 0.0557, "step": 2592 }, { "epoch": 4.189014539579968, "grad_norm": 2.290874776048632, "learning_rate": 3.1443831804308606e-06, "loss": 0.0444, "step": 2593 }, { "epoch": 4.190630048465267, "grad_norm": 2.859711536026561, "learning_rate": 3.143153177725827e-06, "loss": 0.0656, "step": 2594 }, { "epoch": 4.1922455573505655, "grad_norm": 2.461322520073305, "learning_rate": 3.141923008279245e-06, "loss": 0.0924, "step": 2595 }, { "epoch": 4.193861066235864, "grad_norm": 3.4747140954910867, "learning_rate": 3.1406926724100456e-06, "loss": 0.0728, "step": 2596 }, { "epoch": 4.195476575121163, "grad_norm": 2.5922708138200856, "learning_rate": 3.1394621704372e-06, "loss": 0.0708, "step": 2597 }, { "epoch": 4.197092084006462, "grad_norm": 2.90275207481832, "learning_rate": 3.138231502679724e-06, "loss": 0.0742, "step": 2598 }, { "epoch": 4.198707592891761, "grad_norm": 2.631892863107552, "learning_rate": 3.137000669456676e-06, "loss": 0.0687, "step": 2599 }, { "epoch": 4.20032310177706, "grad_norm": 3.3269833440951806, "learning_rate": 3.135769671087158e-06, "loss": 0.1152, "step": 2600 }, { "epoch": 4.20032310177706, "eval_loss": 1.3930195569992065, "eval_runtime": 2.3541, "eval_samples_per_second": 127.436, "eval_steps_per_second": 2.974, "step": 2600 }, { "epoch": 4.201938610662358, "grad_norm": 2.610757533168295, "learning_rate": 3.1345385078903128e-06, "loss": 0.0594, "step": 2601 }, { "epoch": 4.203554119547658, "grad_norm": 2.7172491526496505, "learning_rate": 3.1333071801853286e-06, "loss": 0.0648, "step": 2602 }, { "epoch": 4.205169628432956, "grad_norm": 1.9783187752950058, "learning_rate": 3.132075688291434e-06, "loss": 0.0581, "step": 2603 }, { "epoch": 4.206785137318255, "grad_norm": 2.102843665321334, "learning_rate": 3.1308440325279016e-06, "loss": 0.0553, "step": 2604 }, { "epoch": 4.208400646203554, "grad_norm": 2.75144732674043, "learning_rate": 3.1296122132140454e-06, "loss": 0.0791, "step": 2605 }, { "epoch": 4.210016155088853, "grad_norm": 3.3633341138820967, "learning_rate": 3.1283802306692234e-06, "loss": 0.1262, "step": 2606 }, { "epoch": 4.211631663974152, "grad_norm": 2.5350304276171576, "learning_rate": 3.1271480852128343e-06, "loss": 0.0561, "step": 2607 }, { "epoch": 4.21324717285945, "grad_norm": 2.5772006836734156, "learning_rate": 3.125915777164319e-06, "loss": 0.0687, "step": 2608 }, { "epoch": 4.21486268174475, "grad_norm": 2.2076879316320563, "learning_rate": 3.1246833068431626e-06, "loss": 0.0541, "step": 2609 }, { "epoch": 4.216478190630048, "grad_norm": 3.1166201107816933, "learning_rate": 3.123450674568889e-06, "loss": 0.0626, "step": 2610 }, { "epoch": 4.2180936995153475, "grad_norm": 3.3654626846395455, "learning_rate": 3.122217880661068e-06, "loss": 0.0703, "step": 2611 }, { "epoch": 4.219709208400646, "grad_norm": 2.5163602802044482, "learning_rate": 3.1209849254393082e-06, "loss": 0.0804, "step": 2612 }, { "epoch": 4.221324717285945, "grad_norm": 2.9798084602691715, "learning_rate": 3.1197518092232615e-06, "loss": 0.0757, "step": 2613 }, { "epoch": 4.222940226171244, "grad_norm": 2.8782378867023937, "learning_rate": 3.11851853233262e-06, "loss": 0.0645, "step": 2614 }, { "epoch": 4.224555735056542, "grad_norm": 3.347677097719005, "learning_rate": 3.11728509508712e-06, "loss": 0.0775, "step": 2615 }, { "epoch": 4.226171243941842, "grad_norm": 3.233910904682686, "learning_rate": 3.1160514978065366e-06, "loss": 0.0729, "step": 2616 }, { "epoch": 4.22778675282714, "grad_norm": 2.8367051339868072, "learning_rate": 3.114817740810689e-06, "loss": 0.0669, "step": 2617 }, { "epoch": 4.22940226171244, "grad_norm": 2.7072422953813287, "learning_rate": 3.1135838244194362e-06, "loss": 0.0662, "step": 2618 }, { "epoch": 4.231017770597738, "grad_norm": 2.228394528143082, "learning_rate": 3.1123497489526777e-06, "loss": 0.0602, "step": 2619 }, { "epoch": 4.2326332794830375, "grad_norm": 2.3785442455690378, "learning_rate": 3.1111155147303574e-06, "loss": 0.0633, "step": 2620 }, { "epoch": 4.234248788368336, "grad_norm": 2.6549075257498167, "learning_rate": 3.1098811220724573e-06, "loss": 0.0781, "step": 2621 }, { "epoch": 4.2358642972536344, "grad_norm": 2.780460223689987, "learning_rate": 3.1086465712990003e-06, "loss": 0.0709, "step": 2622 }, { "epoch": 4.237479806138934, "grad_norm": 2.223993484064448, "learning_rate": 3.1074118627300544e-06, "loss": 0.0608, "step": 2623 }, { "epoch": 4.239095315024232, "grad_norm": 2.2396622020169064, "learning_rate": 3.1061769966857223e-06, "loss": 0.0537, "step": 2624 }, { "epoch": 4.240710823909532, "grad_norm": 2.192873640432303, "learning_rate": 3.104941973486154e-06, "loss": 0.0491, "step": 2625 }, { "epoch": 4.24232633279483, "grad_norm": 2.4971562540887935, "learning_rate": 3.103706793451536e-06, "loss": 0.0607, "step": 2626 }, { "epoch": 4.24394184168013, "grad_norm": 2.360792933164687, "learning_rate": 3.102471456902095e-06, "loss": 0.0629, "step": 2627 }, { "epoch": 4.245557350565428, "grad_norm": 2.149166389615424, "learning_rate": 3.1012359641581013e-06, "loss": 0.0537, "step": 2628 }, { "epoch": 4.2471728594507265, "grad_norm": 1.9792472190525034, "learning_rate": 3.100000315539865e-06, "loss": 0.0552, "step": 2629 }, { "epoch": 4.248788368336026, "grad_norm": 2.462243316986483, "learning_rate": 3.098764511367734e-06, "loss": 0.0704, "step": 2630 }, { "epoch": 4.250403877221324, "grad_norm": 2.717681467646268, "learning_rate": 3.0975285519620997e-06, "loss": 0.0628, "step": 2631 }, { "epoch": 4.252019386106624, "grad_norm": 3.123059291447039, "learning_rate": 3.096292437643392e-06, "loss": 0.0699, "step": 2632 }, { "epoch": 4.253634894991922, "grad_norm": 3.021952539431264, "learning_rate": 3.0950561687320813e-06, "loss": 0.0734, "step": 2633 }, { "epoch": 4.255250403877222, "grad_norm": 2.0102230964106838, "learning_rate": 3.0938197455486786e-06, "loss": 0.0536, "step": 2634 }, { "epoch": 4.25686591276252, "grad_norm": 2.734545104114804, "learning_rate": 3.0925831684137334e-06, "loss": 0.0701, "step": 2635 }, { "epoch": 4.258481421647819, "grad_norm": 2.3932094406086306, "learning_rate": 3.0913464376478376e-06, "loss": 0.0603, "step": 2636 }, { "epoch": 4.260096930533118, "grad_norm": 3.0185890328204485, "learning_rate": 3.0901095535716207e-06, "loss": 0.066, "step": 2637 }, { "epoch": 4.2617124394184165, "grad_norm": 2.3748944792456705, "learning_rate": 3.0888725165057532e-06, "loss": 0.0576, "step": 2638 }, { "epoch": 4.263327948303716, "grad_norm": 2.824964558986156, "learning_rate": 3.0876353267709443e-06, "loss": 0.0656, "step": 2639 }, { "epoch": 4.264943457189014, "grad_norm": 2.8414612842218974, "learning_rate": 3.086397984687943e-06, "loss": 0.0676, "step": 2640 }, { "epoch": 4.266558966074314, "grad_norm": 3.238371178871112, "learning_rate": 3.085160490577539e-06, "loss": 0.0775, "step": 2641 }, { "epoch": 4.268174474959612, "grad_norm": 2.5051879442239318, "learning_rate": 3.0839228447605603e-06, "loss": 0.0638, "step": 2642 }, { "epoch": 4.269789983844911, "grad_norm": 2.735116784105477, "learning_rate": 3.082685047557874e-06, "loss": 0.0723, "step": 2643 }, { "epoch": 4.27140549273021, "grad_norm": 3.2281635437875367, "learning_rate": 3.0814470992903867e-06, "loss": 0.0782, "step": 2644 }, { "epoch": 4.273021001615509, "grad_norm": 2.911383981686642, "learning_rate": 3.080209000279046e-06, "loss": 0.0797, "step": 2645 }, { "epoch": 4.274636510500808, "grad_norm": 3.3717028368465365, "learning_rate": 3.0789707508448345e-06, "loss": 0.1156, "step": 2646 }, { "epoch": 4.276252019386106, "grad_norm": 2.229694883870711, "learning_rate": 3.0777323513087786e-06, "loss": 0.0594, "step": 2647 }, { "epoch": 4.277867528271406, "grad_norm": 2.6953710684183325, "learning_rate": 3.0764938019919395e-06, "loss": 0.1075, "step": 2648 }, { "epoch": 4.279483037156704, "grad_norm": 2.7432191575459903, "learning_rate": 3.0752551032154194e-06, "loss": 0.0758, "step": 2649 }, { "epoch": 4.281098546042003, "grad_norm": 2.700795334848631, "learning_rate": 3.0740162553003595e-06, "loss": 0.0643, "step": 2650 }, { "epoch": 4.282714054927302, "grad_norm": 2.914252105504093, "learning_rate": 3.072777258567939e-06, "loss": 0.0622, "step": 2651 }, { "epoch": 4.284329563812601, "grad_norm": 2.429884280549669, "learning_rate": 3.071538113339374e-06, "loss": 0.0567, "step": 2652 }, { "epoch": 4.2859450726979, "grad_norm": 2.619909902407422, "learning_rate": 3.0702988199359224e-06, "loss": 0.063, "step": 2653 }, { "epoch": 4.2875605815831985, "grad_norm": 2.1832239512118923, "learning_rate": 3.0690593786788785e-06, "loss": 0.0524, "step": 2654 }, { "epoch": 4.289176090468498, "grad_norm": 2.6155403471489485, "learning_rate": 3.067819789889576e-06, "loss": 0.0649, "step": 2655 }, { "epoch": 4.290791599353796, "grad_norm": 2.5409659326015257, "learning_rate": 3.0665800538893845e-06, "loss": 0.0731, "step": 2656 }, { "epoch": 4.292407108239095, "grad_norm": 2.7306712060989944, "learning_rate": 3.0653401709997144e-06, "loss": 0.0705, "step": 2657 }, { "epoch": 4.294022617124394, "grad_norm": 2.4051559278535137, "learning_rate": 3.0641001415420134e-06, "loss": 0.0708, "step": 2658 }, { "epoch": 4.295638126009693, "grad_norm": 2.435831923799454, "learning_rate": 3.062859965837767e-06, "loss": 0.0544, "step": 2659 }, { "epoch": 4.297253634894992, "grad_norm": 2.892309481192093, "learning_rate": 3.061619644208498e-06, "loss": 0.0612, "step": 2660 }, { "epoch": 4.298869143780291, "grad_norm": 2.45178342789302, "learning_rate": 3.060379176975769e-06, "loss": 0.0585, "step": 2661 }, { "epoch": 4.30048465266559, "grad_norm": 2.7170428302843885, "learning_rate": 3.0591385644611775e-06, "loss": 0.0804, "step": 2662 }, { "epoch": 4.3021001615508885, "grad_norm": 2.622895953227327, "learning_rate": 3.057897806986361e-06, "loss": 0.0613, "step": 2663 }, { "epoch": 4.303715670436187, "grad_norm": 2.458568074755902, "learning_rate": 3.056656904872993e-06, "loss": 0.0599, "step": 2664 }, { "epoch": 4.305331179321486, "grad_norm": 2.7232990560621126, "learning_rate": 3.055415858442786e-06, "loss": 0.0685, "step": 2665 }, { "epoch": 4.306946688206785, "grad_norm": 2.6254043679268997, "learning_rate": 3.0541746680174895e-06, "loss": 0.0607, "step": 2666 }, { "epoch": 4.308562197092084, "grad_norm": 2.525033334053022, "learning_rate": 3.0529333339188894e-06, "loss": 0.0562, "step": 2667 }, { "epoch": 4.310177705977383, "grad_norm": 3.069041363210545, "learning_rate": 3.051691856468809e-06, "loss": 0.0786, "step": 2668 }, { "epoch": 4.311793214862682, "grad_norm": 2.3823798367332727, "learning_rate": 3.0504502359891103e-06, "loss": 0.0658, "step": 2669 }, { "epoch": 4.313408723747981, "grad_norm": 2.578364733378156, "learning_rate": 3.04920847280169e-06, "loss": 0.0683, "step": 2670 }, { "epoch": 4.315024232633279, "grad_norm": 2.684644588163075, "learning_rate": 3.047966567228485e-06, "loss": 0.0668, "step": 2671 }, { "epoch": 4.316639741518578, "grad_norm": 2.971853236916986, "learning_rate": 3.0467245195914646e-06, "loss": 0.0802, "step": 2672 }, { "epoch": 4.318255250403877, "grad_norm": 2.84687416736556, "learning_rate": 3.0454823302126404e-06, "loss": 0.0683, "step": 2673 }, { "epoch": 4.319870759289176, "grad_norm": 2.0549294507884195, "learning_rate": 3.0442399994140555e-06, "loss": 0.0447, "step": 2674 }, { "epoch": 4.321486268174475, "grad_norm": 2.848871056230515, "learning_rate": 3.0429975275177937e-06, "loss": 0.0879, "step": 2675 }, { "epoch": 4.323101777059774, "grad_norm": 2.5646550633512217, "learning_rate": 3.0417549148459724e-06, "loss": 0.0598, "step": 2676 }, { "epoch": 4.324717285945073, "grad_norm": 2.4838373768750395, "learning_rate": 3.040512161720748e-06, "loss": 0.0639, "step": 2677 }, { "epoch": 4.326332794830371, "grad_norm": 2.418758466907535, "learning_rate": 3.0392692684643115e-06, "loss": 0.0661, "step": 2678 }, { "epoch": 4.3279483037156705, "grad_norm": 3.542045048792709, "learning_rate": 3.038026235398891e-06, "loss": 0.0588, "step": 2679 }, { "epoch": 4.329563812600969, "grad_norm": 3.564661954183803, "learning_rate": 3.036783062846751e-06, "loss": 0.084, "step": 2680 }, { "epoch": 4.331179321486268, "grad_norm": 3.131051292291915, "learning_rate": 3.0355397511301914e-06, "loss": 0.087, "step": 2681 }, { "epoch": 4.332794830371567, "grad_norm": 2.9851274699519466, "learning_rate": 3.0342963005715486e-06, "loss": 0.0632, "step": 2682 }, { "epoch": 4.334410339256866, "grad_norm": 2.5098359809443886, "learning_rate": 3.033052711493196e-06, "loss": 0.0579, "step": 2683 }, { "epoch": 4.336025848142165, "grad_norm": 2.883999114954482, "learning_rate": 3.0318089842175404e-06, "loss": 0.0593, "step": 2684 }, { "epoch": 4.337641357027463, "grad_norm": 2.8819293823834427, "learning_rate": 3.0305651190670275e-06, "loss": 0.0725, "step": 2685 }, { "epoch": 4.339256865912763, "grad_norm": 2.533548029071267, "learning_rate": 3.029321116364137e-06, "loss": 0.0638, "step": 2686 }, { "epoch": 4.340872374798061, "grad_norm": 2.8206800560770606, "learning_rate": 3.0280769764313835e-06, "loss": 0.0691, "step": 2687 }, { "epoch": 4.3424878836833605, "grad_norm": 3.4402716363361447, "learning_rate": 3.026832699591319e-06, "loss": 0.0811, "step": 2688 }, { "epoch": 4.344103392568659, "grad_norm": 3.3916840903082845, "learning_rate": 3.0255882861665296e-06, "loss": 0.0844, "step": 2689 }, { "epoch": 4.345718901453958, "grad_norm": 2.434694663194133, "learning_rate": 3.0243437364796386e-06, "loss": 0.0486, "step": 2690 }, { "epoch": 4.347334410339257, "grad_norm": 2.817731664649374, "learning_rate": 3.023099050853302e-06, "loss": 0.0822, "step": 2691 }, { "epoch": 4.348949919224555, "grad_norm": 2.3079769318610137, "learning_rate": 3.0218542296102137e-06, "loss": 0.0507, "step": 2692 }, { "epoch": 4.350565428109855, "grad_norm": 2.9459638681126057, "learning_rate": 3.0206092730731002e-06, "loss": 0.0645, "step": 2693 }, { "epoch": 4.352180936995153, "grad_norm": 2.7092719139983346, "learning_rate": 3.0193641815647256e-06, "loss": 0.0696, "step": 2694 }, { "epoch": 4.353796445880453, "grad_norm": 2.9039365791649634, "learning_rate": 3.0181189554078864e-06, "loss": 0.0686, "step": 2695 }, { "epoch": 4.355411954765751, "grad_norm": 2.7514180349576756, "learning_rate": 3.0168735949254174e-06, "loss": 0.0606, "step": 2696 }, { "epoch": 4.35702746365105, "grad_norm": 2.3471405390014852, "learning_rate": 3.015628100440185e-06, "loss": 0.0665, "step": 2697 }, { "epoch": 4.358642972536349, "grad_norm": 2.3351747580776254, "learning_rate": 3.0143824722750903e-06, "loss": 0.0566, "step": 2698 }, { "epoch": 4.360258481421647, "grad_norm": 2.896178865781117, "learning_rate": 3.013136710753073e-06, "loss": 0.0712, "step": 2699 }, { "epoch": 4.361873990306947, "grad_norm": 2.8682738196115563, "learning_rate": 3.011890816197103e-06, "loss": 0.072, "step": 2700 }, { "epoch": 4.363489499192245, "grad_norm": 2.41677575849451, "learning_rate": 3.010644788930186e-06, "loss": 0.0589, "step": 2701 }, { "epoch": 4.365105008077545, "grad_norm": 2.7705902274304166, "learning_rate": 3.0093986292753636e-06, "loss": 0.0572, "step": 2702 }, { "epoch": 4.366720516962843, "grad_norm": 2.3042896156136545, "learning_rate": 3.0081523375557104e-06, "loss": 0.0579, "step": 2703 }, { "epoch": 4.3683360258481425, "grad_norm": 2.4507834940422657, "learning_rate": 3.0069059140943354e-06, "loss": 0.0631, "step": 2704 }, { "epoch": 4.369951534733441, "grad_norm": 2.4587220902790916, "learning_rate": 3.0056593592143814e-06, "loss": 0.0555, "step": 2705 }, { "epoch": 4.3715670436187395, "grad_norm": 2.636013460087343, "learning_rate": 3.0044126732390256e-06, "loss": 0.0606, "step": 2706 }, { "epoch": 4.373182552504039, "grad_norm": 2.753947995720771, "learning_rate": 3.00316585649148e-06, "loss": 0.0646, "step": 2707 }, { "epoch": 4.374798061389337, "grad_norm": 3.8829882308619634, "learning_rate": 3.0019189092949897e-06, "loss": 0.0709, "step": 2708 }, { "epoch": 4.376413570274637, "grad_norm": 2.8516374402275244, "learning_rate": 3.0006718319728325e-06, "loss": 0.0756, "step": 2709 }, { "epoch": 4.378029079159935, "grad_norm": 3.4277536583707944, "learning_rate": 2.9994246248483225e-06, "loss": 0.1112, "step": 2710 }, { "epoch": 4.379644588045235, "grad_norm": 2.591693569082874, "learning_rate": 2.998177288244806e-06, "loss": 0.0779, "step": 2711 }, { "epoch": 4.381260096930533, "grad_norm": 2.488165411931339, "learning_rate": 2.996929822485661e-06, "loss": 0.0645, "step": 2712 }, { "epoch": 4.382875605815832, "grad_norm": 2.940129957301896, "learning_rate": 2.9956822278943037e-06, "loss": 0.0723, "step": 2713 }, { "epoch": 4.384491114701131, "grad_norm": 2.6148062334039603, "learning_rate": 2.994434504794179e-06, "loss": 0.0776, "step": 2714 }, { "epoch": 4.386106623586429, "grad_norm": 3.4438817058916515, "learning_rate": 2.993186653508767e-06, "loss": 0.0756, "step": 2715 }, { "epoch": 4.387722132471729, "grad_norm": 2.255084808430998, "learning_rate": 2.991938674361583e-06, "loss": 0.0569, "step": 2716 }, { "epoch": 4.389337641357027, "grad_norm": 2.1514086033160513, "learning_rate": 2.9906905676761718e-06, "loss": 0.0486, "step": 2717 }, { "epoch": 4.390953150242327, "grad_norm": 2.8607445058602017, "learning_rate": 2.9894423337761137e-06, "loss": 0.0625, "step": 2718 }, { "epoch": 4.392568659127625, "grad_norm": 2.343493202183242, "learning_rate": 2.9881939729850207e-06, "loss": 0.0615, "step": 2719 }, { "epoch": 4.394184168012924, "grad_norm": 3.2599602467351048, "learning_rate": 2.986945485626538e-06, "loss": 0.0949, "step": 2720 }, { "epoch": 4.395799676898223, "grad_norm": 2.4528742503657175, "learning_rate": 2.985696872024345e-06, "loss": 0.0578, "step": 2721 }, { "epoch": 4.3974151857835215, "grad_norm": 2.3502129710964175, "learning_rate": 2.9844481325021523e-06, "loss": 0.0479, "step": 2722 }, { "epoch": 4.399030694668821, "grad_norm": 2.595936022762751, "learning_rate": 2.983199267383703e-06, "loss": 0.063, "step": 2723 }, { "epoch": 4.400646203554119, "grad_norm": 2.773798448287214, "learning_rate": 2.9819502769927743e-06, "loss": 0.0855, "step": 2724 }, { "epoch": 4.402261712439419, "grad_norm": 2.311290561880998, "learning_rate": 2.9807011616531733e-06, "loss": 0.0561, "step": 2725 }, { "epoch": 4.403877221324717, "grad_norm": 2.781729905304405, "learning_rate": 2.979451921688743e-06, "loss": 0.0723, "step": 2726 }, { "epoch": 4.405492730210016, "grad_norm": 3.0062645470686857, "learning_rate": 2.9782025574233564e-06, "loss": 0.0726, "step": 2727 }, { "epoch": 4.407108239095315, "grad_norm": 2.4157130649921394, "learning_rate": 2.9769530691809173e-06, "loss": 0.0519, "step": 2728 }, { "epoch": 4.408723747980614, "grad_norm": 2.7532285725748946, "learning_rate": 2.9757034572853653e-06, "loss": 0.0598, "step": 2729 }, { "epoch": 4.410339256865913, "grad_norm": 2.664469361098324, "learning_rate": 2.9744537220606697e-06, "loss": 0.0793, "step": 2730 }, { "epoch": 4.4119547657512115, "grad_norm": 2.5994953005333983, "learning_rate": 2.9732038638308325e-06, "loss": 0.0608, "step": 2731 }, { "epoch": 4.413570274636511, "grad_norm": 3.048683235915337, "learning_rate": 2.971953882919887e-06, "loss": 0.0948, "step": 2732 }, { "epoch": 4.415185783521809, "grad_norm": 2.4069769375895667, "learning_rate": 2.9707037796518995e-06, "loss": 0.0679, "step": 2733 }, { "epoch": 4.416801292407108, "grad_norm": 2.6657326778238484, "learning_rate": 2.9694535543509655e-06, "loss": 0.0703, "step": 2734 }, { "epoch": 4.418416801292407, "grad_norm": 3.5571751547158086, "learning_rate": 2.968203207341216e-06, "loss": 0.0807, "step": 2735 }, { "epoch": 4.420032310177706, "grad_norm": 2.6721450631399373, "learning_rate": 2.9669527389468096e-06, "loss": 0.074, "step": 2736 }, { "epoch": 4.421647819063005, "grad_norm": 2.87976794950902, "learning_rate": 2.9657021494919393e-06, "loss": 0.0772, "step": 2737 }, { "epoch": 4.423263327948304, "grad_norm": 2.8293799101327717, "learning_rate": 2.9644514393008283e-06, "loss": 0.0638, "step": 2738 }, { "epoch": 4.424878836833603, "grad_norm": 2.6433563298409184, "learning_rate": 2.9632006086977298e-06, "loss": 0.0564, "step": 2739 }, { "epoch": 4.426494345718901, "grad_norm": 3.24597019027227, "learning_rate": 2.9619496580069316e-06, "loss": 0.0613, "step": 2740 }, { "epoch": 4.4281098546042, "grad_norm": 2.649304116329173, "learning_rate": 2.96069858755275e-06, "loss": 0.0992, "step": 2741 }, { "epoch": 4.429725363489499, "grad_norm": 2.942743150020123, "learning_rate": 2.959447397659532e-06, "loss": 0.0683, "step": 2742 }, { "epoch": 4.431340872374798, "grad_norm": 2.8152229033632823, "learning_rate": 2.958196088651657e-06, "loss": 0.0677, "step": 2743 }, { "epoch": 4.432956381260097, "grad_norm": 2.9226690853979083, "learning_rate": 2.956944660853535e-06, "loss": 0.0767, "step": 2744 }, { "epoch": 4.434571890145396, "grad_norm": 2.6279683456526093, "learning_rate": 2.955693114589607e-06, "loss": 0.0546, "step": 2745 }, { "epoch": 4.436187399030695, "grad_norm": 2.3884222819775154, "learning_rate": 2.954441450184344e-06, "loss": 0.0569, "step": 2746 }, { "epoch": 4.4378029079159935, "grad_norm": 3.1314809332799802, "learning_rate": 2.9531896679622475e-06, "loss": 0.0803, "step": 2747 }, { "epoch": 4.439418416801292, "grad_norm": 2.4496420039547933, "learning_rate": 2.95193776824785e-06, "loss": 0.0589, "step": 2748 }, { "epoch": 4.441033925686591, "grad_norm": 2.585458876673122, "learning_rate": 2.9506857513657154e-06, "loss": 0.0634, "step": 2749 }, { "epoch": 4.44264943457189, "grad_norm": 2.531181444099143, "learning_rate": 2.949433617640436e-06, "loss": 0.0551, "step": 2750 }, { "epoch": 4.444264943457189, "grad_norm": 2.7470820388006127, "learning_rate": 2.9481813673966357e-06, "loss": 0.072, "step": 2751 }, { "epoch": 4.445880452342488, "grad_norm": 2.281242786084643, "learning_rate": 2.946929000958969e-06, "loss": 0.0522, "step": 2752 }, { "epoch": 4.447495961227787, "grad_norm": 2.6978350481116906, "learning_rate": 2.945676518652119e-06, "loss": 0.055, "step": 2753 }, { "epoch": 4.449111470113086, "grad_norm": 3.150224245947312, "learning_rate": 2.9444239208008003e-06, "loss": 0.0536, "step": 2754 }, { "epoch": 4.450726978998384, "grad_norm": 2.7107708739168688, "learning_rate": 2.9431712077297565e-06, "loss": 0.0615, "step": 2755 }, { "epoch": 4.4523424878836835, "grad_norm": 2.0364073034278296, "learning_rate": 2.941918379763761e-06, "loss": 0.0583, "step": 2756 }, { "epoch": 4.453957996768982, "grad_norm": 2.9723420997619034, "learning_rate": 2.940665437227619e-06, "loss": 0.0744, "step": 2757 }, { "epoch": 4.455573505654281, "grad_norm": 2.6152681259777824, "learning_rate": 2.9394123804461617e-06, "loss": 0.0625, "step": 2758 }, { "epoch": 4.45718901453958, "grad_norm": 3.292942893435671, "learning_rate": 2.9381592097442534e-06, "loss": 0.0951, "step": 2759 }, { "epoch": 4.458804523424879, "grad_norm": 2.385296187825472, "learning_rate": 2.9369059254467857e-06, "loss": 0.0599, "step": 2760 }, { "epoch": 4.460420032310178, "grad_norm": 2.5470896783825108, "learning_rate": 2.935652527878682e-06, "loss": 0.064, "step": 2761 }, { "epoch": 4.462035541195476, "grad_norm": 2.6070834967033094, "learning_rate": 2.9343990173648916e-06, "loss": 0.0784, "step": 2762 }, { "epoch": 4.463651050080776, "grad_norm": 2.9190937870884044, "learning_rate": 2.933145394230397e-06, "loss": 0.0742, "step": 2763 }, { "epoch": 4.465266558966074, "grad_norm": 3.0060056832664146, "learning_rate": 2.9318916588002063e-06, "loss": 0.0645, "step": 2764 }, { "epoch": 4.466882067851373, "grad_norm": 2.8789664954494407, "learning_rate": 2.9306378113993595e-06, "loss": 0.0743, "step": 2765 }, { "epoch": 4.468497576736672, "grad_norm": 3.0708266309379, "learning_rate": 2.929383852352924e-06, "loss": 0.07, "step": 2766 }, { "epoch": 4.470113085621971, "grad_norm": 2.515224587011869, "learning_rate": 2.928129781985997e-06, "loss": 0.0596, "step": 2767 }, { "epoch": 4.47172859450727, "grad_norm": 3.660310391723615, "learning_rate": 2.926875600623704e-06, "loss": 0.09, "step": 2768 }, { "epoch": 4.473344103392568, "grad_norm": 2.574616642018607, "learning_rate": 2.9256213085911996e-06, "loss": 0.0574, "step": 2769 }, { "epoch": 4.474959612277868, "grad_norm": 2.952025613828071, "learning_rate": 2.9243669062136665e-06, "loss": 0.0625, "step": 2770 }, { "epoch": 4.476575121163166, "grad_norm": 2.5579467011230705, "learning_rate": 2.9231123938163174e-06, "loss": 0.0595, "step": 2771 }, { "epoch": 4.4781906300484655, "grad_norm": 2.568992275415806, "learning_rate": 2.921857771724391e-06, "loss": 0.0666, "step": 2772 }, { "epoch": 4.479806138933764, "grad_norm": 2.7638762427148382, "learning_rate": 2.920603040263158e-06, "loss": 0.0688, "step": 2773 }, { "epoch": 4.481421647819063, "grad_norm": 2.9634391335789902, "learning_rate": 2.919348199757914e-06, "loss": 0.0697, "step": 2774 }, { "epoch": 4.483037156704362, "grad_norm": 2.6677166913646517, "learning_rate": 2.9180932505339856e-06, "loss": 0.0744, "step": 2775 }, { "epoch": 4.48465266558966, "grad_norm": 3.6191963325563243, "learning_rate": 2.9168381929167254e-06, "loss": 0.0768, "step": 2776 }, { "epoch": 4.48626817447496, "grad_norm": 3.0232348373335896, "learning_rate": 2.915583027231515e-06, "loss": 0.0633, "step": 2777 }, { "epoch": 4.487883683360258, "grad_norm": 2.374451140322033, "learning_rate": 2.914327753803765e-06, "loss": 0.0625, "step": 2778 }, { "epoch": 4.489499192245558, "grad_norm": 3.1018422273107578, "learning_rate": 2.9130723729589127e-06, "loss": 0.0806, "step": 2779 }, { "epoch": 4.491114701130856, "grad_norm": 2.7297752153511357, "learning_rate": 2.9118168850224226e-06, "loss": 0.0663, "step": 2780 }, { "epoch": 4.4927302100161555, "grad_norm": 2.7111390326358373, "learning_rate": 2.9105612903197894e-06, "loss": 0.0534, "step": 2781 }, { "epoch": 4.494345718901454, "grad_norm": 2.9043973513708314, "learning_rate": 2.909305589176533e-06, "loss": 0.0696, "step": 2782 }, { "epoch": 4.4959612277867524, "grad_norm": 3.880591901282395, "learning_rate": 2.908049781918202e-06, "loss": 0.0784, "step": 2783 }, { "epoch": 4.497576736672052, "grad_norm": 2.3428184166188624, "learning_rate": 2.906793868870373e-06, "loss": 0.054, "step": 2784 }, { "epoch": 4.49919224555735, "grad_norm": 2.6972463991119913, "learning_rate": 2.905537850358648e-06, "loss": 0.0615, "step": 2785 }, { "epoch": 4.50080775444265, "grad_norm": 2.9068415342584397, "learning_rate": 2.90428172670866e-06, "loss": 0.0669, "step": 2786 }, { "epoch": 4.502423263327948, "grad_norm": 2.826114994052268, "learning_rate": 2.9030254982460656e-06, "loss": 0.0683, "step": 2787 }, { "epoch": 4.5040387722132476, "grad_norm": 3.018883476870493, "learning_rate": 2.901769165296549e-06, "loss": 0.0747, "step": 2788 }, { "epoch": 4.505654281098546, "grad_norm": 2.3405212753741114, "learning_rate": 2.9005127281858255e-06, "loss": 0.0656, "step": 2789 }, { "epoch": 4.5072697899838445, "grad_norm": 2.4737740576950444, "learning_rate": 2.8992561872396315e-06, "loss": 0.0626, "step": 2790 }, { "epoch": 4.508885298869144, "grad_norm": 3.6206776446389943, "learning_rate": 2.8979995427837345e-06, "loss": 0.0752, "step": 2791 }, { "epoch": 4.510500807754442, "grad_norm": 2.6722401250219914, "learning_rate": 2.8967427951439275e-06, "loss": 0.073, "step": 2792 }, { "epoch": 4.512116316639742, "grad_norm": 2.427935031810679, "learning_rate": 2.8954859446460302e-06, "loss": 0.0632, "step": 2793 }, { "epoch": 4.51373182552504, "grad_norm": 2.9888312517195503, "learning_rate": 2.894228991615889e-06, "loss": 0.0749, "step": 2794 }, { "epoch": 4.51534733441034, "grad_norm": 2.6317096795260775, "learning_rate": 2.8929719363793775e-06, "loss": 0.0634, "step": 2795 }, { "epoch": 4.516962843295638, "grad_norm": 2.338080272023707, "learning_rate": 2.8917147792623944e-06, "loss": 0.0611, "step": 2796 }, { "epoch": 4.518578352180937, "grad_norm": 2.2016620580005792, "learning_rate": 2.8904575205908662e-06, "loss": 0.061, "step": 2797 }, { "epoch": 4.520193861066236, "grad_norm": 2.6614672684650644, "learning_rate": 2.889200160690746e-06, "loss": 0.0631, "step": 2798 }, { "epoch": 4.5218093699515345, "grad_norm": 2.917204601083572, "learning_rate": 2.88794269988801e-06, "loss": 0.0646, "step": 2799 }, { "epoch": 4.523424878836834, "grad_norm": 2.4747961626146684, "learning_rate": 2.8866851385086652e-06, "loss": 0.0646, "step": 2800 }, { "epoch": 4.523424878836834, "eval_loss": 1.402184009552002, "eval_runtime": 2.3507, "eval_samples_per_second": 127.621, "eval_steps_per_second": 2.978, "step": 2800 }, { "epoch": 4.525040387722132, "grad_norm": 2.8580403290820553, "learning_rate": 2.8854274768787417e-06, "loss": 0.0789, "step": 2801 }, { "epoch": 4.526655896607432, "grad_norm": 2.477066076147272, "learning_rate": 2.8841697153242952e-06, "loss": 0.0658, "step": 2802 }, { "epoch": 4.52827140549273, "grad_norm": 2.516881558891313, "learning_rate": 2.8829118541714102e-06, "loss": 0.0686, "step": 2803 }, { "epoch": 4.529886914378029, "grad_norm": 2.6724211600158885, "learning_rate": 2.8816538937461935e-06, "loss": 0.0723, "step": 2804 }, { "epoch": 4.531502423263328, "grad_norm": 2.9207198736268367, "learning_rate": 2.8803958343747806e-06, "loss": 0.1135, "step": 2805 }, { "epoch": 4.533117932148627, "grad_norm": 3.252333888069013, "learning_rate": 2.879137676383331e-06, "loss": 0.0773, "step": 2806 }, { "epoch": 4.534733441033926, "grad_norm": 3.152130215804559, "learning_rate": 2.87787942009803e-06, "loss": 0.0842, "step": 2807 }, { "epoch": 4.536348949919224, "grad_norm": 3.0833974003421862, "learning_rate": 2.8766210658450884e-06, "loss": 0.0718, "step": 2808 }, { "epoch": 4.537964458804524, "grad_norm": 2.8362295355286835, "learning_rate": 2.8753626139507422e-06, "loss": 0.085, "step": 2809 }, { "epoch": 4.539579967689822, "grad_norm": 2.7655856410631485, "learning_rate": 2.874104064741254e-06, "loss": 0.0731, "step": 2810 }, { "epoch": 4.541195476575121, "grad_norm": 2.604497539377677, "learning_rate": 2.87284541854291e-06, "loss": 0.0607, "step": 2811 }, { "epoch": 4.54281098546042, "grad_norm": 2.2753299762281998, "learning_rate": 2.871586675682023e-06, "loss": 0.0626, "step": 2812 }, { "epoch": 4.544426494345719, "grad_norm": 2.967173966963689, "learning_rate": 2.870327836484929e-06, "loss": 0.0715, "step": 2813 }, { "epoch": 4.546042003231018, "grad_norm": 2.280316979308099, "learning_rate": 2.869068901277991e-06, "loss": 0.0615, "step": 2814 }, { "epoch": 4.5476575121163165, "grad_norm": 2.619395252085793, "learning_rate": 2.8678098703875946e-06, "loss": 0.0746, "step": 2815 }, { "epoch": 4.549273021001616, "grad_norm": 2.8177473482069457, "learning_rate": 2.8665507441401537e-06, "loss": 0.0763, "step": 2816 }, { "epoch": 4.550888529886914, "grad_norm": 2.683031076876686, "learning_rate": 2.8652915228621032e-06, "loss": 0.0801, "step": 2817 }, { "epoch": 4.552504038772213, "grad_norm": 2.990878337161389, "learning_rate": 2.8640322068799043e-06, "loss": 0.072, "step": 2818 }, { "epoch": 4.554119547657512, "grad_norm": 2.4729301815163667, "learning_rate": 2.862772796520043e-06, "loss": 0.063, "step": 2819 }, { "epoch": 4.555735056542811, "grad_norm": 2.53580662296709, "learning_rate": 2.8615132921090292e-06, "loss": 0.0505, "step": 2820 }, { "epoch": 4.55735056542811, "grad_norm": 3.110651865897002, "learning_rate": 2.8602536939733984e-06, "loss": 0.0701, "step": 2821 }, { "epoch": 4.558966074313409, "grad_norm": 2.592331251005779, "learning_rate": 2.8589940024397076e-06, "loss": 0.0661, "step": 2822 }, { "epoch": 4.560581583198708, "grad_norm": 2.758268638474064, "learning_rate": 2.857734217834542e-06, "loss": 0.0749, "step": 2823 }, { "epoch": 4.5621970920840065, "grad_norm": 2.8667587242394577, "learning_rate": 2.8564743404845064e-06, "loss": 0.078, "step": 2824 }, { "epoch": 4.563812600969305, "grad_norm": 2.64459405898396, "learning_rate": 2.8552143707162337e-06, "loss": 0.0632, "step": 2825 }, { "epoch": 4.565428109854604, "grad_norm": 2.968595822368749, "learning_rate": 2.8539543088563787e-06, "loss": 0.0526, "step": 2826 }, { "epoch": 4.567043618739903, "grad_norm": 2.9097075759594313, "learning_rate": 2.852694155231621e-06, "loss": 0.0694, "step": 2827 }, { "epoch": 4.568659127625202, "grad_norm": 3.0376149469048253, "learning_rate": 2.8514339101686624e-06, "loss": 0.0709, "step": 2828 }, { "epoch": 4.570274636510501, "grad_norm": 6.936556027430517, "learning_rate": 2.8501735739942295e-06, "loss": 0.1152, "step": 2829 }, { "epoch": 4.5718901453958, "grad_norm": 3.1259477527773853, "learning_rate": 2.848913147035073e-06, "loss": 0.0892, "step": 2830 }, { "epoch": 4.573505654281099, "grad_norm": 2.571348146538268, "learning_rate": 2.8476526296179667e-06, "loss": 0.0639, "step": 2831 }, { "epoch": 4.575121163166397, "grad_norm": 2.633567281109564, "learning_rate": 2.8463920220697066e-06, "loss": 0.0663, "step": 2832 }, { "epoch": 4.576736672051696, "grad_norm": 2.6387010434517855, "learning_rate": 2.845131324717115e-06, "loss": 0.0764, "step": 2833 }, { "epoch": 4.578352180936995, "grad_norm": 2.8795405308551287, "learning_rate": 2.8438705378870342e-06, "loss": 0.0539, "step": 2834 }, { "epoch": 4.579967689822294, "grad_norm": 2.8618488253787566, "learning_rate": 2.8426096619063315e-06, "loss": 0.0745, "step": 2835 }, { "epoch": 4.581583198707593, "grad_norm": 2.564029574141776, "learning_rate": 2.841348697101898e-06, "loss": 0.0541, "step": 2836 }, { "epoch": 4.583198707592892, "grad_norm": 2.6807424067470627, "learning_rate": 2.840087643800645e-06, "loss": 0.0676, "step": 2837 }, { "epoch": 4.584814216478191, "grad_norm": 2.9052456126694346, "learning_rate": 2.8388265023295107e-06, "loss": 0.0615, "step": 2838 }, { "epoch": 4.586429725363489, "grad_norm": 3.620211694313325, "learning_rate": 2.837565273015453e-06, "loss": 0.085, "step": 2839 }, { "epoch": 4.5880452342487885, "grad_norm": 3.610371249185922, "learning_rate": 2.8363039561854534e-06, "loss": 0.0908, "step": 2840 }, { "epoch": 4.589660743134087, "grad_norm": 3.262015728538476, "learning_rate": 2.8350425521665167e-06, "loss": 0.0874, "step": 2841 }, { "epoch": 4.591276252019386, "grad_norm": 3.0579444069861372, "learning_rate": 2.833781061285669e-06, "loss": 0.0819, "step": 2842 }, { "epoch": 4.592891760904685, "grad_norm": 2.5794979091921926, "learning_rate": 2.832519483869961e-06, "loss": 0.0757, "step": 2843 }, { "epoch": 4.594507269789984, "grad_norm": 2.435527892284417, "learning_rate": 2.831257820246464e-06, "loss": 0.0592, "step": 2844 }, { "epoch": 4.596122778675283, "grad_norm": 2.948811677032546, "learning_rate": 2.8299960707422723e-06, "loss": 0.069, "step": 2845 }, { "epoch": 4.597738287560581, "grad_norm": 2.461676210687988, "learning_rate": 2.8287342356845037e-06, "loss": 0.058, "step": 2846 }, { "epoch": 4.599353796445881, "grad_norm": 2.8139355182138175, "learning_rate": 2.827472315400295e-06, "loss": 0.0617, "step": 2847 }, { "epoch": 4.600969305331179, "grad_norm": 2.6169150851153917, "learning_rate": 2.8262103102168087e-06, "loss": 0.0578, "step": 2848 }, { "epoch": 4.6025848142164785, "grad_norm": 3.0918478897848556, "learning_rate": 2.8249482204612266e-06, "loss": 0.0688, "step": 2849 }, { "epoch": 4.604200323101777, "grad_norm": 3.8277947468203397, "learning_rate": 2.8236860464607535e-06, "loss": 0.0842, "step": 2850 }, { "epoch": 4.605815831987076, "grad_norm": 2.889325081943431, "learning_rate": 2.822423788542617e-06, "loss": 0.081, "step": 2851 }, { "epoch": 4.607431340872375, "grad_norm": 2.754362113608988, "learning_rate": 2.821161447034064e-06, "loss": 0.0753, "step": 2852 }, { "epoch": 4.609046849757673, "grad_norm": 2.59581485493687, "learning_rate": 2.8198990222623667e-06, "loss": 0.0671, "step": 2853 }, { "epoch": 4.610662358642973, "grad_norm": 2.674827452430739, "learning_rate": 2.8186365145548143e-06, "loss": 0.0652, "step": 2854 }, { "epoch": 4.612277867528271, "grad_norm": 2.391721992033014, "learning_rate": 2.817373924238722e-06, "loss": 0.0643, "step": 2855 }, { "epoch": 4.613893376413571, "grad_norm": 3.0047581845086944, "learning_rate": 2.8161112516414228e-06, "loss": 0.0838, "step": 2856 }, { "epoch": 4.615508885298869, "grad_norm": 2.8118614927528367, "learning_rate": 2.814848497090274e-06, "loss": 0.0659, "step": 2857 }, { "epoch": 4.617124394184168, "grad_norm": 2.3936124073012466, "learning_rate": 2.8135856609126514e-06, "loss": 0.0619, "step": 2858 }, { "epoch": 4.618739903069467, "grad_norm": 3.1040442232665546, "learning_rate": 2.812322743435954e-06, "loss": 0.0841, "step": 2859 }, { "epoch": 4.620355411954765, "grad_norm": 2.8308819247032377, "learning_rate": 2.8110597449876016e-06, "loss": 0.0989, "step": 2860 }, { "epoch": 4.621970920840065, "grad_norm": 2.3695884869479946, "learning_rate": 2.8097966658950337e-06, "loss": 0.0594, "step": 2861 }, { "epoch": 4.623586429725363, "grad_norm": 3.068480498102275, "learning_rate": 2.8085335064857116e-06, "loss": 0.0701, "step": 2862 }, { "epoch": 4.625201938610663, "grad_norm": 2.6653624550138537, "learning_rate": 2.8072702670871182e-06, "loss": 0.0644, "step": 2863 }, { "epoch": 4.626817447495961, "grad_norm": 3.1790006631955956, "learning_rate": 2.8060069480267553e-06, "loss": 0.0626, "step": 2864 }, { "epoch": 4.6284329563812605, "grad_norm": 3.0655032242136415, "learning_rate": 2.8047435496321474e-06, "loss": 0.0721, "step": 2865 }, { "epoch": 4.630048465266559, "grad_norm": 2.250161120640464, "learning_rate": 2.8034800722308386e-06, "loss": 0.0622, "step": 2866 }, { "epoch": 4.6316639741518575, "grad_norm": 3.0149802043354086, "learning_rate": 2.802216516150392e-06, "loss": 0.0952, "step": 2867 }, { "epoch": 4.633279483037157, "grad_norm": 3.0789232508984514, "learning_rate": 2.800952881718394e-06, "loss": 0.0782, "step": 2868 }, { "epoch": 4.634894991922455, "grad_norm": 2.2245311912786585, "learning_rate": 2.7996891692624494e-06, "loss": 0.0548, "step": 2869 }, { "epoch": 4.636510500807755, "grad_norm": 2.6648484157391863, "learning_rate": 2.7984253791101833e-06, "loss": 0.0658, "step": 2870 }, { "epoch": 4.638126009693053, "grad_norm": 3.6983367765065815, "learning_rate": 2.7971615115892426e-06, "loss": 0.0659, "step": 2871 }, { "epoch": 4.639741518578353, "grad_norm": 3.1916612538020255, "learning_rate": 2.795897567027291e-06, "loss": 0.0629, "step": 2872 }, { "epoch": 4.641357027463651, "grad_norm": 2.8036323629451063, "learning_rate": 2.7946335457520157e-06, "loss": 0.0574, "step": 2873 }, { "epoch": 4.64297253634895, "grad_norm": 3.0273077627460805, "learning_rate": 2.793369448091122e-06, "loss": 0.0675, "step": 2874 }, { "epoch": 4.644588045234249, "grad_norm": 2.757995419201002, "learning_rate": 2.7921052743723347e-06, "loss": 0.0654, "step": 2875 }, { "epoch": 4.646203554119547, "grad_norm": 3.2767236219609504, "learning_rate": 2.7908410249233995e-06, "loss": 0.0716, "step": 2876 }, { "epoch": 4.647819063004847, "grad_norm": 3.494819538637553, "learning_rate": 2.789576700072081e-06, "loss": 0.0756, "step": 2877 }, { "epoch": 4.649434571890145, "grad_norm": 2.550059307745542, "learning_rate": 2.788312300146162e-06, "loss": 0.0688, "step": 2878 }, { "epoch": 4.651050080775445, "grad_norm": 2.9148642317236666, "learning_rate": 2.787047825473449e-06, "loss": 0.0691, "step": 2879 }, { "epoch": 4.652665589660743, "grad_norm": 2.678167591520616, "learning_rate": 2.785783276381762e-06, "loss": 0.0669, "step": 2880 }, { "epoch": 4.654281098546042, "grad_norm": 2.4961291401599364, "learning_rate": 2.784518653198946e-06, "loss": 0.0782, "step": 2881 }, { "epoch": 4.655896607431341, "grad_norm": 2.8303254551627655, "learning_rate": 2.783253956252861e-06, "loss": 0.0586, "step": 2882 }, { "epoch": 4.6575121163166395, "grad_norm": 2.602772510598103, "learning_rate": 2.781989185871389e-06, "loss": 0.0799, "step": 2883 }, { "epoch": 4.659127625201939, "grad_norm": 2.230128824273315, "learning_rate": 2.780724342382428e-06, "loss": 0.0487, "step": 2884 }, { "epoch": 4.660743134087237, "grad_norm": 2.5194208269479956, "learning_rate": 2.7794594261138986e-06, "loss": 0.0748, "step": 2885 }, { "epoch": 4.662358642972537, "grad_norm": 2.5648798292286057, "learning_rate": 2.7781944373937365e-06, "loss": 0.0659, "step": 2886 }, { "epoch": 4.663974151857835, "grad_norm": 2.61460514741235, "learning_rate": 2.7769293765499e-06, "loss": 0.0626, "step": 2887 }, { "epoch": 4.665589660743134, "grad_norm": 2.6056907513475758, "learning_rate": 2.775664243910363e-06, "loss": 0.0682, "step": 2888 }, { "epoch": 4.667205169628433, "grad_norm": 2.833001451873905, "learning_rate": 2.774399039803119e-06, "loss": 0.0615, "step": 2889 }, { "epoch": 4.668820678513732, "grad_norm": 2.9745860027066313, "learning_rate": 2.773133764556181e-06, "loss": 0.081, "step": 2890 }, { "epoch": 4.670436187399031, "grad_norm": 2.5730780633045853, "learning_rate": 2.77186841849758e-06, "loss": 0.0728, "step": 2891 }, { "epoch": 4.6720516962843295, "grad_norm": 2.630263435433219, "learning_rate": 2.7706030019553625e-06, "loss": 0.0647, "step": 2892 }, { "epoch": 4.673667205169629, "grad_norm": 3.3852312325231266, "learning_rate": 2.7693375152575985e-06, "loss": 0.0792, "step": 2893 }, { "epoch": 4.675282714054927, "grad_norm": 2.6418635895037976, "learning_rate": 2.768071958732372e-06, "loss": 0.0622, "step": 2894 }, { "epoch": 4.676898222940226, "grad_norm": 2.974443600133993, "learning_rate": 2.7668063327077875e-06, "loss": 0.0749, "step": 2895 }, { "epoch": 4.678513731825525, "grad_norm": 2.7705837859520366, "learning_rate": 2.7655406375119666e-06, "loss": 0.072, "step": 2896 }, { "epoch": 4.680129240710824, "grad_norm": 2.53315733725435, "learning_rate": 2.7642748734730473e-06, "loss": 0.0684, "step": 2897 }, { "epoch": 4.681744749596123, "grad_norm": 2.6529052418585137, "learning_rate": 2.763009040919189e-06, "loss": 0.0628, "step": 2898 }, { "epoch": 4.683360258481422, "grad_norm": 3.044388112299912, "learning_rate": 2.7617431401785655e-06, "loss": 0.0757, "step": 2899 }, { "epoch": 4.684975767366721, "grad_norm": 2.6272276015846803, "learning_rate": 2.76047717157937e-06, "loss": 0.0769, "step": 2900 }, { "epoch": 4.686591276252019, "grad_norm": 3.0510034916532933, "learning_rate": 2.759211135449813e-06, "loss": 0.0947, "step": 2901 }, { "epoch": 4.688206785137318, "grad_norm": 3.143450071225111, "learning_rate": 2.757945032118122e-06, "loss": 0.0816, "step": 2902 }, { "epoch": 4.689822294022617, "grad_norm": 3.0069124915571823, "learning_rate": 2.756678861912543e-06, "loss": 0.0735, "step": 2903 }, { "epoch": 4.691437802907916, "grad_norm": 2.5465876583364575, "learning_rate": 2.7554126251613376e-06, "loss": 0.0551, "step": 2904 }, { "epoch": 4.693053311793215, "grad_norm": 2.3067434251702568, "learning_rate": 2.7541463221927865e-06, "loss": 0.0586, "step": 2905 }, { "epoch": 4.694668820678514, "grad_norm": 2.6375758120851764, "learning_rate": 2.7528799533351874e-06, "loss": 0.0625, "step": 2906 }, { "epoch": 4.696284329563813, "grad_norm": 3.0892408119705252, "learning_rate": 2.7516135189168536e-06, "loss": 0.0784, "step": 2907 }, { "epoch": 4.6978998384491115, "grad_norm": 2.760376310911798, "learning_rate": 2.750347019266116e-06, "loss": 0.0819, "step": 2908 }, { "epoch": 4.69951534733441, "grad_norm": 3.11059053999567, "learning_rate": 2.7490804547113236e-06, "loss": 0.0649, "step": 2909 }, { "epoch": 4.701130856219709, "grad_norm": 3.0613975810684653, "learning_rate": 2.74781382558084e-06, "loss": 0.0843, "step": 2910 }, { "epoch": 4.702746365105008, "grad_norm": 2.5562426727056216, "learning_rate": 2.7465471322030492e-06, "loss": 0.0673, "step": 2911 }, { "epoch": 4.704361873990307, "grad_norm": 2.4534020226459012, "learning_rate": 2.745280374906347e-06, "loss": 0.0688, "step": 2912 }, { "epoch": 4.705977382875606, "grad_norm": 2.9523227701767234, "learning_rate": 2.7440135540191494e-06, "loss": 0.0814, "step": 2913 }, { "epoch": 4.707592891760905, "grad_norm": 3.3852054937820353, "learning_rate": 2.742746669869887e-06, "loss": 0.076, "step": 2914 }, { "epoch": 4.709208400646204, "grad_norm": 2.668265130074414, "learning_rate": 2.7414797227870087e-06, "loss": 0.0651, "step": 2915 }, { "epoch": 4.710823909531502, "grad_norm": 2.8492462982642963, "learning_rate": 2.7402127130989782e-06, "loss": 0.0767, "step": 2916 }, { "epoch": 4.7124394184168015, "grad_norm": 2.6585712806013384, "learning_rate": 2.738945641134276e-06, "loss": 0.0552, "step": 2917 }, { "epoch": 4.7140549273021, "grad_norm": 2.6127679478087122, "learning_rate": 2.737678507221398e-06, "loss": 0.0595, "step": 2918 }, { "epoch": 4.715670436187399, "grad_norm": 3.3105588726618373, "learning_rate": 2.736411311688856e-06, "loss": 0.0713, "step": 2919 }, { "epoch": 4.717285945072698, "grad_norm": 2.2356603831089297, "learning_rate": 2.735144054865181e-06, "loss": 0.0609, "step": 2920 }, { "epoch": 4.718901453957997, "grad_norm": 2.188529671589261, "learning_rate": 2.7338767370789155e-06, "loss": 0.06, "step": 2921 }, { "epoch": 4.720516962843296, "grad_norm": 2.794199256034624, "learning_rate": 2.7326093586586187e-06, "loss": 0.0677, "step": 2922 }, { "epoch": 4.722132471728594, "grad_norm": 2.330043658837201, "learning_rate": 2.7313419199328693e-06, "loss": 0.0643, "step": 2923 }, { "epoch": 4.723747980613894, "grad_norm": 2.65846442170518, "learning_rate": 2.7300744212302564e-06, "loss": 0.0504, "step": 2924 }, { "epoch": 4.725363489499192, "grad_norm": 2.6516013831135687, "learning_rate": 2.728806862879389e-06, "loss": 0.0701, "step": 2925 }, { "epoch": 4.726978998384491, "grad_norm": 3.5280866908758184, "learning_rate": 2.727539245208888e-06, "loss": 0.0839, "step": 2926 }, { "epoch": 4.72859450726979, "grad_norm": 2.6194304636168897, "learning_rate": 2.7262715685473928e-06, "loss": 0.0736, "step": 2927 }, { "epoch": 4.730210016155089, "grad_norm": 2.597390130852563, "learning_rate": 2.7250038332235553e-06, "loss": 0.0595, "step": 2928 }, { "epoch": 4.731825525040388, "grad_norm": 3.5682566589087883, "learning_rate": 2.723736039566045e-06, "loss": 0.0871, "step": 2929 }, { "epoch": 4.733441033925686, "grad_norm": 3.0367324669316713, "learning_rate": 2.722468187903544e-06, "loss": 0.0679, "step": 2930 }, { "epoch": 4.735056542810986, "grad_norm": 3.2443506920330014, "learning_rate": 2.7212002785647527e-06, "loss": 0.0699, "step": 2931 }, { "epoch": 4.736672051696284, "grad_norm": 2.9587127308368135, "learning_rate": 2.7199323118783838e-06, "loss": 0.0584, "step": 2932 }, { "epoch": 4.7382875605815835, "grad_norm": 2.767162499486999, "learning_rate": 2.7186642881731657e-06, "loss": 0.0682, "step": 2933 }, { "epoch": 4.739903069466882, "grad_norm": 2.4636842536591566, "learning_rate": 2.7173962077778417e-06, "loss": 0.0641, "step": 2934 }, { "epoch": 4.741518578352181, "grad_norm": 3.3850157282127777, "learning_rate": 2.716128071021169e-06, "loss": 0.0665, "step": 2935 }, { "epoch": 4.74313408723748, "grad_norm": 2.8383355761516915, "learning_rate": 2.7148598782319214e-06, "loss": 0.0681, "step": 2936 }, { "epoch": 4.744749596122778, "grad_norm": 2.9907293622199944, "learning_rate": 2.7135916297388852e-06, "loss": 0.071, "step": 2937 }, { "epoch": 4.746365105008078, "grad_norm": 2.448197630457801, "learning_rate": 2.712323325870862e-06, "loss": 0.0497, "step": 2938 }, { "epoch": 4.747980613893376, "grad_norm": 3.023717073961894, "learning_rate": 2.711054966956668e-06, "loss": 0.0737, "step": 2939 }, { "epoch": 4.749596122778676, "grad_norm": 2.829995591008054, "learning_rate": 2.7097865533251316e-06, "loss": 0.0688, "step": 2940 }, { "epoch": 4.751211631663974, "grad_norm": 2.649982378431304, "learning_rate": 2.7085180853051e-06, "loss": 0.0699, "step": 2941 }, { "epoch": 4.7528271405492735, "grad_norm": 2.554947911429926, "learning_rate": 2.7072495632254285e-06, "loss": 0.0751, "step": 2942 }, { "epoch": 4.754442649434572, "grad_norm": 2.835163003790563, "learning_rate": 2.705980987414992e-06, "loss": 0.0637, "step": 2943 }, { "epoch": 4.75605815831987, "grad_norm": 3.084931071094347, "learning_rate": 2.7047123582026758e-06, "loss": 0.0745, "step": 2944 }, { "epoch": 4.75767366720517, "grad_norm": 2.82948319427266, "learning_rate": 2.70344367591738e-06, "loss": 0.0627, "step": 2945 }, { "epoch": 4.759289176090468, "grad_norm": 3.277120774177692, "learning_rate": 2.702174940888018e-06, "loss": 0.1049, "step": 2946 }, { "epoch": 4.760904684975768, "grad_norm": 2.702600166596996, "learning_rate": 2.700906153443519e-06, "loss": 0.0654, "step": 2947 }, { "epoch": 4.762520193861066, "grad_norm": 4.095978097257309, "learning_rate": 2.6996373139128234e-06, "loss": 0.0672, "step": 2948 }, { "epoch": 4.7641357027463656, "grad_norm": 2.783017492725525, "learning_rate": 2.6983684226248853e-06, "loss": 0.0716, "step": 2949 }, { "epoch": 4.765751211631664, "grad_norm": 2.5321580704902225, "learning_rate": 2.6970994799086735e-06, "loss": 0.0571, "step": 2950 }, { "epoch": 4.7673667205169625, "grad_norm": 3.896114220898301, "learning_rate": 2.69583048609317e-06, "loss": 0.0753, "step": 2951 }, { "epoch": 4.768982229402262, "grad_norm": 3.362008795978501, "learning_rate": 2.694561441507368e-06, "loss": 0.0738, "step": 2952 }, { "epoch": 4.77059773828756, "grad_norm": 3.4521567321616065, "learning_rate": 2.6932923464802762e-06, "loss": 0.0872, "step": 2953 }, { "epoch": 4.77221324717286, "grad_norm": 2.2678465344763357, "learning_rate": 2.6920232013409155e-06, "loss": 0.0569, "step": 2954 }, { "epoch": 4.773828756058158, "grad_norm": 2.5262033182811274, "learning_rate": 2.6907540064183203e-06, "loss": 0.0726, "step": 2955 }, { "epoch": 4.775444264943458, "grad_norm": 2.6235392955846115, "learning_rate": 2.689484762041537e-06, "loss": 0.0563, "step": 2956 }, { "epoch": 4.777059773828756, "grad_norm": 2.600860590971953, "learning_rate": 2.6882154685396245e-06, "loss": 0.0694, "step": 2957 }, { "epoch": 4.778675282714055, "grad_norm": 2.9305274338658314, "learning_rate": 2.6869461262416567e-06, "loss": 0.0672, "step": 2958 }, { "epoch": 4.780290791599354, "grad_norm": 2.4003672826231504, "learning_rate": 2.685676735476718e-06, "loss": 0.0668, "step": 2959 }, { "epoch": 4.7819063004846525, "grad_norm": 3.1223738693335985, "learning_rate": 2.6844072965739053e-06, "loss": 0.0709, "step": 2960 }, { "epoch": 4.783521809369952, "grad_norm": 2.8031134964591806, "learning_rate": 2.683137809862331e-06, "loss": 0.09, "step": 2961 }, { "epoch": 4.78513731825525, "grad_norm": 2.4972795495830598, "learning_rate": 2.681868275671114e-06, "loss": 0.0678, "step": 2962 }, { "epoch": 4.78675282714055, "grad_norm": 2.708499515206814, "learning_rate": 2.680598694329392e-06, "loss": 0.0548, "step": 2963 }, { "epoch": 4.788368336025848, "grad_norm": 2.6175529883369038, "learning_rate": 2.679329066166311e-06, "loss": 0.0748, "step": 2964 }, { "epoch": 4.789983844911147, "grad_norm": 2.795194249819967, "learning_rate": 2.67805939151103e-06, "loss": 0.0667, "step": 2965 }, { "epoch": 4.791599353796446, "grad_norm": 2.5504626946838513, "learning_rate": 2.676789670692721e-06, "loss": 0.0661, "step": 2966 }, { "epoch": 4.793214862681745, "grad_norm": 2.2117507549084974, "learning_rate": 2.6755199040405668e-06, "loss": 0.0555, "step": 2967 }, { "epoch": 4.794830371567044, "grad_norm": 2.958221968109627, "learning_rate": 2.6742500918837614e-06, "loss": 0.0619, "step": 2968 }, { "epoch": 4.796445880452342, "grad_norm": 2.7436823234743977, "learning_rate": 2.672980234551513e-06, "loss": 0.0711, "step": 2969 }, { "epoch": 4.798061389337642, "grad_norm": 2.877071953202579, "learning_rate": 2.6717103323730396e-06, "loss": 0.0663, "step": 2970 }, { "epoch": 4.79967689822294, "grad_norm": 3.0123028230794953, "learning_rate": 2.6704403856775723e-06, "loss": 0.0835, "step": 2971 }, { "epoch": 4.801292407108239, "grad_norm": 2.9974225217729127, "learning_rate": 2.6691703947943515e-06, "loss": 0.0799, "step": 2972 }, { "epoch": 4.802907915993538, "grad_norm": 2.9553895892145974, "learning_rate": 2.667900360052632e-06, "loss": 0.0841, "step": 2973 }, { "epoch": 4.804523424878837, "grad_norm": 2.572635133676932, "learning_rate": 2.6666302817816764e-06, "loss": 0.0695, "step": 2974 }, { "epoch": 4.806138933764136, "grad_norm": 2.8530561858076258, "learning_rate": 2.6653601603107622e-06, "loss": 0.0703, "step": 2975 }, { "epoch": 4.8077544426494345, "grad_norm": 3.733193566430678, "learning_rate": 2.6640899959691754e-06, "loss": 0.112, "step": 2976 }, { "epoch": 4.809369951534734, "grad_norm": 2.595390324196754, "learning_rate": 2.662819789086215e-06, "loss": 0.0684, "step": 2977 }, { "epoch": 4.810985460420032, "grad_norm": 3.068548835604169, "learning_rate": 2.66154953999119e-06, "loss": 0.0737, "step": 2978 }, { "epoch": 4.812600969305331, "grad_norm": 2.3563105247116214, "learning_rate": 2.66027924901342e-06, "loss": 0.0748, "step": 2979 }, { "epoch": 4.81421647819063, "grad_norm": 2.63657541545744, "learning_rate": 2.659008916482236e-06, "loss": 0.0609, "step": 2980 }, { "epoch": 4.815831987075929, "grad_norm": 3.2268649155585347, "learning_rate": 2.6577385427269815e-06, "loss": 0.0695, "step": 2981 }, { "epoch": 4.817447495961228, "grad_norm": 2.874492307679095, "learning_rate": 2.6564681280770066e-06, "loss": 0.0623, "step": 2982 }, { "epoch": 4.819063004846527, "grad_norm": 2.760069556815613, "learning_rate": 2.6551976728616756e-06, "loss": 0.0742, "step": 2983 }, { "epoch": 4.820678513731826, "grad_norm": 2.9744760142851696, "learning_rate": 2.653927177410362e-06, "loss": 0.0684, "step": 2984 }, { "epoch": 4.8222940226171245, "grad_norm": 2.5585036039412614, "learning_rate": 2.6526566420524496e-06, "loss": 0.0529, "step": 2985 }, { "epoch": 4.823909531502423, "grad_norm": 2.59615876333752, "learning_rate": 2.6513860671173332e-06, "loss": 0.0742, "step": 2986 }, { "epoch": 4.825525040387722, "grad_norm": 2.637739968158558, "learning_rate": 2.6501154529344162e-06, "loss": 0.0711, "step": 2987 }, { "epoch": 4.827140549273021, "grad_norm": 2.467947973064352, "learning_rate": 2.6488447998331153e-06, "loss": 0.05, "step": 2988 }, { "epoch": 4.82875605815832, "grad_norm": 3.0450550191454298, "learning_rate": 2.647574108142854e-06, "loss": 0.0586, "step": 2989 }, { "epoch": 4.830371567043619, "grad_norm": 2.736601131117551, "learning_rate": 2.6463033781930673e-06, "loss": 0.065, "step": 2990 }, { "epoch": 4.831987075928918, "grad_norm": 2.967035936604886, "learning_rate": 2.6450326103132015e-06, "loss": 0.0656, "step": 2991 }, { "epoch": 4.833602584814217, "grad_norm": 2.5563491440836015, "learning_rate": 2.6437618048327087e-06, "loss": 0.068, "step": 2992 }, { "epoch": 4.835218093699515, "grad_norm": 2.389644939287956, "learning_rate": 2.6424909620810553e-06, "loss": 0.052, "step": 2993 }, { "epoch": 4.836833602584814, "grad_norm": 2.3279297390392713, "learning_rate": 2.6412200823877145e-06, "loss": 0.0657, "step": 2994 }, { "epoch": 4.838449111470113, "grad_norm": 2.9621756944564486, "learning_rate": 2.6399491660821696e-06, "loss": 0.0855, "step": 2995 }, { "epoch": 4.840064620355412, "grad_norm": 3.1021370641023114, "learning_rate": 2.638678213493915e-06, "loss": 0.0715, "step": 2996 }, { "epoch": 4.841680129240711, "grad_norm": 2.5737211267559923, "learning_rate": 2.6374072249524524e-06, "loss": 0.0612, "step": 2997 }, { "epoch": 4.84329563812601, "grad_norm": 2.4308036047050336, "learning_rate": 2.636136200787293e-06, "loss": 0.0711, "step": 2998 }, { "epoch": 4.844911147011309, "grad_norm": 2.488311110285427, "learning_rate": 2.6348651413279598e-06, "loss": 0.0686, "step": 2999 }, { "epoch": 4.846526655896607, "grad_norm": 2.995527260358785, "learning_rate": 2.633594046903981e-06, "loss": 0.0623, "step": 3000 }, { "epoch": 4.846526655896607, "eval_loss": 1.4169965982437134, "eval_runtime": 2.3432, "eval_samples_per_second": 128.031, "eval_steps_per_second": 2.987, "step": 3000 }, { "epoch": 4.8481421647819065, "grad_norm": 3.0457498223350656, "learning_rate": 2.6323229178448974e-06, "loss": 0.0684, "step": 3001 }, { "epoch": 4.849757673667205, "grad_norm": 3.071043374776576, "learning_rate": 2.6310517544802556e-06, "loss": 0.0859, "step": 3002 }, { "epoch": 4.851373182552504, "grad_norm": 2.7436170992603857, "learning_rate": 2.6297805571396146e-06, "loss": 0.0629, "step": 3003 }, { "epoch": 4.852988691437803, "grad_norm": 3.1185678360590066, "learning_rate": 2.6285093261525397e-06, "loss": 0.0736, "step": 3004 }, { "epoch": 4.854604200323102, "grad_norm": 2.6023634094043513, "learning_rate": 2.6272380618486054e-06, "loss": 0.0811, "step": 3005 }, { "epoch": 4.856219709208401, "grad_norm": 2.6310364297127182, "learning_rate": 2.6259667645573948e-06, "loss": 0.0781, "step": 3006 }, { "epoch": 4.857835218093699, "grad_norm": 2.453385298816667, "learning_rate": 2.6246954346085015e-06, "loss": 0.0635, "step": 3007 }, { "epoch": 4.859450726978999, "grad_norm": 2.822086659319813, "learning_rate": 2.6234240723315234e-06, "loss": 0.0947, "step": 3008 }, { "epoch": 4.861066235864297, "grad_norm": 3.2652696847156975, "learning_rate": 2.6221526780560703e-06, "loss": 0.0831, "step": 3009 }, { "epoch": 4.8626817447495965, "grad_norm": 2.610443254944501, "learning_rate": 2.62088125211176e-06, "loss": 0.0547, "step": 3010 }, { "epoch": 4.864297253634895, "grad_norm": 2.3519777656697896, "learning_rate": 2.619609794828216e-06, "loss": 0.0547, "step": 3011 }, { "epoch": 4.865912762520194, "grad_norm": 2.923957542444172, "learning_rate": 2.6183383065350743e-06, "loss": 0.0646, "step": 3012 }, { "epoch": 4.867528271405493, "grad_norm": 2.799136664774208, "learning_rate": 2.6170667875619744e-06, "loss": 0.0718, "step": 3013 }, { "epoch": 4.869143780290791, "grad_norm": 2.7244563057101048, "learning_rate": 2.6157952382385654e-06, "loss": 0.0784, "step": 3014 }, { "epoch": 4.870759289176091, "grad_norm": 2.7526174989309533, "learning_rate": 2.6145236588945055e-06, "loss": 0.0631, "step": 3015 }, { "epoch": 4.872374798061389, "grad_norm": 2.8228277079146022, "learning_rate": 2.61325204985946e-06, "loss": 0.0711, "step": 3016 }, { "epoch": 4.8739903069466886, "grad_norm": 3.449694928914453, "learning_rate": 2.6119804114631e-06, "loss": 0.0666, "step": 3017 }, { "epoch": 4.875605815831987, "grad_norm": 2.478017360124844, "learning_rate": 2.6107087440351076e-06, "loss": 0.0641, "step": 3018 }, { "epoch": 4.877221324717286, "grad_norm": 2.832766820213541, "learning_rate": 2.60943704790517e-06, "loss": 0.0618, "step": 3019 }, { "epoch": 4.878836833602585, "grad_norm": 2.658132930943461, "learning_rate": 2.608165323402982e-06, "loss": 0.0652, "step": 3020 }, { "epoch": 4.880452342487883, "grad_norm": 2.58347817006551, "learning_rate": 2.6068935708582466e-06, "loss": 0.0605, "step": 3021 }, { "epoch": 4.882067851373183, "grad_norm": 2.785843140142494, "learning_rate": 2.6056217906006744e-06, "loss": 0.0706, "step": 3022 }, { "epoch": 4.883683360258481, "grad_norm": 3.426147975454112, "learning_rate": 2.6043499829599814e-06, "loss": 0.0789, "step": 3023 }, { "epoch": 4.885298869143781, "grad_norm": 3.4568583371444253, "learning_rate": 2.603078148265893e-06, "loss": 0.0816, "step": 3024 }, { "epoch": 4.886914378029079, "grad_norm": 2.6863268407563545, "learning_rate": 2.6018062868481387e-06, "loss": 0.071, "step": 3025 }, { "epoch": 4.8885298869143785, "grad_norm": 2.774190597401922, "learning_rate": 2.600534399036459e-06, "loss": 0.0703, "step": 3026 }, { "epoch": 4.890145395799677, "grad_norm": 2.7141263749831848, "learning_rate": 2.599262485160597e-06, "loss": 0.0679, "step": 3027 }, { "epoch": 4.8917609046849755, "grad_norm": 2.8807366444031315, "learning_rate": 2.5979905455503044e-06, "loss": 0.0565, "step": 3028 }, { "epoch": 4.893376413570275, "grad_norm": 2.6662841875938326, "learning_rate": 2.596718580535342e-06, "loss": 0.0497, "step": 3029 }, { "epoch": 4.894991922455573, "grad_norm": 3.250433995891723, "learning_rate": 2.595446590445471e-06, "loss": 0.0837, "step": 3030 }, { "epoch": 4.896607431340873, "grad_norm": 2.2791320230262553, "learning_rate": 2.594174575610467e-06, "loss": 0.0655, "step": 3031 }, { "epoch": 4.898222940226171, "grad_norm": 2.8515972676247725, "learning_rate": 2.5929025363601047e-06, "loss": 0.0645, "step": 3032 }, { "epoch": 4.899838449111471, "grad_norm": 2.847674926238003, "learning_rate": 2.59163047302417e-06, "loss": 0.0633, "step": 3033 }, { "epoch": 4.901453957996769, "grad_norm": 2.4953201374658476, "learning_rate": 2.5903583859324528e-06, "loss": 0.0646, "step": 3034 }, { "epoch": 4.903069466882068, "grad_norm": 2.9459266678088882, "learning_rate": 2.5890862754147507e-06, "loss": 0.0906, "step": 3035 }, { "epoch": 4.904684975767367, "grad_norm": 2.7431646321279137, "learning_rate": 2.5878141418008645e-06, "loss": 0.0797, "step": 3036 }, { "epoch": 4.906300484652665, "grad_norm": 3.062149990286623, "learning_rate": 2.5865419854206055e-06, "loss": 0.0814, "step": 3037 }, { "epoch": 4.907915993537965, "grad_norm": 3.3957158613506997, "learning_rate": 2.5852698066037863e-06, "loss": 0.0737, "step": 3038 }, { "epoch": 4.909531502423263, "grad_norm": 2.768743441546394, "learning_rate": 2.5839976056802274e-06, "loss": 0.0614, "step": 3039 }, { "epoch": 4.911147011308563, "grad_norm": 3.07961957438437, "learning_rate": 2.5827253829797565e-06, "loss": 0.0719, "step": 3040 }, { "epoch": 4.912762520193861, "grad_norm": 3.02528499696071, "learning_rate": 2.5814531388322038e-06, "loss": 0.0632, "step": 3041 }, { "epoch": 4.91437802907916, "grad_norm": 3.825553177814605, "learning_rate": 2.580180873567408e-06, "loss": 0.0836, "step": 3042 }, { "epoch": 4.915993537964459, "grad_norm": 2.8311154773132436, "learning_rate": 2.5789085875152115e-06, "loss": 0.0705, "step": 3043 }, { "epoch": 4.9176090468497575, "grad_norm": 2.2900597690899778, "learning_rate": 2.577636281005461e-06, "loss": 0.0703, "step": 3044 }, { "epoch": 4.919224555735057, "grad_norm": 2.9900983166592847, "learning_rate": 2.576363954368013e-06, "loss": 0.0814, "step": 3045 }, { "epoch": 4.920840064620355, "grad_norm": 2.724743810144727, "learning_rate": 2.5750916079327237e-06, "loss": 0.0633, "step": 3046 }, { "epoch": 4.922455573505655, "grad_norm": 2.5168450377364504, "learning_rate": 2.5738192420294585e-06, "loss": 0.0655, "step": 3047 }, { "epoch": 4.924071082390953, "grad_norm": 2.3516340613101976, "learning_rate": 2.572546856988085e-06, "loss": 0.0583, "step": 3048 }, { "epoch": 4.925686591276252, "grad_norm": 3.1802045241048886, "learning_rate": 2.571274453138479e-06, "loss": 0.0765, "step": 3049 }, { "epoch": 4.927302100161551, "grad_norm": 2.863562190354993, "learning_rate": 2.5700020308105173e-06, "loss": 0.0663, "step": 3050 }, { "epoch": 4.92891760904685, "grad_norm": 2.2892317181682698, "learning_rate": 2.5687295903340847e-06, "loss": 0.0499, "step": 3051 }, { "epoch": 4.930533117932149, "grad_norm": 2.8419580244066895, "learning_rate": 2.567457132039069e-06, "loss": 0.0666, "step": 3052 }, { "epoch": 4.9321486268174475, "grad_norm": 2.8886849507050534, "learning_rate": 2.566184656255364e-06, "loss": 0.074, "step": 3053 }, { "epoch": 4.933764135702747, "grad_norm": 2.7473163735165476, "learning_rate": 2.564912163312866e-06, "loss": 0.0638, "step": 3054 }, { "epoch": 4.935379644588045, "grad_norm": 2.6035189937976884, "learning_rate": 2.5636396535414765e-06, "loss": 0.0641, "step": 3055 }, { "epoch": 4.936995153473344, "grad_norm": 3.0949217358420733, "learning_rate": 2.562367127271103e-06, "loss": 0.0736, "step": 3056 }, { "epoch": 4.938610662358643, "grad_norm": 2.9292576331042204, "learning_rate": 2.561094584831656e-06, "loss": 0.0718, "step": 3057 }, { "epoch": 4.940226171243942, "grad_norm": 3.150861974021073, "learning_rate": 2.5598220265530487e-06, "loss": 0.079, "step": 3058 }, { "epoch": 4.941841680129241, "grad_norm": 2.630537066110635, "learning_rate": 2.5585494527652023e-06, "loss": 0.0658, "step": 3059 }, { "epoch": 4.94345718901454, "grad_norm": 2.7684601808220175, "learning_rate": 2.557276863798037e-06, "loss": 0.0658, "step": 3060 }, { "epoch": 4.945072697899839, "grad_norm": 2.4907571823336845, "learning_rate": 2.5560042599814817e-06, "loss": 0.0653, "step": 3061 }, { "epoch": 4.946688206785137, "grad_norm": 2.6067136957135224, "learning_rate": 2.5547316416454656e-06, "loss": 0.0619, "step": 3062 }, { "epoch": 4.948303715670436, "grad_norm": 3.2131267717072944, "learning_rate": 2.553459009119923e-06, "loss": 0.0761, "step": 3063 }, { "epoch": 4.949919224555735, "grad_norm": 2.7710020956620562, "learning_rate": 2.552186362734794e-06, "loss": 0.079, "step": 3064 }, { "epoch": 4.951534733441034, "grad_norm": 2.811559401785304, "learning_rate": 2.550913702820019e-06, "loss": 0.0646, "step": 3065 }, { "epoch": 4.953150242326333, "grad_norm": 2.627752700571601, "learning_rate": 2.549641029705542e-06, "loss": 0.0794, "step": 3066 }, { "epoch": 4.954765751211632, "grad_norm": 2.406020073716612, "learning_rate": 2.5483683437213134e-06, "loss": 0.061, "step": 3067 }, { "epoch": 4.956381260096931, "grad_norm": 2.653852168338536, "learning_rate": 2.5470956451972845e-06, "loss": 0.0675, "step": 3068 }, { "epoch": 4.9579967689822295, "grad_norm": 2.645851913892567, "learning_rate": 2.5458229344634094e-06, "loss": 0.0837, "step": 3069 }, { "epoch": 4.959612277867528, "grad_norm": 2.6211784326482994, "learning_rate": 2.5445502118496487e-06, "loss": 0.0726, "step": 3070 }, { "epoch": 4.961227786752827, "grad_norm": 2.694201237485074, "learning_rate": 2.5432774776859614e-06, "loss": 0.0563, "step": 3071 }, { "epoch": 4.962843295638126, "grad_norm": 2.4153377366635977, "learning_rate": 2.542004732302314e-06, "loss": 0.0792, "step": 3072 }, { "epoch": 4.964458804523425, "grad_norm": 3.01376837648797, "learning_rate": 2.5407319760286725e-06, "loss": 0.0595, "step": 3073 }, { "epoch": 4.966074313408724, "grad_norm": 2.5532064827962193, "learning_rate": 2.539459209195007e-06, "loss": 0.0598, "step": 3074 }, { "epoch": 4.967689822294023, "grad_norm": 3.2062526353052423, "learning_rate": 2.5381864321312914e-06, "loss": 0.0749, "step": 3075 }, { "epoch": 4.969305331179322, "grad_norm": 2.6030379413946085, "learning_rate": 2.536913645167501e-06, "loss": 0.0622, "step": 3076 }, { "epoch": 4.97092084006462, "grad_norm": 2.542500533300965, "learning_rate": 2.5356408486336127e-06, "loss": 0.0917, "step": 3077 }, { "epoch": 4.9725363489499195, "grad_norm": 3.105164998959881, "learning_rate": 2.5343680428596086e-06, "loss": 0.0671, "step": 3078 }, { "epoch": 4.974151857835218, "grad_norm": 3.277131753259526, "learning_rate": 2.5330952281754713e-06, "loss": 0.0766, "step": 3079 }, { "epoch": 4.975767366720517, "grad_norm": 3.4570333034604364, "learning_rate": 2.5318224049111843e-06, "loss": 0.0672, "step": 3080 }, { "epoch": 4.977382875605816, "grad_norm": 2.6954399917515763, "learning_rate": 2.5305495733967383e-06, "loss": 0.0645, "step": 3081 }, { "epoch": 4.978998384491114, "grad_norm": 5.304194279557486, "learning_rate": 2.52927673396212e-06, "loss": 0.074, "step": 3082 }, { "epoch": 4.980613893376414, "grad_norm": 3.494242357518065, "learning_rate": 2.5280038869373234e-06, "loss": 0.0832, "step": 3083 }, { "epoch": 4.982229402261712, "grad_norm": 2.7427428830340936, "learning_rate": 2.526731032652341e-06, "loss": 0.0684, "step": 3084 }, { "epoch": 4.983844911147012, "grad_norm": 2.7032549313710814, "learning_rate": 2.525458171437168e-06, "loss": 0.0719, "step": 3085 }, { "epoch": 4.98546042003231, "grad_norm": 2.2572089678594787, "learning_rate": 2.5241853036218027e-06, "loss": 0.0564, "step": 3086 }, { "epoch": 4.987075928917609, "grad_norm": 2.7481980259558476, "learning_rate": 2.5229124295362444e-06, "loss": 0.0478, "step": 3087 }, { "epoch": 4.988691437802908, "grad_norm": 2.9451303217850047, "learning_rate": 2.5216395495104923e-06, "loss": 0.0839, "step": 3088 }, { "epoch": 4.990306946688206, "grad_norm": 2.689838740705782, "learning_rate": 2.52036666387455e-06, "loss": 0.0584, "step": 3089 }, { "epoch": 4.991922455573506, "grad_norm": 3.0677622194579057, "learning_rate": 2.5190937729584203e-06, "loss": 0.0619, "step": 3090 }, { "epoch": 4.993537964458804, "grad_norm": 2.798704813926077, "learning_rate": 2.517820877092108e-06, "loss": 0.0831, "step": 3091 }, { "epoch": 4.995153473344104, "grad_norm": 2.4498985779847713, "learning_rate": 2.5165479766056217e-06, "loss": 0.049, "step": 3092 }, { "epoch": 4.996768982229402, "grad_norm": 2.808590129332573, "learning_rate": 2.515275071828966e-06, "loss": 0.0752, "step": 3093 }, { "epoch": 4.9983844911147015, "grad_norm": 2.8939833319319543, "learning_rate": 2.514002163092152e-06, "loss": 0.0729, "step": 3094 }, { "epoch": 5.0, "grad_norm": 2.961671475829401, "learning_rate": 2.512729250725188e-06, "loss": 0.0512, "step": 3095 }, { "epoch": 5.0016155088852985, "grad_norm": 1.6833770924469575, "learning_rate": 2.511456335058084e-06, "loss": 0.0295, "step": 3096 }, { "epoch": 5.003231017770598, "grad_norm": 1.6163460244121282, "learning_rate": 2.5101834164208527e-06, "loss": 0.0296, "step": 3097 }, { "epoch": 5.004846526655896, "grad_norm": 2.187886861099187, "learning_rate": 2.5089104951435067e-06, "loss": 0.0298, "step": 3098 }, { "epoch": 5.006462035541196, "grad_norm": 1.770372551436343, "learning_rate": 2.5076375715560576e-06, "loss": 0.025, "step": 3099 }, { "epoch": 5.008077544426494, "grad_norm": 1.9289803690005525, "learning_rate": 2.5063646459885194e-06, "loss": 0.0224, "step": 3100 }, { "epoch": 5.009693053311794, "grad_norm": 1.6357780386401295, "learning_rate": 2.505091718770906e-06, "loss": 0.0268, "step": 3101 }, { "epoch": 5.011308562197092, "grad_norm": 1.6971414642949918, "learning_rate": 2.5038187902332333e-06, "loss": 0.0301, "step": 3102 }, { "epoch": 5.012924071082391, "grad_norm": 2.328578891448793, "learning_rate": 2.5025458607055146e-06, "loss": 0.0302, "step": 3103 }, { "epoch": 5.01453957996769, "grad_norm": 1.7170868112700377, "learning_rate": 2.5012729305177645e-06, "loss": 0.0263, "step": 3104 }, { "epoch": 5.016155088852988, "grad_norm": 2.4574220966640157, "learning_rate": 2.5e-06, "loss": 0.0327, "step": 3105 }, { "epoch": 5.017770597738288, "grad_norm": 1.9415705009863786, "learning_rate": 2.498727069482236e-06, "loss": 0.0288, "step": 3106 }, { "epoch": 5.019386106623586, "grad_norm": 2.0643973030797196, "learning_rate": 2.4974541392944867e-06, "loss": 0.0304, "step": 3107 }, { "epoch": 5.021001615508886, "grad_norm": 2.1010560122143023, "learning_rate": 2.4961812097667667e-06, "loss": 0.0238, "step": 3108 }, { "epoch": 5.022617124394184, "grad_norm": 1.9366281729849664, "learning_rate": 2.4949082812290935e-06, "loss": 0.0278, "step": 3109 }, { "epoch": 5.024232633279483, "grad_norm": 1.94521594773587, "learning_rate": 2.493635354011481e-06, "loss": 0.0256, "step": 3110 }, { "epoch": 5.025848142164782, "grad_norm": 1.752499094021835, "learning_rate": 2.4923624284439437e-06, "loss": 0.0207, "step": 3111 }, { "epoch": 5.0274636510500805, "grad_norm": 1.6984738236420736, "learning_rate": 2.491089504856495e-06, "loss": 0.021, "step": 3112 }, { "epoch": 5.02907915993538, "grad_norm": 1.8112468989492763, "learning_rate": 2.4898165835791486e-06, "loss": 0.0265, "step": 3113 }, { "epoch": 5.030694668820678, "grad_norm": 2.2781820348398307, "learning_rate": 2.4885436649419163e-06, "loss": 0.0332, "step": 3114 }, { "epoch": 5.032310177705978, "grad_norm": 2.6562016892331997, "learning_rate": 2.4872707492748135e-06, "loss": 0.0268, "step": 3115 }, { "epoch": 5.033925686591276, "grad_norm": 2.1853833016986672, "learning_rate": 2.4859978369078493e-06, "loss": 0.0325, "step": 3116 }, { "epoch": 5.035541195476575, "grad_norm": 1.4273644069974367, "learning_rate": 2.484724928171035e-06, "loss": 0.0184, "step": 3117 }, { "epoch": 5.037156704361874, "grad_norm": 2.1635910635641467, "learning_rate": 2.483452023394379e-06, "loss": 0.0324, "step": 3118 }, { "epoch": 5.038772213247173, "grad_norm": 1.7540426877952864, "learning_rate": 2.482179122907892e-06, "loss": 0.024, "step": 3119 }, { "epoch": 5.040387722132472, "grad_norm": 2.0812489664663083, "learning_rate": 2.4809062270415805e-06, "loss": 0.0294, "step": 3120 }, { "epoch": 5.0420032310177705, "grad_norm": 1.9303664119180584, "learning_rate": 2.479633336125451e-06, "loss": 0.0279, "step": 3121 }, { "epoch": 5.04361873990307, "grad_norm": 1.750522815023732, "learning_rate": 2.4783604504895086e-06, "loss": 0.0231, "step": 3122 }, { "epoch": 5.045234248788368, "grad_norm": 2.4644356187586016, "learning_rate": 2.477087570463757e-06, "loss": 0.0393, "step": 3123 }, { "epoch": 5.046849757673667, "grad_norm": 2.2425645831940995, "learning_rate": 2.475814696378198e-06, "loss": 0.035, "step": 3124 }, { "epoch": 5.048465266558966, "grad_norm": 1.86126164256113, "learning_rate": 2.474541828562832e-06, "loss": 0.0247, "step": 3125 }, { "epoch": 5.050080775444265, "grad_norm": 2.07251811159124, "learning_rate": 2.4732689673476595e-06, "loss": 0.0313, "step": 3126 }, { "epoch": 5.051696284329564, "grad_norm": 1.9604151402270453, "learning_rate": 2.4719961130626775e-06, "loss": 0.0272, "step": 3127 }, { "epoch": 5.053311793214863, "grad_norm": 2.0552226899990953, "learning_rate": 2.4707232660378804e-06, "loss": 0.0304, "step": 3128 }, { "epoch": 5.054927302100162, "grad_norm": 1.8122364000496576, "learning_rate": 2.469450426603263e-06, "loss": 0.0302, "step": 3129 }, { "epoch": 5.05654281098546, "grad_norm": 1.992283220057394, "learning_rate": 2.4681775950888157e-06, "loss": 0.0272, "step": 3130 }, { "epoch": 5.058158319870759, "grad_norm": 1.7027417543569572, "learning_rate": 2.46690477182453e-06, "loss": 0.0245, "step": 3131 }, { "epoch": 5.059773828756058, "grad_norm": 1.4544191083302411, "learning_rate": 2.4656319571403923e-06, "loss": 0.0211, "step": 3132 }, { "epoch": 5.061389337641357, "grad_norm": 1.9566915218799315, "learning_rate": 2.464359151366388e-06, "loss": 0.0315, "step": 3133 }, { "epoch": 5.063004846526656, "grad_norm": 2.0111262553912344, "learning_rate": 2.4630863548325004e-06, "loss": 0.0352, "step": 3134 }, { "epoch": 5.064620355411955, "grad_norm": 2.282046049689732, "learning_rate": 2.4618135678687086e-06, "loss": 0.0293, "step": 3135 }, { "epoch": 5.066235864297254, "grad_norm": 1.747973843064433, "learning_rate": 2.460540790804993e-06, "loss": 0.0262, "step": 3136 }, { "epoch": 5.0678513731825525, "grad_norm": 2.0159365848711777, "learning_rate": 2.4592680239713283e-06, "loss": 0.0256, "step": 3137 }, { "epoch": 5.069466882067851, "grad_norm": 1.8999304572707527, "learning_rate": 2.457995267697687e-06, "loss": 0.0247, "step": 3138 }, { "epoch": 5.07108239095315, "grad_norm": 2.1009849338752518, "learning_rate": 2.456722522314039e-06, "loss": 0.0308, "step": 3139 }, { "epoch": 5.072697899838449, "grad_norm": 2.1836574149309933, "learning_rate": 2.4554497881503526e-06, "loss": 0.0301, "step": 3140 }, { "epoch": 5.074313408723748, "grad_norm": 2.1995264490710165, "learning_rate": 2.4541770655365906e-06, "loss": 0.0331, "step": 3141 }, { "epoch": 5.075928917609047, "grad_norm": 1.7766084483119196, "learning_rate": 2.4529043548027164e-06, "loss": 0.0274, "step": 3142 }, { "epoch": 5.077544426494346, "grad_norm": 1.891149763544877, "learning_rate": 2.4516316562786874e-06, "loss": 0.0353, "step": 3143 }, { "epoch": 5.079159935379645, "grad_norm": 1.5438167667064067, "learning_rate": 2.450358970294459e-06, "loss": 0.0254, "step": 3144 }, { "epoch": 5.080775444264943, "grad_norm": 1.336464594909872, "learning_rate": 2.4490862971799824e-06, "loss": 0.0213, "step": 3145 }, { "epoch": 5.0823909531502425, "grad_norm": 1.94722057714888, "learning_rate": 2.4478136372652063e-06, "loss": 0.0283, "step": 3146 }, { "epoch": 5.084006462035541, "grad_norm": 2.120146173922281, "learning_rate": 2.446540990880077e-06, "loss": 0.0305, "step": 3147 }, { "epoch": 5.08562197092084, "grad_norm": 1.9092779378234677, "learning_rate": 2.4452683583545357e-06, "loss": 0.0327, "step": 3148 }, { "epoch": 5.087237479806139, "grad_norm": 1.4933297931617349, "learning_rate": 2.4439957400185196e-06, "loss": 0.0174, "step": 3149 }, { "epoch": 5.088852988691438, "grad_norm": 1.7076385347839433, "learning_rate": 2.442723136201964e-06, "loss": 0.0224, "step": 3150 }, { "epoch": 5.090468497576737, "grad_norm": 1.840420705242728, "learning_rate": 2.4414505472347993e-06, "loss": 0.0298, "step": 3151 }, { "epoch": 5.092084006462035, "grad_norm": 2.088052494591961, "learning_rate": 2.4401779734469517e-06, "loss": 0.0282, "step": 3152 }, { "epoch": 5.093699515347335, "grad_norm": 1.7416092232275906, "learning_rate": 2.4389054151683447e-06, "loss": 0.0268, "step": 3153 }, { "epoch": 5.095315024232633, "grad_norm": 2.032834579420893, "learning_rate": 2.4376328727288977e-06, "loss": 0.036, "step": 3154 }, { "epoch": 5.096930533117932, "grad_norm": 2.0888045670413087, "learning_rate": 2.4363603464585244e-06, "loss": 0.0392, "step": 3155 }, { "epoch": 5.098546042003231, "grad_norm": 1.891688733420965, "learning_rate": 2.4350878366871357e-06, "loss": 0.0233, "step": 3156 }, { "epoch": 5.10016155088853, "grad_norm": 1.383616488794492, "learning_rate": 2.433815343744636e-06, "loss": 0.0181, "step": 3157 }, { "epoch": 5.101777059773829, "grad_norm": 2.0245627817371337, "learning_rate": 2.432542867960931e-06, "loss": 0.0302, "step": 3158 }, { "epoch": 5.103392568659127, "grad_norm": 2.2690746363074212, "learning_rate": 2.4312704096659157e-06, "loss": 0.0299, "step": 3159 }, { "epoch": 5.105008077544427, "grad_norm": 2.179304469161686, "learning_rate": 2.4299979691894836e-06, "loss": 0.0328, "step": 3160 }, { "epoch": 5.106623586429725, "grad_norm": 1.8728606808466755, "learning_rate": 2.428725546861522e-06, "loss": 0.0332, "step": 3161 }, { "epoch": 5.1082390953150245, "grad_norm": 2.0444360688153753, "learning_rate": 2.4274531430119157e-06, "loss": 0.0315, "step": 3162 }, { "epoch": 5.109854604200323, "grad_norm": 1.5621428294304744, "learning_rate": 2.426180757970542e-06, "loss": 0.0242, "step": 3163 }, { "epoch": 5.111470113085622, "grad_norm": 2.195273883472809, "learning_rate": 2.4249083920672767e-06, "loss": 0.029, "step": 3164 }, { "epoch": 5.113085621970921, "grad_norm": 2.3319595007595857, "learning_rate": 2.423636045631988e-06, "loss": 0.0281, "step": 3165 }, { "epoch": 5.114701130856219, "grad_norm": 1.6442659104311792, "learning_rate": 2.4223637189945393e-06, "loss": 0.0253, "step": 3166 }, { "epoch": 5.116316639741519, "grad_norm": 1.8762619462726817, "learning_rate": 2.42109141248479e-06, "loss": 0.0291, "step": 3167 }, { "epoch": 5.117932148626817, "grad_norm": 1.7829886739864127, "learning_rate": 2.4198191264325923e-06, "loss": 0.0254, "step": 3168 }, { "epoch": 5.119547657512117, "grad_norm": 1.9147797507391275, "learning_rate": 2.4185468611677962e-06, "loss": 0.0266, "step": 3169 }, { "epoch": 5.121163166397415, "grad_norm": 2.1914501844344065, "learning_rate": 2.417274617020244e-06, "loss": 0.0304, "step": 3170 }, { "epoch": 5.1227786752827145, "grad_norm": 1.8927101218818743, "learning_rate": 2.416002394319773e-06, "loss": 0.0267, "step": 3171 }, { "epoch": 5.124394184168013, "grad_norm": 2.391237783113441, "learning_rate": 2.414730193396215e-06, "loss": 0.0327, "step": 3172 }, { "epoch": 5.1260096930533114, "grad_norm": 2.336559330611366, "learning_rate": 2.4134580145793957e-06, "loss": 0.0315, "step": 3173 }, { "epoch": 5.127625201938611, "grad_norm": 2.1101313694636326, "learning_rate": 2.4121858581991354e-06, "loss": 0.0296, "step": 3174 }, { "epoch": 5.129240710823909, "grad_norm": 2.4667832847189457, "learning_rate": 2.41091372458525e-06, "loss": 0.0345, "step": 3175 }, { "epoch": 5.130856219709209, "grad_norm": 2.1206772685429516, "learning_rate": 2.4096416140675476e-06, "loss": 0.0269, "step": 3176 }, { "epoch": 5.132471728594507, "grad_norm": 2.161279650704378, "learning_rate": 2.408369526975831e-06, "loss": 0.0267, "step": 3177 }, { "epoch": 5.1340872374798066, "grad_norm": 1.7673244276006521, "learning_rate": 2.407097463639896e-06, "loss": 0.0238, "step": 3178 }, { "epoch": 5.135702746365105, "grad_norm": 1.7128160494480857, "learning_rate": 2.4058254243895337e-06, "loss": 0.0263, "step": 3179 }, { "epoch": 5.1373182552504035, "grad_norm": 1.7319686504260656, "learning_rate": 2.4045534095545293e-06, "loss": 0.0235, "step": 3180 }, { "epoch": 5.138933764135703, "grad_norm": 1.7299142179087916, "learning_rate": 2.403281419464659e-06, "loss": 0.0256, "step": 3181 }, { "epoch": 5.140549273021001, "grad_norm": 1.74684085205112, "learning_rate": 2.402009454449696e-06, "loss": 0.0284, "step": 3182 }, { "epoch": 5.142164781906301, "grad_norm": 1.8858517630128935, "learning_rate": 2.4007375148394042e-06, "loss": 0.0258, "step": 3183 }, { "epoch": 5.143780290791599, "grad_norm": 2.0006213894872724, "learning_rate": 2.3994656009635425e-06, "loss": 0.0336, "step": 3184 }, { "epoch": 5.145395799676899, "grad_norm": 1.8019256297019712, "learning_rate": 2.3981937131518613e-06, "loss": 0.0279, "step": 3185 }, { "epoch": 5.147011308562197, "grad_norm": 1.8932476226852182, "learning_rate": 2.396921851734108e-06, "loss": 0.031, "step": 3186 }, { "epoch": 5.148626817447496, "grad_norm": 2.5452895066329946, "learning_rate": 2.3956500170400194e-06, "loss": 0.0328, "step": 3187 }, { "epoch": 5.150242326332795, "grad_norm": 1.4847777086746166, "learning_rate": 2.394378209399327e-06, "loss": 0.0292, "step": 3188 }, { "epoch": 5.1518578352180935, "grad_norm": 1.517624543002675, "learning_rate": 2.3931064291417543e-06, "loss": 0.026, "step": 3189 }, { "epoch": 5.153473344103393, "grad_norm": 1.8271022393511174, "learning_rate": 2.3918346765970186e-06, "loss": 0.0254, "step": 3190 }, { "epoch": 5.155088852988691, "grad_norm": 2.5970162954884843, "learning_rate": 2.3905629520948307e-06, "loss": 0.0377, "step": 3191 }, { "epoch": 5.156704361873991, "grad_norm": 1.8534383963996688, "learning_rate": 2.3892912559648932e-06, "loss": 0.0319, "step": 3192 }, { "epoch": 5.158319870759289, "grad_norm": 1.7047884109022002, "learning_rate": 2.388019588536901e-06, "loss": 0.0359, "step": 3193 }, { "epoch": 5.159935379644588, "grad_norm": 1.8134630273223875, "learning_rate": 2.3867479501405414e-06, "loss": 0.0255, "step": 3194 }, { "epoch": 5.161550888529887, "grad_norm": 1.7551745618969832, "learning_rate": 2.3854763411054945e-06, "loss": 0.0283, "step": 3195 }, { "epoch": 5.163166397415186, "grad_norm": 2.313480820747816, "learning_rate": 2.384204761761435e-06, "loss": 0.0354, "step": 3196 }, { "epoch": 5.164781906300485, "grad_norm": 2.2089942150429773, "learning_rate": 2.382933212438027e-06, "loss": 0.0351, "step": 3197 }, { "epoch": 5.166397415185783, "grad_norm": 2.1948867017944425, "learning_rate": 2.3816616934649265e-06, "loss": 0.0317, "step": 3198 }, { "epoch": 5.168012924071083, "grad_norm": 2.2755932764811546, "learning_rate": 2.3803902051717843e-06, "loss": 0.0366, "step": 3199 }, { "epoch": 5.169628432956381, "grad_norm": 2.1675622402729635, "learning_rate": 2.3791187478882413e-06, "loss": 0.0374, "step": 3200 }, { "epoch": 5.169628432956381, "eval_loss": 1.5197125673294067, "eval_runtime": 2.351, "eval_samples_per_second": 127.605, "eval_steps_per_second": 2.977, "step": 3200 }, { "epoch": 5.17124394184168, "grad_norm": 1.7579426651910393, "learning_rate": 2.37784732194393e-06, "loss": 0.026, "step": 3201 }, { "epoch": 5.172859450726979, "grad_norm": 2.0352039035106855, "learning_rate": 2.376575927668477e-06, "loss": 0.0255, "step": 3202 }, { "epoch": 5.174474959612278, "grad_norm": 1.7689121951231124, "learning_rate": 2.3753045653914998e-06, "loss": 0.0277, "step": 3203 }, { "epoch": 5.176090468497577, "grad_norm": 2.4737424986704775, "learning_rate": 2.3740332354426057e-06, "loss": 0.0303, "step": 3204 }, { "epoch": 5.1777059773828755, "grad_norm": 1.9639896749032595, "learning_rate": 2.372761938151396e-06, "loss": 0.0288, "step": 3205 }, { "epoch": 5.179321486268175, "grad_norm": 1.3663448702722583, "learning_rate": 2.3714906738474607e-06, "loss": 0.0211, "step": 3206 }, { "epoch": 5.180936995153473, "grad_norm": 2.1075157584355453, "learning_rate": 2.3702194428603858e-06, "loss": 0.0301, "step": 3207 }, { "epoch": 5.182552504038772, "grad_norm": 1.9636669889683434, "learning_rate": 2.368948245519745e-06, "loss": 0.0259, "step": 3208 }, { "epoch": 5.184168012924071, "grad_norm": 2.211147017403396, "learning_rate": 2.3676770821551035e-06, "loss": 0.0333, "step": 3209 }, { "epoch": 5.18578352180937, "grad_norm": 1.7970210705773715, "learning_rate": 2.36640595309602e-06, "loss": 0.0273, "step": 3210 }, { "epoch": 5.187399030694669, "grad_norm": 1.6182098083477154, "learning_rate": 2.3651348586720415e-06, "loss": 0.0226, "step": 3211 }, { "epoch": 5.189014539579968, "grad_norm": 1.7703348804421264, "learning_rate": 2.3638637992127073e-06, "loss": 0.0238, "step": 3212 }, { "epoch": 5.190630048465267, "grad_norm": 2.0871402453501906, "learning_rate": 2.3625927750475484e-06, "loss": 0.0318, "step": 3213 }, { "epoch": 5.1922455573505655, "grad_norm": 2.1254659675680925, "learning_rate": 2.3613217865060854e-06, "loss": 0.0282, "step": 3214 }, { "epoch": 5.193861066235864, "grad_norm": 2.080477927217574, "learning_rate": 2.360050833917831e-06, "loss": 0.0382, "step": 3215 }, { "epoch": 5.195476575121163, "grad_norm": 1.7133154027917523, "learning_rate": 2.3587799176122867e-06, "loss": 0.0262, "step": 3216 }, { "epoch": 5.197092084006462, "grad_norm": 2.0618232258610654, "learning_rate": 2.357509037918945e-06, "loss": 0.0508, "step": 3217 }, { "epoch": 5.198707592891761, "grad_norm": 2.094553827513378, "learning_rate": 2.3562381951672917e-06, "loss": 0.0319, "step": 3218 }, { "epoch": 5.20032310177706, "grad_norm": 2.345659910403973, "learning_rate": 2.3549673896868e-06, "loss": 0.0347, "step": 3219 }, { "epoch": 5.201938610662358, "grad_norm": 1.9259050743794233, "learning_rate": 2.353696621806933e-06, "loss": 0.0294, "step": 3220 }, { "epoch": 5.203554119547658, "grad_norm": 2.3482895668069346, "learning_rate": 2.352425891857147e-06, "loss": 0.0335, "step": 3221 }, { "epoch": 5.205169628432956, "grad_norm": 1.7787337173230888, "learning_rate": 2.351155200166886e-06, "loss": 0.0293, "step": 3222 }, { "epoch": 5.206785137318255, "grad_norm": 2.044915196699406, "learning_rate": 2.3498845470655837e-06, "loss": 0.0396, "step": 3223 }, { "epoch": 5.208400646203554, "grad_norm": 2.299819786433566, "learning_rate": 2.3486139328826676e-06, "loss": 0.0393, "step": 3224 }, { "epoch": 5.210016155088853, "grad_norm": 2.0268242805375656, "learning_rate": 2.347343357947551e-06, "loss": 0.0273, "step": 3225 }, { "epoch": 5.211631663974152, "grad_norm": 2.549066476467886, "learning_rate": 2.346072822589639e-06, "loss": 0.0367, "step": 3226 }, { "epoch": 5.21324717285945, "grad_norm": 2.132639087862381, "learning_rate": 2.3448023271383256e-06, "loss": 0.0268, "step": 3227 }, { "epoch": 5.21486268174475, "grad_norm": 1.9699313310718156, "learning_rate": 2.3435318719229938e-06, "loss": 0.0235, "step": 3228 }, { "epoch": 5.216478190630048, "grad_norm": 2.2358018660687806, "learning_rate": 2.3422614572730193e-06, "loss": 0.0342, "step": 3229 }, { "epoch": 5.2180936995153475, "grad_norm": 2.351439771173476, "learning_rate": 2.340991083517764e-06, "loss": 0.0382, "step": 3230 }, { "epoch": 5.219709208400646, "grad_norm": 1.52275045175103, "learning_rate": 2.339720750986581e-06, "loss": 0.0204, "step": 3231 }, { "epoch": 5.221324717285945, "grad_norm": 1.72498650344016, "learning_rate": 2.3384504600088113e-06, "loss": 0.0336, "step": 3232 }, { "epoch": 5.222940226171244, "grad_norm": 1.8258886176494018, "learning_rate": 2.337180210913786e-06, "loss": 0.0268, "step": 3233 }, { "epoch": 5.224555735056542, "grad_norm": 2.368445286915191, "learning_rate": 2.3359100040308246e-06, "loss": 0.028, "step": 3234 }, { "epoch": 5.226171243941842, "grad_norm": 1.9551159888882232, "learning_rate": 2.3346398396892386e-06, "loss": 0.0263, "step": 3235 }, { "epoch": 5.22778675282714, "grad_norm": 1.3379426993428913, "learning_rate": 2.3333697182183245e-06, "loss": 0.0168, "step": 3236 }, { "epoch": 5.22940226171244, "grad_norm": 2.3792232056842626, "learning_rate": 2.332099639947369e-06, "loss": 0.0277, "step": 3237 }, { "epoch": 5.231017770597738, "grad_norm": 1.4391319840904238, "learning_rate": 2.330829605205649e-06, "loss": 0.0203, "step": 3238 }, { "epoch": 5.2326332794830375, "grad_norm": 1.7350800292426234, "learning_rate": 2.329559614322428e-06, "loss": 0.0311, "step": 3239 }, { "epoch": 5.234248788368336, "grad_norm": 1.9000080087969529, "learning_rate": 2.3282896676269608e-06, "loss": 0.0294, "step": 3240 }, { "epoch": 5.2358642972536344, "grad_norm": 1.9945781887990564, "learning_rate": 2.3270197654484874e-06, "loss": 0.0255, "step": 3241 }, { "epoch": 5.237479806138934, "grad_norm": 1.8765208430607667, "learning_rate": 2.3257499081162394e-06, "loss": 0.038, "step": 3242 }, { "epoch": 5.239095315024232, "grad_norm": 2.041631958959953, "learning_rate": 2.3244800959594345e-06, "loss": 0.0277, "step": 3243 }, { "epoch": 5.240710823909532, "grad_norm": 2.061983059508691, "learning_rate": 2.3232103293072796e-06, "loss": 0.0277, "step": 3244 }, { "epoch": 5.24232633279483, "grad_norm": 1.7014485844819003, "learning_rate": 2.32194060848897e-06, "loss": 0.0263, "step": 3245 }, { "epoch": 5.24394184168013, "grad_norm": 2.0900398367959947, "learning_rate": 2.3206709338336896e-06, "loss": 0.0244, "step": 3246 }, { "epoch": 5.245557350565428, "grad_norm": 2.1093651871931107, "learning_rate": 2.319401305670609e-06, "loss": 0.029, "step": 3247 }, { "epoch": 5.2471728594507265, "grad_norm": 1.7091999218208684, "learning_rate": 2.318131724328887e-06, "loss": 0.0276, "step": 3248 }, { "epoch": 5.248788368336026, "grad_norm": 1.580796515919428, "learning_rate": 2.316862190137671e-06, "loss": 0.0231, "step": 3249 }, { "epoch": 5.250403877221324, "grad_norm": 2.031835993752323, "learning_rate": 2.3155927034260946e-06, "loss": 0.026, "step": 3250 }, { "epoch": 5.252019386106624, "grad_norm": 2.0204085408099672, "learning_rate": 2.3143232645232823e-06, "loss": 0.0285, "step": 3251 }, { "epoch": 5.253634894991922, "grad_norm": 2.2426600513915482, "learning_rate": 2.3130538737583437e-06, "loss": 0.0332, "step": 3252 }, { "epoch": 5.255250403877222, "grad_norm": 2.034131090231662, "learning_rate": 2.3117845314603763e-06, "loss": 0.032, "step": 3253 }, { "epoch": 5.25686591276252, "grad_norm": 1.7885874725382287, "learning_rate": 2.3105152379584646e-06, "loss": 0.0239, "step": 3254 }, { "epoch": 5.258481421647819, "grad_norm": 1.891447488437099, "learning_rate": 2.30924599358168e-06, "loss": 0.0242, "step": 3255 }, { "epoch": 5.260096930533118, "grad_norm": 1.7136741821341963, "learning_rate": 2.307976798659085e-06, "loss": 0.0284, "step": 3256 }, { "epoch": 5.2617124394184165, "grad_norm": 1.974623300926356, "learning_rate": 2.3067076535197246e-06, "loss": 0.0315, "step": 3257 }, { "epoch": 5.263327948303716, "grad_norm": 1.9733338481451483, "learning_rate": 2.305438558492633e-06, "loss": 0.0296, "step": 3258 }, { "epoch": 5.264943457189014, "grad_norm": 1.8801643605214629, "learning_rate": 2.3041695139068315e-06, "loss": 0.0311, "step": 3259 }, { "epoch": 5.266558966074314, "grad_norm": 1.7481600070461991, "learning_rate": 2.3029005200913273e-06, "loss": 0.0275, "step": 3260 }, { "epoch": 5.268174474959612, "grad_norm": 2.1123362683546842, "learning_rate": 2.3016315773751147e-06, "loss": 0.036, "step": 3261 }, { "epoch": 5.269789983844911, "grad_norm": 1.9719014472317173, "learning_rate": 2.300362686087177e-06, "loss": 0.0286, "step": 3262 }, { "epoch": 5.27140549273021, "grad_norm": 2.2789964593238063, "learning_rate": 2.2990938465564812e-06, "loss": 0.0356, "step": 3263 }, { "epoch": 5.273021001615509, "grad_norm": 1.6609172963126848, "learning_rate": 2.2978250591119826e-06, "loss": 0.0321, "step": 3264 }, { "epoch": 5.274636510500808, "grad_norm": 2.3870678953150475, "learning_rate": 2.2965563240826214e-06, "loss": 0.0388, "step": 3265 }, { "epoch": 5.276252019386106, "grad_norm": 2.021093631101388, "learning_rate": 2.2952876417973246e-06, "loss": 0.0315, "step": 3266 }, { "epoch": 5.277867528271406, "grad_norm": 3.561585447824218, "learning_rate": 2.2940190125850083e-06, "loss": 0.0357, "step": 3267 }, { "epoch": 5.279483037156704, "grad_norm": 2.0373921736514844, "learning_rate": 2.292750436774572e-06, "loss": 0.0331, "step": 3268 }, { "epoch": 5.281098546042003, "grad_norm": 1.7510600672587198, "learning_rate": 2.291481914694901e-06, "loss": 0.0251, "step": 3269 }, { "epoch": 5.282714054927302, "grad_norm": 1.6792133432909595, "learning_rate": 2.290213446674869e-06, "loss": 0.0278, "step": 3270 }, { "epoch": 5.284329563812601, "grad_norm": 1.6066347326807815, "learning_rate": 2.2889450330433334e-06, "loss": 0.0238, "step": 3271 }, { "epoch": 5.2859450726979, "grad_norm": 2.2389219700576932, "learning_rate": 2.287676674129139e-06, "loss": 0.021, "step": 3272 }, { "epoch": 5.2875605815831985, "grad_norm": 2.1204640276953137, "learning_rate": 2.286408370261115e-06, "loss": 0.0525, "step": 3273 }, { "epoch": 5.289176090468498, "grad_norm": 1.9964889359916729, "learning_rate": 2.285140121768079e-06, "loss": 0.0336, "step": 3274 }, { "epoch": 5.290791599353796, "grad_norm": 2.8459943912631416, "learning_rate": 2.2838719289788318e-06, "loss": 0.0297, "step": 3275 }, { "epoch": 5.292407108239095, "grad_norm": 2.0155221665593075, "learning_rate": 2.2826037922221595e-06, "loss": 0.0241, "step": 3276 }, { "epoch": 5.294022617124394, "grad_norm": 2.206393025614205, "learning_rate": 2.2813357118268343e-06, "loss": 0.0292, "step": 3277 }, { "epoch": 5.295638126009693, "grad_norm": 2.0758831142800687, "learning_rate": 2.2800676881216167e-06, "loss": 0.0333, "step": 3278 }, { "epoch": 5.297253634894992, "grad_norm": 1.852503973773811, "learning_rate": 2.2787997214352477e-06, "loss": 0.0273, "step": 3279 }, { "epoch": 5.298869143780291, "grad_norm": 1.8542367830813564, "learning_rate": 2.2775318120964567e-06, "loss": 0.0301, "step": 3280 }, { "epoch": 5.30048465266559, "grad_norm": 2.136416508993798, "learning_rate": 2.276263960433956e-06, "loss": 0.031, "step": 3281 }, { "epoch": 5.3021001615508885, "grad_norm": 2.2043068492627604, "learning_rate": 2.274996166776446e-06, "loss": 0.0267, "step": 3282 }, { "epoch": 5.303715670436187, "grad_norm": 1.708542075590349, "learning_rate": 2.2737284314526077e-06, "loss": 0.023, "step": 3283 }, { "epoch": 5.305331179321486, "grad_norm": 1.7962673564540117, "learning_rate": 2.2724607547911122e-06, "loss": 0.0286, "step": 3284 }, { "epoch": 5.306946688206785, "grad_norm": 1.9666144074474052, "learning_rate": 2.2711931371206115e-06, "loss": 0.0305, "step": 3285 }, { "epoch": 5.308562197092084, "grad_norm": 1.753658126498961, "learning_rate": 2.269925578769744e-06, "loss": 0.0277, "step": 3286 }, { "epoch": 5.310177705977383, "grad_norm": 1.6268980450783055, "learning_rate": 2.268658080067132e-06, "loss": 0.0249, "step": 3287 }, { "epoch": 5.311793214862682, "grad_norm": 1.9868441638809493, "learning_rate": 2.267390641341381e-06, "loss": 0.0359, "step": 3288 }, { "epoch": 5.313408723747981, "grad_norm": 2.08029052421165, "learning_rate": 2.2661232629210853e-06, "loss": 0.038, "step": 3289 }, { "epoch": 5.315024232633279, "grad_norm": 1.6116021817912283, "learning_rate": 2.26485594513482e-06, "loss": 0.0254, "step": 3290 }, { "epoch": 5.316639741518578, "grad_norm": 1.7255093172701905, "learning_rate": 2.2635886883111443e-06, "loss": 0.0241, "step": 3291 }, { "epoch": 5.318255250403877, "grad_norm": 1.8246131692705791, "learning_rate": 2.2623214927786037e-06, "loss": 0.0231, "step": 3292 }, { "epoch": 5.319870759289176, "grad_norm": 1.6793304067736032, "learning_rate": 2.2610543588657253e-06, "loss": 0.029, "step": 3293 }, { "epoch": 5.321486268174475, "grad_norm": 2.6745629595693687, "learning_rate": 2.259787286901022e-06, "loss": 0.0414, "step": 3294 }, { "epoch": 5.323101777059774, "grad_norm": 1.807046430496722, "learning_rate": 2.2585202772129918e-06, "loss": 0.0292, "step": 3295 }, { "epoch": 5.324717285945073, "grad_norm": 1.7500495382833994, "learning_rate": 2.257253330130114e-06, "loss": 0.0304, "step": 3296 }, { "epoch": 5.326332794830371, "grad_norm": 1.8373257244087984, "learning_rate": 2.255986445980852e-06, "loss": 0.0335, "step": 3297 }, { "epoch": 5.3279483037156705, "grad_norm": 2.180956040209482, "learning_rate": 2.2547196250936544e-06, "loss": 0.0323, "step": 3298 }, { "epoch": 5.329563812600969, "grad_norm": 1.8848681084074774, "learning_rate": 2.253452867796952e-06, "loss": 0.0336, "step": 3299 }, { "epoch": 5.331179321486268, "grad_norm": 1.8838609380411762, "learning_rate": 2.2521861744191604e-06, "loss": 0.043, "step": 3300 }, { "epoch": 5.332794830371567, "grad_norm": 2.0339919063269765, "learning_rate": 2.250919545288677e-06, "loss": 0.0281, "step": 3301 }, { "epoch": 5.334410339256866, "grad_norm": 2.1761032289173805, "learning_rate": 2.2496529807338845e-06, "loss": 0.0455, "step": 3302 }, { "epoch": 5.336025848142165, "grad_norm": 1.9102290935864956, "learning_rate": 2.2483864810831476e-06, "loss": 0.0301, "step": 3303 }, { "epoch": 5.337641357027463, "grad_norm": 2.771486539539188, "learning_rate": 2.2471200466648134e-06, "loss": 0.0479, "step": 3304 }, { "epoch": 5.339256865912763, "grad_norm": 1.9213138522925388, "learning_rate": 2.2458536778072135e-06, "loss": 0.0326, "step": 3305 }, { "epoch": 5.340872374798061, "grad_norm": 1.8206129469331218, "learning_rate": 2.244587374838663e-06, "loss": 0.0272, "step": 3306 }, { "epoch": 5.3424878836833605, "grad_norm": 1.536913874833999, "learning_rate": 2.2433211380874583e-06, "loss": 0.0261, "step": 3307 }, { "epoch": 5.344103392568659, "grad_norm": 2.156351968981839, "learning_rate": 2.242054967881879e-06, "loss": 0.0274, "step": 3308 }, { "epoch": 5.345718901453958, "grad_norm": 1.690417298682305, "learning_rate": 2.2407888645501886e-06, "loss": 0.0262, "step": 3309 }, { "epoch": 5.347334410339257, "grad_norm": 2.3514762769607884, "learning_rate": 2.2395228284206303e-06, "loss": 0.0337, "step": 3310 }, { "epoch": 5.348949919224555, "grad_norm": 2.0829272784881248, "learning_rate": 2.238256859821435e-06, "loss": 0.027, "step": 3311 }, { "epoch": 5.350565428109855, "grad_norm": 1.9578688751165079, "learning_rate": 2.236990959080812e-06, "loss": 0.0271, "step": 3312 }, { "epoch": 5.352180936995153, "grad_norm": 1.9445143654721462, "learning_rate": 2.235725126526953e-06, "loss": 0.0277, "step": 3313 }, { "epoch": 5.353796445880453, "grad_norm": 2.2449525039852625, "learning_rate": 2.2344593624880347e-06, "loss": 0.0289, "step": 3314 }, { "epoch": 5.355411954765751, "grad_norm": 2.159991421692605, "learning_rate": 2.233193667292212e-06, "loss": 0.0327, "step": 3315 }, { "epoch": 5.35702746365105, "grad_norm": 1.5937283396720532, "learning_rate": 2.2319280412676283e-06, "loss": 0.0273, "step": 3316 }, { "epoch": 5.358642972536349, "grad_norm": 1.9349802783717565, "learning_rate": 2.230662484742402e-06, "loss": 0.0264, "step": 3317 }, { "epoch": 5.360258481421647, "grad_norm": 1.698031440514735, "learning_rate": 2.229396998044638e-06, "loss": 0.0285, "step": 3318 }, { "epoch": 5.361873990306947, "grad_norm": 2.1063469434800757, "learning_rate": 2.228131581502422e-06, "loss": 0.0317, "step": 3319 }, { "epoch": 5.363489499192245, "grad_norm": 2.268523115897623, "learning_rate": 2.22686623544382e-06, "loss": 0.0317, "step": 3320 }, { "epoch": 5.365105008077545, "grad_norm": 1.6983565544690924, "learning_rate": 2.225600960196881e-06, "loss": 0.0285, "step": 3321 }, { "epoch": 5.366720516962843, "grad_norm": 1.9156200365987102, "learning_rate": 2.2243357560896374e-06, "loss": 0.0232, "step": 3322 }, { "epoch": 5.3683360258481425, "grad_norm": 1.6858406869405331, "learning_rate": 2.223070623450101e-06, "loss": 0.026, "step": 3323 }, { "epoch": 5.369951534733441, "grad_norm": 1.7768496701805248, "learning_rate": 2.221805562606264e-06, "loss": 0.0251, "step": 3324 }, { "epoch": 5.3715670436187395, "grad_norm": 2.037785730628828, "learning_rate": 2.220540573886102e-06, "loss": 0.0366, "step": 3325 }, { "epoch": 5.373182552504039, "grad_norm": 1.796136587777519, "learning_rate": 2.2192756576175723e-06, "loss": 0.027, "step": 3326 }, { "epoch": 5.374798061389337, "grad_norm": 1.4102249271157865, "learning_rate": 2.218010814128612e-06, "loss": 0.0203, "step": 3327 }, { "epoch": 5.376413570274637, "grad_norm": 2.2250359460811295, "learning_rate": 2.2167460437471395e-06, "loss": 0.0323, "step": 3328 }, { "epoch": 5.378029079159935, "grad_norm": 1.662402804901642, "learning_rate": 2.2154813468010544e-06, "loss": 0.0252, "step": 3329 }, { "epoch": 5.379644588045235, "grad_norm": 2.101820457371282, "learning_rate": 2.2142167236182385e-06, "loss": 0.0297, "step": 3330 }, { "epoch": 5.381260096930533, "grad_norm": 1.9118593072027559, "learning_rate": 2.2129521745265527e-06, "loss": 0.022, "step": 3331 }, { "epoch": 5.382875605815832, "grad_norm": 1.7788243063837432, "learning_rate": 2.2116876998538383e-06, "loss": 0.027, "step": 3332 }, { "epoch": 5.384491114701131, "grad_norm": 1.5976568182292943, "learning_rate": 2.21042329992792e-06, "loss": 0.0249, "step": 3333 }, { "epoch": 5.386106623586429, "grad_norm": 1.8878979754874252, "learning_rate": 2.2091589750766013e-06, "loss": 0.0281, "step": 3334 }, { "epoch": 5.387722132471729, "grad_norm": 2.3455159588386505, "learning_rate": 2.2078947256276657e-06, "loss": 0.03, "step": 3335 }, { "epoch": 5.389337641357027, "grad_norm": 2.144681611171738, "learning_rate": 2.206630551908879e-06, "loss": 0.0225, "step": 3336 }, { "epoch": 5.390953150242327, "grad_norm": 2.0969645110619, "learning_rate": 2.2053664542479843e-06, "loss": 0.0241, "step": 3337 }, { "epoch": 5.392568659127625, "grad_norm": 2.0524682918296264, "learning_rate": 2.204102432972709e-06, "loss": 0.0323, "step": 3338 }, { "epoch": 5.394184168012924, "grad_norm": 1.9933608012435409, "learning_rate": 2.2028384884107582e-06, "loss": 0.0308, "step": 3339 }, { "epoch": 5.395799676898223, "grad_norm": 2.0169967660600854, "learning_rate": 2.201574620889817e-06, "loss": 0.0345, "step": 3340 }, { "epoch": 5.3974151857835215, "grad_norm": 1.8191390541640757, "learning_rate": 2.2003108307375514e-06, "loss": 0.0296, "step": 3341 }, { "epoch": 5.399030694668821, "grad_norm": 1.943488093446824, "learning_rate": 2.199047118281607e-06, "loss": 0.0282, "step": 3342 }, { "epoch": 5.400646203554119, "grad_norm": 2.062220795502354, "learning_rate": 2.197783483849608e-06, "loss": 0.0256, "step": 3343 }, { "epoch": 5.402261712439419, "grad_norm": 1.815509767336659, "learning_rate": 2.1965199277691623e-06, "loss": 0.0326, "step": 3344 }, { "epoch": 5.403877221324717, "grad_norm": 1.7220151459922863, "learning_rate": 2.195256450367853e-06, "loss": 0.031, "step": 3345 }, { "epoch": 5.405492730210016, "grad_norm": 1.9725650748009844, "learning_rate": 2.1939930519732455e-06, "loss": 0.0375, "step": 3346 }, { "epoch": 5.407108239095315, "grad_norm": 1.928590928030234, "learning_rate": 2.1927297329128835e-06, "loss": 0.034, "step": 3347 }, { "epoch": 5.408723747980614, "grad_norm": 1.9900056097180165, "learning_rate": 2.1914664935142888e-06, "loss": 0.0261, "step": 3348 }, { "epoch": 5.410339256865913, "grad_norm": 2.275972142968288, "learning_rate": 2.190203334104967e-06, "loss": 0.0308, "step": 3349 }, { "epoch": 5.4119547657512115, "grad_norm": 1.8397694469645587, "learning_rate": 2.1889402550123992e-06, "loss": 0.0251, "step": 3350 }, { "epoch": 5.413570274636511, "grad_norm": 1.7743855761552851, "learning_rate": 2.1876772565640468e-06, "loss": 0.0308, "step": 3351 }, { "epoch": 5.415185783521809, "grad_norm": 2.4327715689448772, "learning_rate": 2.18641433908735e-06, "loss": 0.0259, "step": 3352 }, { "epoch": 5.416801292407108, "grad_norm": 1.8839341761374537, "learning_rate": 2.1851515029097264e-06, "loss": 0.0263, "step": 3353 }, { "epoch": 5.418416801292407, "grad_norm": 1.7915122471186922, "learning_rate": 2.1838887483585776e-06, "loss": 0.0239, "step": 3354 }, { "epoch": 5.420032310177706, "grad_norm": 1.8451997915519103, "learning_rate": 2.1826260757612784e-06, "loss": 0.0266, "step": 3355 }, { "epoch": 5.421647819063005, "grad_norm": 1.916107968874382, "learning_rate": 2.181363485445186e-06, "loss": 0.0289, "step": 3356 }, { "epoch": 5.423263327948304, "grad_norm": 2.217115309820222, "learning_rate": 2.1801009777376337e-06, "loss": 0.0359, "step": 3357 }, { "epoch": 5.424878836833603, "grad_norm": 1.7606136579475071, "learning_rate": 2.1788385529659363e-06, "loss": 0.0298, "step": 3358 }, { "epoch": 5.426494345718901, "grad_norm": 2.1031455192436304, "learning_rate": 2.1775762114573837e-06, "loss": 0.0311, "step": 3359 }, { "epoch": 5.4281098546042, "grad_norm": 1.429220465275634, "learning_rate": 2.176313953539247e-06, "loss": 0.0214, "step": 3360 }, { "epoch": 5.429725363489499, "grad_norm": 1.8905058956863112, "learning_rate": 2.175051779538774e-06, "loss": 0.0292, "step": 3361 }, { "epoch": 5.431340872374798, "grad_norm": 1.5839116972471723, "learning_rate": 2.173789689783192e-06, "loss": 0.0272, "step": 3362 }, { "epoch": 5.432956381260097, "grad_norm": 3.010290281824566, "learning_rate": 2.1725276845997057e-06, "loss": 0.037, "step": 3363 }, { "epoch": 5.434571890145396, "grad_norm": 2.4431716702507047, "learning_rate": 2.171265764315497e-06, "loss": 0.0295, "step": 3364 }, { "epoch": 5.436187399030695, "grad_norm": 2.4078104374612765, "learning_rate": 2.1700039292577273e-06, "loss": 0.0337, "step": 3365 }, { "epoch": 5.4378029079159935, "grad_norm": 2.3677391531945027, "learning_rate": 2.1687421797535365e-06, "loss": 0.0281, "step": 3366 }, { "epoch": 5.439418416801292, "grad_norm": 1.7982612896489751, "learning_rate": 2.16748051613004e-06, "loss": 0.0288, "step": 3367 }, { "epoch": 5.441033925686591, "grad_norm": 2.327226868085504, "learning_rate": 2.166218938714332e-06, "loss": 0.0327, "step": 3368 }, { "epoch": 5.44264943457189, "grad_norm": 2.0683879949058244, "learning_rate": 2.164957447833485e-06, "loss": 0.0315, "step": 3369 }, { "epoch": 5.444264943457189, "grad_norm": 2.181092310842797, "learning_rate": 2.163696043814547e-06, "loss": 0.0404, "step": 3370 }, { "epoch": 5.445880452342488, "grad_norm": 1.6142164371117307, "learning_rate": 2.1624347269845473e-06, "loss": 0.0254, "step": 3371 }, { "epoch": 5.447495961227787, "grad_norm": 2.1606783908948297, "learning_rate": 2.1611734976704897e-06, "loss": 0.0345, "step": 3372 }, { "epoch": 5.449111470113086, "grad_norm": 2.0258663794134137, "learning_rate": 2.1599123561993553e-06, "loss": 0.0277, "step": 3373 }, { "epoch": 5.450726978998384, "grad_norm": 2.104934599161156, "learning_rate": 2.158651302898103e-06, "loss": 0.0373, "step": 3374 }, { "epoch": 5.4523424878836835, "grad_norm": 2.372334632145907, "learning_rate": 2.1573903380936685e-06, "loss": 0.0272, "step": 3375 }, { "epoch": 5.453957996768982, "grad_norm": 1.5684223696323583, "learning_rate": 2.156129462112966e-06, "loss": 0.0234, "step": 3376 }, { "epoch": 5.455573505654281, "grad_norm": 1.7653992667155818, "learning_rate": 2.1548686752828856e-06, "loss": 0.0343, "step": 3377 }, { "epoch": 5.45718901453958, "grad_norm": 2.4226400398527654, "learning_rate": 2.1536079779302942e-06, "loss": 0.0444, "step": 3378 }, { "epoch": 5.458804523424879, "grad_norm": 1.4329377230437124, "learning_rate": 2.1523473703820345e-06, "loss": 0.0207, "step": 3379 }, { "epoch": 5.460420032310178, "grad_norm": 1.8934752248883657, "learning_rate": 2.1510868529649283e-06, "loss": 0.0327, "step": 3380 }, { "epoch": 5.462035541195476, "grad_norm": 2.286622275492239, "learning_rate": 2.149826426005771e-06, "loss": 0.0291, "step": 3381 }, { "epoch": 5.463651050080776, "grad_norm": 1.4693913291155398, "learning_rate": 2.148566089831338e-06, "loss": 0.0213, "step": 3382 }, { "epoch": 5.465266558966074, "grad_norm": 1.8980445717518895, "learning_rate": 2.1473058447683798e-06, "loss": 0.0348, "step": 3383 }, { "epoch": 5.466882067851373, "grad_norm": 1.5554939244600974, "learning_rate": 2.1460456911436218e-06, "loss": 0.0214, "step": 3384 }, { "epoch": 5.468497576736672, "grad_norm": 1.6914857848858793, "learning_rate": 2.1447856292837667e-06, "loss": 0.0288, "step": 3385 }, { "epoch": 5.470113085621971, "grad_norm": 2.1195878387062073, "learning_rate": 2.143525659515494e-06, "loss": 0.0356, "step": 3386 }, { "epoch": 5.47172859450727, "grad_norm": 1.9384970303290523, "learning_rate": 2.142265782165459e-06, "loss": 0.0293, "step": 3387 }, { "epoch": 5.473344103392568, "grad_norm": 1.6231714848506464, "learning_rate": 2.141005997560293e-06, "loss": 0.0263, "step": 3388 }, { "epoch": 5.474959612277868, "grad_norm": 1.7017398147804184, "learning_rate": 2.1397463060266024e-06, "loss": 0.0285, "step": 3389 }, { "epoch": 5.476575121163166, "grad_norm": 1.681040020227336, "learning_rate": 2.1384867078909716e-06, "loss": 0.0283, "step": 3390 }, { "epoch": 5.4781906300484655, "grad_norm": 2.6115356480326892, "learning_rate": 2.1372272034799582e-06, "loss": 0.0639, "step": 3391 }, { "epoch": 5.479806138933764, "grad_norm": 1.6630419638615281, "learning_rate": 2.135967793120097e-06, "loss": 0.026, "step": 3392 }, { "epoch": 5.481421647819063, "grad_norm": 1.732396899032309, "learning_rate": 2.1347084771378976e-06, "loss": 0.0307, "step": 3393 }, { "epoch": 5.483037156704362, "grad_norm": 2.205345935520661, "learning_rate": 2.133449255859847e-06, "loss": 0.0317, "step": 3394 }, { "epoch": 5.48465266558966, "grad_norm": 1.9829946853027156, "learning_rate": 2.132190129612406e-06, "loss": 0.0282, "step": 3395 }, { "epoch": 5.48626817447496, "grad_norm": 1.8869497987088006, "learning_rate": 2.1309310987220106e-06, "loss": 0.026, "step": 3396 }, { "epoch": 5.487883683360258, "grad_norm": 1.78188215282455, "learning_rate": 2.1296721635150715e-06, "loss": 0.0243, "step": 3397 }, { "epoch": 5.489499192245558, "grad_norm": 2.232120361256229, "learning_rate": 2.1284133243179777e-06, "loss": 0.0322, "step": 3398 }, { "epoch": 5.491114701130856, "grad_norm": 2.403351762194644, "learning_rate": 2.1271545814570904e-06, "loss": 0.0416, "step": 3399 }, { "epoch": 5.4927302100161555, "grad_norm": 2.134164414265166, "learning_rate": 2.125895935258747e-06, "loss": 0.0304, "step": 3400 }, { "epoch": 5.4927302100161555, "eval_loss": 1.5526303052902222, "eval_runtime": 2.3478, "eval_samples_per_second": 127.778, "eval_steps_per_second": 2.981, "step": 3400 }, { "epoch": 5.494345718901454, "grad_norm": 3.1425407126817886, "learning_rate": 2.1246373860492586e-06, "loss": 0.0432, "step": 3401 }, { "epoch": 5.4959612277867524, "grad_norm": 3.628990818638682, "learning_rate": 2.1233789341549133e-06, "loss": 0.0533, "step": 3402 }, { "epoch": 5.497576736672052, "grad_norm": 1.9960847096560037, "learning_rate": 2.1221205799019704e-06, "loss": 0.0345, "step": 3403 }, { "epoch": 5.49919224555735, "grad_norm": 2.05354605599221, "learning_rate": 2.1208623236166693e-06, "loss": 0.0302, "step": 3404 }, { "epoch": 5.50080775444265, "grad_norm": 1.970070424746878, "learning_rate": 2.1196041656252198e-06, "loss": 0.023, "step": 3405 }, { "epoch": 5.502423263327948, "grad_norm": 2.99387909453093, "learning_rate": 2.118346106253807e-06, "loss": 0.0508, "step": 3406 }, { "epoch": 5.5040387722132476, "grad_norm": 2.0057434478441394, "learning_rate": 2.1170881458285906e-06, "loss": 0.0349, "step": 3407 }, { "epoch": 5.505654281098546, "grad_norm": 2.2994882700291046, "learning_rate": 2.1158302846757043e-06, "loss": 0.0353, "step": 3408 }, { "epoch": 5.5072697899838445, "grad_norm": 2.5070589928801112, "learning_rate": 2.1145725231212587e-06, "loss": 0.0362, "step": 3409 }, { "epoch": 5.508885298869144, "grad_norm": 2.367802189522088, "learning_rate": 2.113314861491335e-06, "loss": 0.0294, "step": 3410 }, { "epoch": 5.510500807754442, "grad_norm": 1.4987808839957921, "learning_rate": 2.1120573001119902e-06, "loss": 0.0252, "step": 3411 }, { "epoch": 5.512116316639742, "grad_norm": 2.1331401772033702, "learning_rate": 2.1107998393092557e-06, "loss": 0.0313, "step": 3412 }, { "epoch": 5.51373182552504, "grad_norm": 2.958983855827825, "learning_rate": 2.1095424794091333e-06, "loss": 0.0225, "step": 3413 }, { "epoch": 5.51534733441034, "grad_norm": 1.4537535652975382, "learning_rate": 2.108285220737606e-06, "loss": 0.023, "step": 3414 }, { "epoch": 5.516962843295638, "grad_norm": 2.119388322750311, "learning_rate": 2.107028063620623e-06, "loss": 0.0305, "step": 3415 }, { "epoch": 5.518578352180937, "grad_norm": 1.8239095167378185, "learning_rate": 2.1057710083841113e-06, "loss": 0.0308, "step": 3416 }, { "epoch": 5.520193861066236, "grad_norm": 1.8588937564855024, "learning_rate": 2.10451405535397e-06, "loss": 0.0274, "step": 3417 }, { "epoch": 5.5218093699515345, "grad_norm": 1.9996756775634543, "learning_rate": 2.1032572048560734e-06, "loss": 0.0299, "step": 3418 }, { "epoch": 5.523424878836834, "grad_norm": 1.6489661014910346, "learning_rate": 2.102000457216266e-06, "loss": 0.0229, "step": 3419 }, { "epoch": 5.525040387722132, "grad_norm": 1.7939773648477162, "learning_rate": 2.10074381276037e-06, "loss": 0.0281, "step": 3420 }, { "epoch": 5.526655896607432, "grad_norm": 1.960893450944089, "learning_rate": 2.0994872718141758e-06, "loss": 0.0335, "step": 3421 }, { "epoch": 5.52827140549273, "grad_norm": 1.7624042908245785, "learning_rate": 2.0982308347034515e-06, "loss": 0.0287, "step": 3422 }, { "epoch": 5.529886914378029, "grad_norm": 2.2985452592908504, "learning_rate": 2.096974501753936e-06, "loss": 0.0271, "step": 3423 }, { "epoch": 5.531502423263328, "grad_norm": 2.3125183233505706, "learning_rate": 2.095718273291341e-06, "loss": 0.0433, "step": 3424 }, { "epoch": 5.533117932148627, "grad_norm": 1.8960382701542902, "learning_rate": 2.0944621496413524e-06, "loss": 0.034, "step": 3425 }, { "epoch": 5.534733441033926, "grad_norm": 1.5795259820830385, "learning_rate": 2.0932061311296283e-06, "loss": 0.0243, "step": 3426 }, { "epoch": 5.536348949919224, "grad_norm": 1.6694073094486395, "learning_rate": 2.091950218081799e-06, "loss": 0.0267, "step": 3427 }, { "epoch": 5.537964458804524, "grad_norm": 1.6935219978726836, "learning_rate": 2.0906944108234685e-06, "loss": 0.0265, "step": 3428 }, { "epoch": 5.539579967689822, "grad_norm": 1.9263362163243176, "learning_rate": 2.089438709680212e-06, "loss": 0.031, "step": 3429 }, { "epoch": 5.541195476575121, "grad_norm": 1.9476777738714859, "learning_rate": 2.0881831149775774e-06, "loss": 0.0293, "step": 3430 }, { "epoch": 5.54281098546042, "grad_norm": 1.6739024442340777, "learning_rate": 2.086927627041088e-06, "loss": 0.0196, "step": 3431 }, { "epoch": 5.544426494345719, "grad_norm": 2.3462949415186243, "learning_rate": 2.0856722461962356e-06, "loss": 0.0382, "step": 3432 }, { "epoch": 5.546042003231018, "grad_norm": 2.2853133605589395, "learning_rate": 2.0844169727684855e-06, "loss": 0.0357, "step": 3433 }, { "epoch": 5.5476575121163165, "grad_norm": 2.232718027820107, "learning_rate": 2.083161807083276e-06, "loss": 0.0304, "step": 3434 }, { "epoch": 5.549273021001616, "grad_norm": 1.5123958659722443, "learning_rate": 2.081906749466015e-06, "loss": 0.0253, "step": 3435 }, { "epoch": 5.550888529886914, "grad_norm": 1.6701390526059312, "learning_rate": 2.080651800242086e-06, "loss": 0.0226, "step": 3436 }, { "epoch": 5.552504038772213, "grad_norm": 2.3017347699487325, "learning_rate": 2.079396959736843e-06, "loss": 0.032, "step": 3437 }, { "epoch": 5.554119547657512, "grad_norm": 2.4565675849613036, "learning_rate": 2.0781422282756096e-06, "loss": 0.0298, "step": 3438 }, { "epoch": 5.555735056542811, "grad_norm": 1.9809868929890702, "learning_rate": 2.076887606183684e-06, "loss": 0.0211, "step": 3439 }, { "epoch": 5.55735056542811, "grad_norm": 2.509608122265135, "learning_rate": 2.0756330937863347e-06, "loss": 0.0349, "step": 3440 }, { "epoch": 5.558966074313409, "grad_norm": 1.6222166026811569, "learning_rate": 2.074378691408801e-06, "loss": 0.0262, "step": 3441 }, { "epoch": 5.560581583198708, "grad_norm": 1.689033564130909, "learning_rate": 2.0731243993762963e-06, "loss": 0.0277, "step": 3442 }, { "epoch": 5.5621970920840065, "grad_norm": 2.285347691775434, "learning_rate": 2.0718702180140033e-06, "loss": 0.0378, "step": 3443 }, { "epoch": 5.563812600969305, "grad_norm": 2.189278154756173, "learning_rate": 2.0706161476470764e-06, "loss": 0.0271, "step": 3444 }, { "epoch": 5.565428109854604, "grad_norm": 1.6804596456411491, "learning_rate": 2.069362188600641e-06, "loss": 0.0262, "step": 3445 }, { "epoch": 5.567043618739903, "grad_norm": 1.8201839190597338, "learning_rate": 2.0681083411997937e-06, "loss": 0.0221, "step": 3446 }, { "epoch": 5.568659127625202, "grad_norm": 1.6776112725420944, "learning_rate": 2.0668546057696036e-06, "loss": 0.0242, "step": 3447 }, { "epoch": 5.570274636510501, "grad_norm": 2.0685438787350297, "learning_rate": 2.065600982635109e-06, "loss": 0.0283, "step": 3448 }, { "epoch": 5.5718901453958, "grad_norm": 1.8656603513485748, "learning_rate": 2.064347472121319e-06, "loss": 0.0325, "step": 3449 }, { "epoch": 5.573505654281099, "grad_norm": 1.643653482409442, "learning_rate": 2.063094074553215e-06, "loss": 0.0219, "step": 3450 }, { "epoch": 5.575121163166397, "grad_norm": 1.2953603439007852, "learning_rate": 2.061840790255748e-06, "loss": 0.0252, "step": 3451 }, { "epoch": 5.576736672051696, "grad_norm": 2.1222987837459324, "learning_rate": 2.060587619553839e-06, "loss": 0.0345, "step": 3452 }, { "epoch": 5.578352180936995, "grad_norm": 1.643530948961796, "learning_rate": 2.059334562772382e-06, "loss": 0.0296, "step": 3453 }, { "epoch": 5.579967689822294, "grad_norm": 2.24821390202907, "learning_rate": 2.0580816202362393e-06, "loss": 0.0276, "step": 3454 }, { "epoch": 5.581583198707593, "grad_norm": 1.925745397905035, "learning_rate": 2.0568287922702444e-06, "loss": 0.0304, "step": 3455 }, { "epoch": 5.583198707592892, "grad_norm": 1.7768664731370858, "learning_rate": 2.055576079199201e-06, "loss": 0.0313, "step": 3456 }, { "epoch": 5.584814216478191, "grad_norm": 1.4605368906373053, "learning_rate": 2.054323481347881e-06, "loss": 0.0195, "step": 3457 }, { "epoch": 5.586429725363489, "grad_norm": 1.840752692834045, "learning_rate": 2.0530709990410314e-06, "loss": 0.0276, "step": 3458 }, { "epoch": 5.5880452342487885, "grad_norm": 1.943587117174614, "learning_rate": 2.0518186326033647e-06, "loss": 0.0246, "step": 3459 }, { "epoch": 5.589660743134087, "grad_norm": 1.7791202636163488, "learning_rate": 2.0505663823595648e-06, "loss": 0.0326, "step": 3460 }, { "epoch": 5.591276252019386, "grad_norm": 2.053951039581535, "learning_rate": 2.049314248634286e-06, "loss": 0.0301, "step": 3461 }, { "epoch": 5.592891760904685, "grad_norm": 2.2183984259840654, "learning_rate": 2.048062231752151e-06, "loss": 0.0318, "step": 3462 }, { "epoch": 5.594507269789984, "grad_norm": 1.8336375878293776, "learning_rate": 2.046810332037753e-06, "loss": 0.0242, "step": 3463 }, { "epoch": 5.596122778675283, "grad_norm": 1.8551029066788916, "learning_rate": 2.045558549815657e-06, "loss": 0.0308, "step": 3464 }, { "epoch": 5.597738287560581, "grad_norm": 1.8733012587732547, "learning_rate": 2.0443068854103938e-06, "loss": 0.0288, "step": 3465 }, { "epoch": 5.599353796445881, "grad_norm": 1.8576003622196153, "learning_rate": 2.043055339146466e-06, "loss": 0.0289, "step": 3466 }, { "epoch": 5.600969305331179, "grad_norm": 1.6605022944601588, "learning_rate": 2.041803911348344e-06, "loss": 0.0284, "step": 3467 }, { "epoch": 5.6025848142164785, "grad_norm": 2.0895545057285503, "learning_rate": 2.0405526023404685e-06, "loss": 0.0322, "step": 3468 }, { "epoch": 5.604200323101777, "grad_norm": 2.0650415860433076, "learning_rate": 2.039301412447251e-06, "loss": 0.0368, "step": 3469 }, { "epoch": 5.605815831987076, "grad_norm": 1.9443751960142734, "learning_rate": 2.038050341993069e-06, "loss": 0.0286, "step": 3470 }, { "epoch": 5.607431340872375, "grad_norm": 2.079361670701722, "learning_rate": 2.0367993913022706e-06, "loss": 0.0427, "step": 3471 }, { "epoch": 5.609046849757673, "grad_norm": 2.710016935640921, "learning_rate": 2.035548560699173e-06, "loss": 0.0536, "step": 3472 }, { "epoch": 5.610662358642973, "grad_norm": 1.7508663006231926, "learning_rate": 2.0342978505080607e-06, "loss": 0.0263, "step": 3473 }, { "epoch": 5.612277867528271, "grad_norm": 2.261491167289444, "learning_rate": 2.0330472610531908e-06, "loss": 0.0336, "step": 3474 }, { "epoch": 5.613893376413571, "grad_norm": 2.658306702325697, "learning_rate": 2.031796792658785e-06, "loss": 0.0413, "step": 3475 }, { "epoch": 5.615508885298869, "grad_norm": 2.0058641383876594, "learning_rate": 2.030546445649035e-06, "loss": 0.0332, "step": 3476 }, { "epoch": 5.617124394184168, "grad_norm": 2.261770399601535, "learning_rate": 2.0292962203481014e-06, "loss": 0.0312, "step": 3477 }, { "epoch": 5.618739903069467, "grad_norm": 1.9550889640404874, "learning_rate": 2.0280461170801134e-06, "loss": 0.0267, "step": 3478 }, { "epoch": 5.620355411954765, "grad_norm": 1.8442261055162772, "learning_rate": 2.026796136169168e-06, "loss": 0.0323, "step": 3479 }, { "epoch": 5.621970920840065, "grad_norm": 1.7048819498027612, "learning_rate": 2.0255462779393307e-06, "loss": 0.0321, "step": 3480 }, { "epoch": 5.623586429725363, "grad_norm": 1.8872516772070842, "learning_rate": 2.024296542714635e-06, "loss": 0.0341, "step": 3481 }, { "epoch": 5.625201938610663, "grad_norm": 1.9271052929687995, "learning_rate": 2.0230469308190835e-06, "loss": 0.0329, "step": 3482 }, { "epoch": 5.626817447495961, "grad_norm": 1.529621557257482, "learning_rate": 2.0217974425766453e-06, "loss": 0.0216, "step": 3483 }, { "epoch": 5.6284329563812605, "grad_norm": 1.830300961067277, "learning_rate": 2.0205480783112576e-06, "loss": 0.0265, "step": 3484 }, { "epoch": 5.630048465266559, "grad_norm": 2.2989533136506086, "learning_rate": 2.0192988383468266e-06, "loss": 0.0331, "step": 3485 }, { "epoch": 5.6316639741518575, "grad_norm": 1.767604840845287, "learning_rate": 2.0180497230072265e-06, "loss": 0.0347, "step": 3486 }, { "epoch": 5.633279483037157, "grad_norm": 1.8480940878790686, "learning_rate": 2.0168007326162974e-06, "loss": 0.0302, "step": 3487 }, { "epoch": 5.634894991922455, "grad_norm": 1.762080345905609, "learning_rate": 2.0155518674978485e-06, "loss": 0.031, "step": 3488 }, { "epoch": 5.636510500807755, "grad_norm": 1.4354862762578593, "learning_rate": 2.014303127975656e-06, "loss": 0.0214, "step": 3489 }, { "epoch": 5.638126009693053, "grad_norm": 2.1450832315006503, "learning_rate": 2.013054514373462e-06, "loss": 0.0473, "step": 3490 }, { "epoch": 5.639741518578353, "grad_norm": 2.153503742120066, "learning_rate": 2.01180602701498e-06, "loss": 0.0291, "step": 3491 }, { "epoch": 5.641357027463651, "grad_norm": 2.252253296429072, "learning_rate": 2.0105576662238876e-06, "loss": 0.0372, "step": 3492 }, { "epoch": 5.64297253634895, "grad_norm": 2.081354774954897, "learning_rate": 2.009309432323829e-06, "loss": 0.0236, "step": 3493 }, { "epoch": 5.644588045234249, "grad_norm": 2.1447806629707173, "learning_rate": 2.008061325638418e-06, "loss": 0.0286, "step": 3494 }, { "epoch": 5.646203554119547, "grad_norm": 1.8252059530869893, "learning_rate": 2.0068133464912324e-06, "loss": 0.0296, "step": 3495 }, { "epoch": 5.647819063004847, "grad_norm": 1.8158054042253204, "learning_rate": 2.0055654952058216e-06, "loss": 0.0294, "step": 3496 }, { "epoch": 5.649434571890145, "grad_norm": 2.5964509170192005, "learning_rate": 2.004317772105697e-06, "loss": 0.0261, "step": 3497 }, { "epoch": 5.651050080775445, "grad_norm": 1.947917311460898, "learning_rate": 2.0030701775143396e-06, "loss": 0.0276, "step": 3498 }, { "epoch": 5.652665589660743, "grad_norm": 1.4331971834010386, "learning_rate": 2.001822711755196e-06, "loss": 0.0191, "step": 3499 }, { "epoch": 5.654281098546042, "grad_norm": 1.815612647390174, "learning_rate": 2.0005753751516787e-06, "loss": 0.026, "step": 3500 }, { "epoch": 5.655896607431341, "grad_norm": 1.8921699784145207, "learning_rate": 1.999328168027168e-06, "loss": 0.0225, "step": 3501 }, { "epoch": 5.6575121163166395, "grad_norm": 2.7631245781948177, "learning_rate": 1.998081090705011e-06, "loss": 0.0351, "step": 3502 }, { "epoch": 5.659127625201939, "grad_norm": 1.965392320198182, "learning_rate": 1.9968341435085204e-06, "loss": 0.0284, "step": 3503 }, { "epoch": 5.660743134087237, "grad_norm": 1.7557212923761045, "learning_rate": 1.9955873267609752e-06, "loss": 0.0272, "step": 3504 }, { "epoch": 5.662358642972537, "grad_norm": 1.71176386349887, "learning_rate": 1.9943406407856194e-06, "loss": 0.034, "step": 3505 }, { "epoch": 5.663974151857835, "grad_norm": 2.0499226407648052, "learning_rate": 1.993094085905665e-06, "loss": 0.0322, "step": 3506 }, { "epoch": 5.665589660743134, "grad_norm": 2.498522626961916, "learning_rate": 1.99184766244429e-06, "loss": 0.0279, "step": 3507 }, { "epoch": 5.667205169628433, "grad_norm": 1.91706383855275, "learning_rate": 1.9906013707246373e-06, "loss": 0.0273, "step": 3508 }, { "epoch": 5.668820678513732, "grad_norm": 2.251860776656665, "learning_rate": 1.9893552110698144e-06, "loss": 0.0332, "step": 3509 }, { "epoch": 5.670436187399031, "grad_norm": 2.5814235322376455, "learning_rate": 1.9881091838028983e-06, "loss": 0.0415, "step": 3510 }, { "epoch": 5.6720516962843295, "grad_norm": 1.6127327942875693, "learning_rate": 1.9868632892469284e-06, "loss": 0.025, "step": 3511 }, { "epoch": 5.673667205169629, "grad_norm": 1.9115484087351007, "learning_rate": 1.98561752772491e-06, "loss": 0.0263, "step": 3512 }, { "epoch": 5.675282714054927, "grad_norm": 2.0115021641968767, "learning_rate": 1.984371899559816e-06, "loss": 0.0283, "step": 3513 }, { "epoch": 5.676898222940226, "grad_norm": 2.0163485451460557, "learning_rate": 1.9831264050745835e-06, "loss": 0.0281, "step": 3514 }, { "epoch": 5.678513731825525, "grad_norm": 1.848321108382374, "learning_rate": 1.981881044592114e-06, "loss": 0.0296, "step": 3515 }, { "epoch": 5.680129240710824, "grad_norm": 2.4359750528187663, "learning_rate": 1.9806358184352757e-06, "loss": 0.0407, "step": 3516 }, { "epoch": 5.681744749596123, "grad_norm": 2.298947459428209, "learning_rate": 1.9793907269269e-06, "loss": 0.0315, "step": 3517 }, { "epoch": 5.683360258481422, "grad_norm": 2.1889323170814636, "learning_rate": 1.9781457703897867e-06, "loss": 0.0318, "step": 3518 }, { "epoch": 5.684975767366721, "grad_norm": 1.9260159579719516, "learning_rate": 1.9769009491466985e-06, "loss": 0.0377, "step": 3519 }, { "epoch": 5.686591276252019, "grad_norm": 1.6769429943011824, "learning_rate": 1.9756562635203622e-06, "loss": 0.0242, "step": 3520 }, { "epoch": 5.688206785137318, "grad_norm": 1.8970924160914953, "learning_rate": 1.9744117138334712e-06, "loss": 0.0267, "step": 3521 }, { "epoch": 5.689822294022617, "grad_norm": 3.8613962641015678, "learning_rate": 1.973167300408681e-06, "loss": 0.0602, "step": 3522 }, { "epoch": 5.691437802907916, "grad_norm": 1.825830057458577, "learning_rate": 1.971923023568617e-06, "loss": 0.0355, "step": 3523 }, { "epoch": 5.693053311793215, "grad_norm": 2.2025149400687636, "learning_rate": 1.970678883635864e-06, "loss": 0.028, "step": 3524 }, { "epoch": 5.694668820678514, "grad_norm": 2.1258382863126384, "learning_rate": 1.9694348809329734e-06, "loss": 0.0375, "step": 3525 }, { "epoch": 5.696284329563813, "grad_norm": 1.913144123226687, "learning_rate": 1.9681910157824604e-06, "loss": 0.0363, "step": 3526 }, { "epoch": 5.6978998384491115, "grad_norm": 1.783414102156964, "learning_rate": 1.9669472885068054e-06, "loss": 0.026, "step": 3527 }, { "epoch": 5.69951534733441, "grad_norm": 2.207713014000358, "learning_rate": 1.9657036994284518e-06, "loss": 0.0329, "step": 3528 }, { "epoch": 5.701130856219709, "grad_norm": 1.730321579347936, "learning_rate": 1.9644602488698095e-06, "loss": 0.0302, "step": 3529 }, { "epoch": 5.702746365105008, "grad_norm": 1.50377351543822, "learning_rate": 1.96321693715325e-06, "loss": 0.022, "step": 3530 }, { "epoch": 5.704361873990307, "grad_norm": 2.1406764613106564, "learning_rate": 1.96197376460111e-06, "loss": 0.025, "step": 3531 }, { "epoch": 5.705977382875606, "grad_norm": 2.4083799268019073, "learning_rate": 1.96073073153569e-06, "loss": 0.0294, "step": 3532 }, { "epoch": 5.707592891760905, "grad_norm": 1.7275911289013068, "learning_rate": 1.959487838279252e-06, "loss": 0.0253, "step": 3533 }, { "epoch": 5.709208400646204, "grad_norm": 3.7183165532347453, "learning_rate": 1.958245085154028e-06, "loss": 0.0321, "step": 3534 }, { "epoch": 5.710823909531502, "grad_norm": 1.432949828106459, "learning_rate": 1.9570024724822075e-06, "loss": 0.0193, "step": 3535 }, { "epoch": 5.7124394184168015, "grad_norm": 1.8043527508268182, "learning_rate": 1.9557600005859458e-06, "loss": 0.0265, "step": 3536 }, { "epoch": 5.7140549273021, "grad_norm": 2.311785768382283, "learning_rate": 1.9545176697873605e-06, "loss": 0.0317, "step": 3537 }, { "epoch": 5.715670436187399, "grad_norm": 2.1399287769255877, "learning_rate": 1.953275480408536e-06, "loss": 0.0227, "step": 3538 }, { "epoch": 5.717285945072698, "grad_norm": 2.107946838620129, "learning_rate": 1.952033432771516e-06, "loss": 0.0284, "step": 3539 }, { "epoch": 5.718901453957997, "grad_norm": 1.6503310434386882, "learning_rate": 1.9507915271983106e-06, "loss": 0.0211, "step": 3540 }, { "epoch": 5.720516962843296, "grad_norm": 2.21353936260625, "learning_rate": 1.9495497640108905e-06, "loss": 0.0286, "step": 3541 }, { "epoch": 5.722132471728594, "grad_norm": 2.3217958494437476, "learning_rate": 1.9483081435311918e-06, "loss": 0.0282, "step": 3542 }, { "epoch": 5.723747980613894, "grad_norm": 2.3920575161331827, "learning_rate": 1.947066666081112e-06, "loss": 0.0343, "step": 3543 }, { "epoch": 5.725363489499192, "grad_norm": 1.9047848146589257, "learning_rate": 1.945825331982511e-06, "loss": 0.0231, "step": 3544 }, { "epoch": 5.726978998384491, "grad_norm": 2.0359009917360646, "learning_rate": 1.944584141557214e-06, "loss": 0.0336, "step": 3545 }, { "epoch": 5.72859450726979, "grad_norm": 1.985705976914303, "learning_rate": 1.9433430951270073e-06, "loss": 0.0291, "step": 3546 }, { "epoch": 5.730210016155089, "grad_norm": 2.0685377835128436, "learning_rate": 1.94210219301364e-06, "loss": 0.0281, "step": 3547 }, { "epoch": 5.731825525040388, "grad_norm": 2.787622310487923, "learning_rate": 1.940861435538824e-06, "loss": 0.0618, "step": 3548 }, { "epoch": 5.733441033925686, "grad_norm": 2.3331746219535177, "learning_rate": 1.939620823024232e-06, "loss": 0.0344, "step": 3549 }, { "epoch": 5.735056542810986, "grad_norm": 2.7918825772267053, "learning_rate": 1.938380355791502e-06, "loss": 0.0448, "step": 3550 }, { "epoch": 5.736672051696284, "grad_norm": 1.7552791557141993, "learning_rate": 1.9371400341622333e-06, "loss": 0.0265, "step": 3551 }, { "epoch": 5.7382875605815835, "grad_norm": 2.048020002786704, "learning_rate": 1.935899858457987e-06, "loss": 0.0395, "step": 3552 }, { "epoch": 5.739903069466882, "grad_norm": 2.099466743897673, "learning_rate": 1.9346598290002865e-06, "loss": 0.0308, "step": 3553 }, { "epoch": 5.741518578352181, "grad_norm": 2.747098120699748, "learning_rate": 1.9334199461106167e-06, "loss": 0.0347, "step": 3554 }, { "epoch": 5.74313408723748, "grad_norm": 2.226886086599465, "learning_rate": 1.9321802101104246e-06, "loss": 0.0377, "step": 3555 }, { "epoch": 5.744749596122778, "grad_norm": 1.8209019419911143, "learning_rate": 1.930940621321122e-06, "loss": 0.0291, "step": 3556 }, { "epoch": 5.746365105008078, "grad_norm": 2.184013107496835, "learning_rate": 1.929701180064078e-06, "loss": 0.0272, "step": 3557 }, { "epoch": 5.747980613893376, "grad_norm": 1.8364105044938561, "learning_rate": 1.928461886660627e-06, "loss": 0.0345, "step": 3558 }, { "epoch": 5.749596122778676, "grad_norm": 1.7243496548341373, "learning_rate": 1.9272227414320628e-06, "loss": 0.0256, "step": 3559 }, { "epoch": 5.751211631663974, "grad_norm": 2.5163437790513115, "learning_rate": 1.9259837446996413e-06, "loss": 0.0311, "step": 3560 }, { "epoch": 5.7528271405492735, "grad_norm": 1.8290353152626957, "learning_rate": 1.9247448967845806e-06, "loss": 0.0278, "step": 3561 }, { "epoch": 5.754442649434572, "grad_norm": 1.818271488361276, "learning_rate": 1.923506198008061e-06, "loss": 0.0318, "step": 3562 }, { "epoch": 5.75605815831987, "grad_norm": 1.6986529901172884, "learning_rate": 1.9222676486912223e-06, "loss": 0.0221, "step": 3563 }, { "epoch": 5.75767366720517, "grad_norm": 1.9827254889509496, "learning_rate": 1.921029249155166e-06, "loss": 0.026, "step": 3564 }, { "epoch": 5.759289176090468, "grad_norm": 2.034335454319056, "learning_rate": 1.919790999720955e-06, "loss": 0.0259, "step": 3565 }, { "epoch": 5.760904684975768, "grad_norm": 1.5544355519288715, "learning_rate": 1.9185529007096133e-06, "loss": 0.0241, "step": 3566 }, { "epoch": 5.762520193861066, "grad_norm": 1.9658141317686788, "learning_rate": 1.9173149524421265e-06, "loss": 0.0242, "step": 3567 }, { "epoch": 5.7641357027463656, "grad_norm": 2.245201586603563, "learning_rate": 1.916077155239441e-06, "loss": 0.0299, "step": 3568 }, { "epoch": 5.765751211631664, "grad_norm": 2.334040435706337, "learning_rate": 1.9148395094224614e-06, "loss": 0.0439, "step": 3569 }, { "epoch": 5.7673667205169625, "grad_norm": 1.77369651089485, "learning_rate": 1.9136020153120575e-06, "loss": 0.0273, "step": 3570 }, { "epoch": 5.768982229402262, "grad_norm": 2.243831095578403, "learning_rate": 1.912364673229057e-06, "loss": 0.0292, "step": 3571 }, { "epoch": 5.77059773828756, "grad_norm": 2.405655686216392, "learning_rate": 1.9111274834942476e-06, "loss": 0.0315, "step": 3572 }, { "epoch": 5.77221324717286, "grad_norm": 1.5966019558631281, "learning_rate": 1.9098904464283797e-06, "loss": 0.0219, "step": 3573 }, { "epoch": 5.773828756058158, "grad_norm": 1.7730055211495708, "learning_rate": 1.9086535623521628e-06, "loss": 0.0263, "step": 3574 }, { "epoch": 5.775444264943458, "grad_norm": 2.209823104844273, "learning_rate": 1.9074168315862674e-06, "loss": 0.0242, "step": 3575 }, { "epoch": 5.777059773828756, "grad_norm": 1.7069922843228833, "learning_rate": 1.9061802544513227e-06, "loss": 0.0281, "step": 3576 }, { "epoch": 5.778675282714055, "grad_norm": 1.7645467358041058, "learning_rate": 1.9049438312679189e-06, "loss": 0.0258, "step": 3577 }, { "epoch": 5.780290791599354, "grad_norm": 1.7181658834761604, "learning_rate": 1.9037075623566085e-06, "loss": 0.0288, "step": 3578 }, { "epoch": 5.7819063004846525, "grad_norm": 2.4549212147187385, "learning_rate": 1.9024714480379009e-06, "loss": 0.0344, "step": 3579 }, { "epoch": 5.783521809369952, "grad_norm": 2.0729887082337544, "learning_rate": 1.9012354886322668e-06, "loss": 0.0248, "step": 3580 }, { "epoch": 5.78513731825525, "grad_norm": 2.4570291600222576, "learning_rate": 1.8999996844601362e-06, "loss": 0.0335, "step": 3581 }, { "epoch": 5.78675282714055, "grad_norm": 2.258062812075541, "learning_rate": 1.8987640358418985e-06, "loss": 0.0245, "step": 3582 }, { "epoch": 5.788368336025848, "grad_norm": 1.6919590465978305, "learning_rate": 1.8975285430979055e-06, "loss": 0.0341, "step": 3583 }, { "epoch": 5.789983844911147, "grad_norm": 1.81471763745382, "learning_rate": 1.8962932065484651e-06, "loss": 0.0266, "step": 3584 }, { "epoch": 5.791599353796446, "grad_norm": 1.783171197784634, "learning_rate": 1.8950580265138467e-06, "loss": 0.0248, "step": 3585 }, { "epoch": 5.793214862681745, "grad_norm": 1.9810949870488368, "learning_rate": 1.893823003314278e-06, "loss": 0.0275, "step": 3586 }, { "epoch": 5.794830371567044, "grad_norm": 2.0596585771045346, "learning_rate": 1.892588137269947e-06, "loss": 0.0281, "step": 3587 }, { "epoch": 5.796445880452342, "grad_norm": 1.9257912408012585, "learning_rate": 1.8913534287009994e-06, "loss": 0.0325, "step": 3588 }, { "epoch": 5.798061389337642, "grad_norm": 1.633117581032971, "learning_rate": 1.8901188779275437e-06, "loss": 0.0265, "step": 3589 }, { "epoch": 5.79967689822294, "grad_norm": 2.4146732471410117, "learning_rate": 1.8888844852696432e-06, "loss": 0.0674, "step": 3590 }, { "epoch": 5.801292407108239, "grad_norm": 1.7887137473116868, "learning_rate": 1.8876502510473227e-06, "loss": 0.0331, "step": 3591 }, { "epoch": 5.802907915993538, "grad_norm": 2.413713382877321, "learning_rate": 1.8864161755805648e-06, "loss": 0.0381, "step": 3592 }, { "epoch": 5.804523424878837, "grad_norm": 1.6837911991789705, "learning_rate": 1.885182259189311e-06, "loss": 0.0219, "step": 3593 }, { "epoch": 5.806138933764136, "grad_norm": 1.891567343187096, "learning_rate": 1.8839485021934636e-06, "loss": 0.0334, "step": 3594 }, { "epoch": 5.8077544426494345, "grad_norm": 1.7662470975024975, "learning_rate": 1.882714904912881e-06, "loss": 0.0222, "step": 3595 }, { "epoch": 5.809369951534734, "grad_norm": 2.297313281281325, "learning_rate": 1.881481467667381e-06, "loss": 0.0323, "step": 3596 }, { "epoch": 5.810985460420032, "grad_norm": 2.434295881535688, "learning_rate": 1.8802481907767395e-06, "loss": 0.0282, "step": 3597 }, { "epoch": 5.812600969305331, "grad_norm": 1.8851695272339577, "learning_rate": 1.8790150745606926e-06, "loss": 0.0337, "step": 3598 }, { "epoch": 5.81421647819063, "grad_norm": 2.2282121492132103, "learning_rate": 1.8777821193389323e-06, "loss": 0.0379, "step": 3599 }, { "epoch": 5.815831987075929, "grad_norm": 2.0720382318195774, "learning_rate": 1.8765493254311114e-06, "loss": 0.0298, "step": 3600 }, { "epoch": 5.815831987075929, "eval_loss": 1.5364038944244385, "eval_runtime": 2.3479, "eval_samples_per_second": 127.772, "eval_steps_per_second": 2.981, "step": 3600 }, { "epoch": 5.817447495961228, "grad_norm": 2.794149327002196, "learning_rate": 1.8753166931568385e-06, "loss": 0.0344, "step": 3601 }, { "epoch": 5.819063004846527, "grad_norm": 1.782594241815284, "learning_rate": 1.8740842228356818e-06, "loss": 0.0332, "step": 3602 }, { "epoch": 5.820678513731826, "grad_norm": 2.020992683677999, "learning_rate": 1.8728519147871671e-06, "loss": 0.0216, "step": 3603 }, { "epoch": 5.8222940226171245, "grad_norm": 2.1184474151017527, "learning_rate": 1.8716197693307774e-06, "loss": 0.0391, "step": 3604 }, { "epoch": 5.823909531502423, "grad_norm": 1.7733195101228585, "learning_rate": 1.870387786785955e-06, "loss": 0.0264, "step": 3605 }, { "epoch": 5.825525040387722, "grad_norm": 1.7721270328491288, "learning_rate": 1.8691559674720994e-06, "loss": 0.0293, "step": 3606 }, { "epoch": 5.827140549273021, "grad_norm": 2.046438846423976, "learning_rate": 1.867924311708567e-06, "loss": 0.0344, "step": 3607 }, { "epoch": 5.82875605815832, "grad_norm": 2.23545525327542, "learning_rate": 1.8666928198146729e-06, "loss": 0.0287, "step": 3608 }, { "epoch": 5.830371567043619, "grad_norm": 2.083658603419166, "learning_rate": 1.8654614921096883e-06, "loss": 0.0264, "step": 3609 }, { "epoch": 5.831987075928918, "grad_norm": 1.886864894289507, "learning_rate": 1.8642303289128427e-06, "loss": 0.0256, "step": 3610 }, { "epoch": 5.833602584814217, "grad_norm": 1.9281526857050348, "learning_rate": 1.862999330543324e-06, "loss": 0.0299, "step": 3611 }, { "epoch": 5.835218093699515, "grad_norm": 1.7994807755517102, "learning_rate": 1.8617684973202766e-06, "loss": 0.0332, "step": 3612 }, { "epoch": 5.836833602584814, "grad_norm": 2.1782316122418774, "learning_rate": 1.860537829562801e-06, "loss": 0.0332, "step": 3613 }, { "epoch": 5.838449111470113, "grad_norm": 1.8971486228773395, "learning_rate": 1.8593073275899556e-06, "loss": 0.0313, "step": 3614 }, { "epoch": 5.840064620355412, "grad_norm": 2.1773416227495237, "learning_rate": 1.8580769917207553e-06, "loss": 0.0285, "step": 3615 }, { "epoch": 5.841680129240711, "grad_norm": 1.7590027898752199, "learning_rate": 1.8568468222741737e-06, "loss": 0.0322, "step": 3616 }, { "epoch": 5.84329563812601, "grad_norm": 1.7646672951929863, "learning_rate": 1.8556168195691399e-06, "loss": 0.0268, "step": 3617 }, { "epoch": 5.844911147011309, "grad_norm": 1.7011574636118878, "learning_rate": 1.85438698392454e-06, "loss": 0.0314, "step": 3618 }, { "epoch": 5.846526655896607, "grad_norm": 1.7961509016436215, "learning_rate": 1.8531573156592156e-06, "loss": 0.0267, "step": 3619 }, { "epoch": 5.8481421647819065, "grad_norm": 2.0989700862047704, "learning_rate": 1.8519278150919668e-06, "loss": 0.0341, "step": 3620 }, { "epoch": 5.849757673667205, "grad_norm": 2.1061901741463958, "learning_rate": 1.8506984825415485e-06, "loss": 0.0319, "step": 3621 }, { "epoch": 5.851373182552504, "grad_norm": 2.0050888431985343, "learning_rate": 1.849469318326675e-06, "loss": 0.0266, "step": 3622 }, { "epoch": 5.852988691437803, "grad_norm": 1.6708903221017068, "learning_rate": 1.8482403227660135e-06, "loss": 0.0215, "step": 3623 }, { "epoch": 5.854604200323102, "grad_norm": 2.1953303883960307, "learning_rate": 1.847011496178189e-06, "loss": 0.0344, "step": 3624 }, { "epoch": 5.856219709208401, "grad_norm": 2.2069753605713536, "learning_rate": 1.8457828388817824e-06, "loss": 0.0266, "step": 3625 }, { "epoch": 5.857835218093699, "grad_norm": 1.9760338772576862, "learning_rate": 1.8445543511953313e-06, "loss": 0.034, "step": 3626 }, { "epoch": 5.859450726978999, "grad_norm": 1.972834771793491, "learning_rate": 1.8433260334373293e-06, "loss": 0.0291, "step": 3627 }, { "epoch": 5.861066235864297, "grad_norm": 2.1336964583732425, "learning_rate": 1.8420978859262254e-06, "loss": 0.0309, "step": 3628 }, { "epoch": 5.8626817447495965, "grad_norm": 2.080735883133551, "learning_rate": 1.8408699089804236e-06, "loss": 0.0289, "step": 3629 }, { "epoch": 5.864297253634895, "grad_norm": 1.82371515281941, "learning_rate": 1.8396421029182862e-06, "loss": 0.0307, "step": 3630 }, { "epoch": 5.865912762520194, "grad_norm": 1.8638626198484152, "learning_rate": 1.8384144680581285e-06, "loss": 0.0335, "step": 3631 }, { "epoch": 5.867528271405493, "grad_norm": 1.733561662420202, "learning_rate": 1.8371870047182238e-06, "loss": 0.0245, "step": 3632 }, { "epoch": 5.869143780290791, "grad_norm": 1.9364303847467623, "learning_rate": 1.8359597132167987e-06, "loss": 0.0266, "step": 3633 }, { "epoch": 5.870759289176091, "grad_norm": 1.7657259580535776, "learning_rate": 1.8347325938720373e-06, "loss": 0.0276, "step": 3634 }, { "epoch": 5.872374798061389, "grad_norm": 2.1823417786957857, "learning_rate": 1.8335056470020773e-06, "loss": 0.0357, "step": 3635 }, { "epoch": 5.8739903069466886, "grad_norm": 1.913666897172906, "learning_rate": 1.832278872925013e-06, "loss": 0.0354, "step": 3636 }, { "epoch": 5.875605815831987, "grad_norm": 2.075290920791023, "learning_rate": 1.8310522719588918e-06, "loss": 0.0307, "step": 3637 }, { "epoch": 5.877221324717286, "grad_norm": 1.723053848620966, "learning_rate": 1.8298258444217203e-06, "loss": 0.0298, "step": 3638 }, { "epoch": 5.878836833602585, "grad_norm": 1.5099402612280164, "learning_rate": 1.8285995906314569e-06, "loss": 0.0217, "step": 3639 }, { "epoch": 5.880452342487883, "grad_norm": 1.677436887313604, "learning_rate": 1.8273735109060147e-06, "loss": 0.0189, "step": 3640 }, { "epoch": 5.882067851373183, "grad_norm": 2.1717907673120758, "learning_rate": 1.8261476055632635e-06, "loss": 0.037, "step": 3641 }, { "epoch": 5.883683360258481, "grad_norm": 1.9846878265753005, "learning_rate": 1.8249218749210253e-06, "loss": 0.0312, "step": 3642 }, { "epoch": 5.885298869143781, "grad_norm": 2.2703730989353446, "learning_rate": 1.8236963192970814e-06, "loss": 0.0359, "step": 3643 }, { "epoch": 5.886914378029079, "grad_norm": 2.0157976244699607, "learning_rate": 1.8224709390091633e-06, "loss": 0.0332, "step": 3644 }, { "epoch": 5.8885298869143785, "grad_norm": 1.8691962725792477, "learning_rate": 1.821245734374959e-06, "loss": 0.0249, "step": 3645 }, { "epoch": 5.890145395799677, "grad_norm": 1.7056390571839863, "learning_rate": 1.8200207057121103e-06, "loss": 0.0275, "step": 3646 }, { "epoch": 5.8917609046849755, "grad_norm": 1.7966889183188328, "learning_rate": 1.8187958533382138e-06, "loss": 0.0266, "step": 3647 }, { "epoch": 5.893376413570275, "grad_norm": 1.7807709922386785, "learning_rate": 1.8175711775708188e-06, "loss": 0.0296, "step": 3648 }, { "epoch": 5.894991922455573, "grad_norm": 1.9331005143371325, "learning_rate": 1.8163466787274332e-06, "loss": 0.0358, "step": 3649 }, { "epoch": 5.896607431340873, "grad_norm": 2.2025428846955877, "learning_rate": 1.8151223571255145e-06, "loss": 0.0394, "step": 3650 }, { "epoch": 5.898222940226171, "grad_norm": 1.968406277809508, "learning_rate": 1.8138982130824757e-06, "loss": 0.0432, "step": 3651 }, { "epoch": 5.899838449111471, "grad_norm": 1.5902460800620422, "learning_rate": 1.8126742469156843e-06, "loss": 0.0263, "step": 3652 }, { "epoch": 5.901453957996769, "grad_norm": 1.7091257776099937, "learning_rate": 1.8114504589424603e-06, "loss": 0.0248, "step": 3653 }, { "epoch": 5.903069466882068, "grad_norm": 1.9667900169172956, "learning_rate": 1.81022684948008e-06, "loss": 0.0301, "step": 3654 }, { "epoch": 5.904684975767367, "grad_norm": 1.5875385633872028, "learning_rate": 1.8090034188457718e-06, "loss": 0.0212, "step": 3655 }, { "epoch": 5.906300484652665, "grad_norm": 2.2593468836622574, "learning_rate": 1.807780167356717e-06, "loss": 0.0299, "step": 3656 }, { "epoch": 5.907915993537965, "grad_norm": 1.982253986704133, "learning_rate": 1.8065570953300511e-06, "loss": 0.0279, "step": 3657 }, { "epoch": 5.909531502423263, "grad_norm": 3.025496469996556, "learning_rate": 1.8053342030828646e-06, "loss": 0.031, "step": 3658 }, { "epoch": 5.911147011308563, "grad_norm": 1.7039757785545828, "learning_rate": 1.804111490932199e-06, "loss": 0.0271, "step": 3659 }, { "epoch": 5.912762520193861, "grad_norm": 2.2918304956761344, "learning_rate": 1.8028889591950507e-06, "loss": 0.0309, "step": 3660 }, { "epoch": 5.91437802907916, "grad_norm": 1.71886702842955, "learning_rate": 1.8016666081883685e-06, "loss": 0.0273, "step": 3661 }, { "epoch": 5.915993537964459, "grad_norm": 2.4264494490135937, "learning_rate": 1.8004444382290553e-06, "loss": 0.0428, "step": 3662 }, { "epoch": 5.9176090468497575, "grad_norm": 2.299263153886968, "learning_rate": 1.7992224496339657e-06, "loss": 0.0335, "step": 3663 }, { "epoch": 5.919224555735057, "grad_norm": 1.590544572329627, "learning_rate": 1.798000642719908e-06, "loss": 0.0255, "step": 3664 }, { "epoch": 5.920840064620355, "grad_norm": 1.6997334833007416, "learning_rate": 1.7967790178036438e-06, "loss": 0.0297, "step": 3665 }, { "epoch": 5.922455573505655, "grad_norm": 1.9770415137394002, "learning_rate": 1.7955575752018873e-06, "loss": 0.035, "step": 3666 }, { "epoch": 5.924071082390953, "grad_norm": 2.0812646856670587, "learning_rate": 1.7943363152313048e-06, "loss": 0.0266, "step": 3667 }, { "epoch": 5.925686591276252, "grad_norm": 1.9620431690707325, "learning_rate": 1.7931152382085163e-06, "loss": 0.0322, "step": 3668 }, { "epoch": 5.927302100161551, "grad_norm": 1.9672461071999507, "learning_rate": 1.791894344450093e-06, "loss": 0.0258, "step": 3669 }, { "epoch": 5.92891760904685, "grad_norm": 1.754917332928912, "learning_rate": 1.7906736342725588e-06, "loss": 0.0311, "step": 3670 }, { "epoch": 5.930533117932149, "grad_norm": 1.779762351375682, "learning_rate": 1.7894531079923927e-06, "loss": 0.0303, "step": 3671 }, { "epoch": 5.9321486268174475, "grad_norm": 1.8127733065996678, "learning_rate": 1.7882327659260224e-06, "loss": 0.0267, "step": 3672 }, { "epoch": 5.933764135702747, "grad_norm": 1.53792474493035, "learning_rate": 1.7870126083898298e-06, "loss": 0.0233, "step": 3673 }, { "epoch": 5.935379644588045, "grad_norm": 1.9983286455471223, "learning_rate": 1.785792635700148e-06, "loss": 0.0356, "step": 3674 }, { "epoch": 5.936995153473344, "grad_norm": 1.8738098979667421, "learning_rate": 1.784572848173262e-06, "loss": 0.0227, "step": 3675 }, { "epoch": 5.938610662358643, "grad_norm": 1.902534849219897, "learning_rate": 1.7833532461254117e-06, "loss": 0.0258, "step": 3676 }, { "epoch": 5.940226171243942, "grad_norm": 1.7491422670114294, "learning_rate": 1.7821338298727853e-06, "loss": 0.0237, "step": 3677 }, { "epoch": 5.941841680129241, "grad_norm": 2.304418068157673, "learning_rate": 1.7809145997315242e-06, "loss": 0.0417, "step": 3678 }, { "epoch": 5.94345718901454, "grad_norm": 2.2303314202075186, "learning_rate": 1.7796955560177217e-06, "loss": 0.0421, "step": 3679 }, { "epoch": 5.945072697899839, "grad_norm": 2.071577392023133, "learning_rate": 1.778476699047423e-06, "loss": 0.0307, "step": 3680 }, { "epoch": 5.946688206785137, "grad_norm": 1.647395228482575, "learning_rate": 1.7772580291366226e-06, "loss": 0.0346, "step": 3681 }, { "epoch": 5.948303715670436, "grad_norm": 1.9500414847996046, "learning_rate": 1.776039546601271e-06, "loss": 0.0284, "step": 3682 }, { "epoch": 5.949919224555735, "grad_norm": 1.5259800441705957, "learning_rate": 1.7748212517572666e-06, "loss": 0.0252, "step": 3683 }, { "epoch": 5.951534733441034, "grad_norm": 2.557671313517698, "learning_rate": 1.77360314492046e-06, "loss": 0.0403, "step": 3684 }, { "epoch": 5.953150242326333, "grad_norm": 1.926262520866917, "learning_rate": 1.7723852264066524e-06, "loss": 0.0313, "step": 3685 }, { "epoch": 5.954765751211632, "grad_norm": 1.519379719106884, "learning_rate": 1.7711674965315978e-06, "loss": 0.0212, "step": 3686 }, { "epoch": 5.956381260096931, "grad_norm": 2.210952251165322, "learning_rate": 1.7699499556110005e-06, "loss": 0.0315, "step": 3687 }, { "epoch": 5.9579967689822295, "grad_norm": 1.4606323661802199, "learning_rate": 1.7687326039605158e-06, "loss": 0.0237, "step": 3688 }, { "epoch": 5.959612277867528, "grad_norm": 1.758041139091829, "learning_rate": 1.7675154418957486e-06, "loss": 0.0198, "step": 3689 }, { "epoch": 5.961227786752827, "grad_norm": 1.8132751459362373, "learning_rate": 1.7662984697322576e-06, "loss": 0.0321, "step": 3690 }, { "epoch": 5.962843295638126, "grad_norm": 1.7582860766409247, "learning_rate": 1.765081687785549e-06, "loss": 0.028, "step": 3691 }, { "epoch": 5.964458804523425, "grad_norm": 2.0666724227967714, "learning_rate": 1.7638650963710824e-06, "loss": 0.0392, "step": 3692 }, { "epoch": 5.966074313408724, "grad_norm": 2.5011025964177804, "learning_rate": 1.7626486958042663e-06, "loss": 0.0375, "step": 3693 }, { "epoch": 5.967689822294023, "grad_norm": 2.020660752518577, "learning_rate": 1.7614324864004605e-06, "loss": 0.0326, "step": 3694 }, { "epoch": 5.969305331179322, "grad_norm": 1.3971626602152754, "learning_rate": 1.7602164684749753e-06, "loss": 0.0231, "step": 3695 }, { "epoch": 5.97092084006462, "grad_norm": 2.1502161012069125, "learning_rate": 1.7590006423430706e-06, "loss": 0.0273, "step": 3696 }, { "epoch": 5.9725363489499195, "grad_norm": 1.971617874234724, "learning_rate": 1.757785008319956e-06, "loss": 0.0307, "step": 3697 }, { "epoch": 5.974151857835218, "grad_norm": 1.705490601674834, "learning_rate": 1.756569566720795e-06, "loss": 0.0296, "step": 3698 }, { "epoch": 5.975767366720517, "grad_norm": 1.5205285275119707, "learning_rate": 1.7553543178606966e-06, "loss": 0.0271, "step": 3699 }, { "epoch": 5.977382875605816, "grad_norm": 2.146150367187254, "learning_rate": 1.7541392620547223e-06, "loss": 0.0336, "step": 3700 }, { "epoch": 5.978998384491114, "grad_norm": 1.8950710505392254, "learning_rate": 1.7529243996178829e-06, "loss": 0.0275, "step": 3701 }, { "epoch": 5.980613893376414, "grad_norm": 1.927692834986998, "learning_rate": 1.7517097308651387e-06, "loss": 0.0264, "step": 3702 }, { "epoch": 5.982229402261712, "grad_norm": 1.6238337935186684, "learning_rate": 1.7504952561114013e-06, "loss": 0.0251, "step": 3703 }, { "epoch": 5.983844911147012, "grad_norm": 1.6980771118306166, "learning_rate": 1.7492809756715312e-06, "loss": 0.0236, "step": 3704 }, { "epoch": 5.98546042003231, "grad_norm": 2.2238726301541463, "learning_rate": 1.7480668898603372e-06, "loss": 0.0288, "step": 3705 }, { "epoch": 5.987075928917609, "grad_norm": 2.4569293516286774, "learning_rate": 1.7468529989925794e-06, "loss": 0.036, "step": 3706 }, { "epoch": 5.988691437802908, "grad_norm": 2.415370125383574, "learning_rate": 1.7456393033829667e-06, "loss": 0.0456, "step": 3707 }, { "epoch": 5.990306946688206, "grad_norm": 2.2795540230844193, "learning_rate": 1.7444258033461562e-06, "loss": 0.0308, "step": 3708 }, { "epoch": 5.991922455573506, "grad_norm": 1.500900510154935, "learning_rate": 1.7432124991967575e-06, "loss": 0.022, "step": 3709 }, { "epoch": 5.993537964458804, "grad_norm": 1.5663081427571832, "learning_rate": 1.7419993912493265e-06, "loss": 0.0221, "step": 3710 }, { "epoch": 5.995153473344104, "grad_norm": 2.1520888080627643, "learning_rate": 1.7407864798183692e-06, "loss": 0.0341, "step": 3711 }, { "epoch": 5.996768982229402, "grad_norm": 2.007078065999495, "learning_rate": 1.7395737652183404e-06, "loss": 0.0242, "step": 3712 }, { "epoch": 5.9983844911147015, "grad_norm": 1.6793218077244476, "learning_rate": 1.738361247763643e-06, "loss": 0.0247, "step": 3713 }, { "epoch": 6.0, "grad_norm": 1.8248821982827053, "learning_rate": 1.7371489277686326e-06, "loss": 0.0217, "step": 3714 }, { "epoch": 6.0016155088852985, "grad_norm": 1.2086997477023707, "learning_rate": 1.7359368055476089e-06, "loss": 0.0168, "step": 3715 }, { "epoch": 6.003231017770598, "grad_norm": 1.3005604476359522, "learning_rate": 1.7347248814148226e-06, "loss": 0.0162, "step": 3716 }, { "epoch": 6.004846526655896, "grad_norm": 1.226215977679583, "learning_rate": 1.7335131556844725e-06, "loss": 0.0106, "step": 3717 }, { "epoch": 6.006462035541196, "grad_norm": 1.189787071751824, "learning_rate": 1.732301628670707e-06, "loss": 0.0128, "step": 3718 }, { "epoch": 6.008077544426494, "grad_norm": 1.0198238781745306, "learning_rate": 1.7310903006876211e-06, "loss": 0.0117, "step": 3719 }, { "epoch": 6.009693053311794, "grad_norm": 0.8505981900202273, "learning_rate": 1.7298791720492603e-06, "loss": 0.0094, "step": 3720 }, { "epoch": 6.011308562197092, "grad_norm": 1.0025884233120124, "learning_rate": 1.7286682430696168e-06, "loss": 0.0124, "step": 3721 }, { "epoch": 6.012924071082391, "grad_norm": 1.336979871788655, "learning_rate": 1.7274575140626318e-06, "loss": 0.0163, "step": 3722 }, { "epoch": 6.01453957996769, "grad_norm": 1.5044398714988592, "learning_rate": 1.726246985342195e-06, "loss": 0.0296, "step": 3723 }, { "epoch": 6.016155088852988, "grad_norm": 1.3482830879805805, "learning_rate": 1.725036657222142e-06, "loss": 0.0125, "step": 3724 }, { "epoch": 6.017770597738288, "grad_norm": 1.562681768021429, "learning_rate": 1.7238265300162592e-06, "loss": 0.0104, "step": 3725 }, { "epoch": 6.019386106623586, "grad_norm": 2.2761023743528215, "learning_rate": 1.7226166040382807e-06, "loss": 0.0205, "step": 3726 }, { "epoch": 6.021001615508886, "grad_norm": 1.077428251049811, "learning_rate": 1.7214068796018862e-06, "loss": 0.0177, "step": 3727 }, { "epoch": 6.022617124394184, "grad_norm": 1.2475721318055701, "learning_rate": 1.7201973570207054e-06, "loss": 0.0116, "step": 3728 }, { "epoch": 6.024232633279483, "grad_norm": 1.2079625508890688, "learning_rate": 1.7189880366083134e-06, "loss": 0.0126, "step": 3729 }, { "epoch": 6.025848142164782, "grad_norm": 1.4582425667596506, "learning_rate": 1.7177789186782342e-06, "loss": 0.0104, "step": 3730 }, { "epoch": 6.0274636510500805, "grad_norm": 1.011599058152423, "learning_rate": 1.7165700035439412e-06, "loss": 0.0122, "step": 3731 }, { "epoch": 6.02907915993538, "grad_norm": 1.344998804294711, "learning_rate": 1.7153612915188517e-06, "loss": 0.009, "step": 3732 }, { "epoch": 6.030694668820678, "grad_norm": 2.4372119728607666, "learning_rate": 1.714152782916333e-06, "loss": 0.0114, "step": 3733 }, { "epoch": 6.032310177705978, "grad_norm": 0.9676439232981267, "learning_rate": 1.7129444780496973e-06, "loss": 0.0091, "step": 3734 }, { "epoch": 6.033925686591276, "grad_norm": 1.1192453181660895, "learning_rate": 1.7117363772322056e-06, "loss": 0.0111, "step": 3735 }, { "epoch": 6.035541195476575, "grad_norm": 1.353113572819528, "learning_rate": 1.7105284807770673e-06, "loss": 0.0129, "step": 3736 }, { "epoch": 6.037156704361874, "grad_norm": 1.5080589111390608, "learning_rate": 1.7093207889974356e-06, "loss": 0.0172, "step": 3737 }, { "epoch": 6.038772213247173, "grad_norm": 1.4754357129792388, "learning_rate": 1.708113302206413e-06, "loss": 0.0152, "step": 3738 }, { "epoch": 6.040387722132472, "grad_norm": 1.1482219380595213, "learning_rate": 1.7069060207170478e-06, "loss": 0.0126, "step": 3739 }, { "epoch": 6.0420032310177705, "grad_norm": 1.1869833192031958, "learning_rate": 1.7056989448423347e-06, "loss": 0.0114, "step": 3740 }, { "epoch": 6.04361873990307, "grad_norm": 1.1082163568702892, "learning_rate": 1.7044920748952177e-06, "loss": 0.0105, "step": 3741 }, { "epoch": 6.045234248788368, "grad_norm": 1.5120257834801563, "learning_rate": 1.7032854111885838e-06, "loss": 0.0149, "step": 3742 }, { "epoch": 6.046849757673667, "grad_norm": 1.0957307740934539, "learning_rate": 1.7020789540352695e-06, "loss": 0.0149, "step": 3743 }, { "epoch": 6.048465266558966, "grad_norm": 1.4158280679895558, "learning_rate": 1.700872703748055e-06, "loss": 0.0121, "step": 3744 }, { "epoch": 6.050080775444265, "grad_norm": 1.8926911945606566, "learning_rate": 1.6996666606396695e-06, "loss": 0.0207, "step": 3745 }, { "epoch": 6.051696284329564, "grad_norm": 1.0275001963884676, "learning_rate": 1.6984608250227862e-06, "loss": 0.0125, "step": 3746 }, { "epoch": 6.053311793214863, "grad_norm": 1.427304736908955, "learning_rate": 1.6972551972100276e-06, "loss": 0.0115, "step": 3747 }, { "epoch": 6.054927302100162, "grad_norm": 1.2272000607449571, "learning_rate": 1.696049777513959e-06, "loss": 0.0136, "step": 3748 }, { "epoch": 6.05654281098546, "grad_norm": 1.4396536829525195, "learning_rate": 1.6948445662470926e-06, "loss": 0.012, "step": 3749 }, { "epoch": 6.058158319870759, "grad_norm": 1.428246613692539, "learning_rate": 1.6936395637218883e-06, "loss": 0.0134, "step": 3750 }, { "epoch": 6.059773828756058, "grad_norm": 1.7910458064541959, "learning_rate": 1.6924347702507495e-06, "loss": 0.0185, "step": 3751 }, { "epoch": 6.061389337641357, "grad_norm": 1.5327789776664436, "learning_rate": 1.691230186146028e-06, "loss": 0.0132, "step": 3752 }, { "epoch": 6.063004846526656, "grad_norm": 1.3969102319722073, "learning_rate": 1.6900258117200185e-06, "loss": 0.0131, "step": 3753 }, { "epoch": 6.064620355411955, "grad_norm": 1.2556073996064472, "learning_rate": 1.6888216472849638e-06, "loss": 0.0134, "step": 3754 }, { "epoch": 6.066235864297254, "grad_norm": 1.310746272865231, "learning_rate": 1.6876176931530509e-06, "loss": 0.0133, "step": 3755 }, { "epoch": 6.0678513731825525, "grad_norm": 1.3604979901615426, "learning_rate": 1.6864139496364124e-06, "loss": 0.0167, "step": 3756 }, { "epoch": 6.069466882067851, "grad_norm": 1.389580246594958, "learning_rate": 1.685210417047125e-06, "loss": 0.0161, "step": 3757 }, { "epoch": 6.07108239095315, "grad_norm": 1.2691816194426722, "learning_rate": 1.684007095697215e-06, "loss": 0.0113, "step": 3758 }, { "epoch": 6.072697899838449, "grad_norm": 1.160815804495725, "learning_rate": 1.6828039858986494e-06, "loss": 0.0122, "step": 3759 }, { "epoch": 6.074313408723748, "grad_norm": 1.19970299582979, "learning_rate": 1.6816010879633424e-06, "loss": 0.0146, "step": 3760 }, { "epoch": 6.075928917609047, "grad_norm": 1.853032083564138, "learning_rate": 1.6803984022031528e-06, "loss": 0.016, "step": 3761 }, { "epoch": 6.077544426494346, "grad_norm": 1.3668138672051915, "learning_rate": 1.6791959289298838e-06, "loss": 0.0157, "step": 3762 }, { "epoch": 6.079159935379645, "grad_norm": 1.3014386918541399, "learning_rate": 1.677993668455286e-06, "loss": 0.0218, "step": 3763 }, { "epoch": 6.080775444264943, "grad_norm": 1.3812632779044682, "learning_rate": 1.6767916210910518e-06, "loss": 0.0144, "step": 3764 }, { "epoch": 6.0823909531502425, "grad_norm": 1.4698014127053782, "learning_rate": 1.6755897871488203e-06, "loss": 0.0157, "step": 3765 }, { "epoch": 6.084006462035541, "grad_norm": 1.070902986632019, "learning_rate": 1.674388166940174e-06, "loss": 0.0126, "step": 3766 }, { "epoch": 6.08562197092084, "grad_norm": 1.1159088579214083, "learning_rate": 1.6731867607766404e-06, "loss": 0.0098, "step": 3767 }, { "epoch": 6.087237479806139, "grad_norm": 0.9344764526916128, "learning_rate": 1.6719855689696918e-06, "loss": 0.0118, "step": 3768 }, { "epoch": 6.088852988691438, "grad_norm": 1.417877714306274, "learning_rate": 1.6707845918307458e-06, "loss": 0.0148, "step": 3769 }, { "epoch": 6.090468497576737, "grad_norm": 2.0569874188640016, "learning_rate": 1.6695838296711624e-06, "loss": 0.0154, "step": 3770 }, { "epoch": 6.092084006462035, "grad_norm": 1.203977634201218, "learning_rate": 1.6683832828022473e-06, "loss": 0.0116, "step": 3771 }, { "epoch": 6.093699515347335, "grad_norm": 1.335655187573929, "learning_rate": 1.6671829515352494e-06, "loss": 0.0218, "step": 3772 }, { "epoch": 6.095315024232633, "grad_norm": 1.1419827710599588, "learning_rate": 1.6659828361813616e-06, "loss": 0.0131, "step": 3773 }, { "epoch": 6.096930533117932, "grad_norm": 1.384996797139566, "learning_rate": 1.6647829370517232e-06, "loss": 0.0131, "step": 3774 }, { "epoch": 6.098546042003231, "grad_norm": 1.0289724773837001, "learning_rate": 1.6635832544574148e-06, "loss": 0.0126, "step": 3775 }, { "epoch": 6.10016155088853, "grad_norm": 1.6390496522151017, "learning_rate": 1.6623837887094615e-06, "loss": 0.0187, "step": 3776 }, { "epoch": 6.101777059773829, "grad_norm": 1.3826523847733014, "learning_rate": 1.6611845401188323e-06, "loss": 0.0149, "step": 3777 }, { "epoch": 6.103392568659127, "grad_norm": 1.4655017552769836, "learning_rate": 1.6599855089964406e-06, "loss": 0.0171, "step": 3778 }, { "epoch": 6.105008077544427, "grad_norm": 2.190377414319819, "learning_rate": 1.6587866956531423e-06, "loss": 0.0271, "step": 3779 }, { "epoch": 6.106623586429725, "grad_norm": 1.583961487715067, "learning_rate": 1.6575881003997382e-06, "loss": 0.0141, "step": 3780 }, { "epoch": 6.1082390953150245, "grad_norm": 1.2781365154739865, "learning_rate": 1.6563897235469705e-06, "loss": 0.0133, "step": 3781 }, { "epoch": 6.109854604200323, "grad_norm": 1.3487214039168016, "learning_rate": 1.6551915654055272e-06, "loss": 0.0183, "step": 3782 }, { "epoch": 6.111470113085622, "grad_norm": 1.2605988932980252, "learning_rate": 1.6539936262860379e-06, "loss": 0.0126, "step": 3783 }, { "epoch": 6.113085621970921, "grad_norm": 1.0817434134025106, "learning_rate": 1.6527959064990753e-06, "loss": 0.0151, "step": 3784 }, { "epoch": 6.114701130856219, "grad_norm": 0.9702826568630697, "learning_rate": 1.6515984063551566e-06, "loss": 0.012, "step": 3785 }, { "epoch": 6.116316639741519, "grad_norm": 1.2477609726054764, "learning_rate": 1.6504011261647417e-06, "loss": 0.017, "step": 3786 }, { "epoch": 6.117932148626817, "grad_norm": 0.9523103339091012, "learning_rate": 1.6492040662382324e-06, "loss": 0.0093, "step": 3787 }, { "epoch": 6.119547657512117, "grad_norm": 1.0158338803420042, "learning_rate": 1.6480072268859745e-06, "loss": 0.0109, "step": 3788 }, { "epoch": 6.121163166397415, "grad_norm": 1.5686623687162613, "learning_rate": 1.6468106084182558e-06, "loss": 0.0182, "step": 3789 }, { "epoch": 6.1227786752827145, "grad_norm": 1.4519239919945641, "learning_rate": 1.6456142111453065e-06, "loss": 0.018, "step": 3790 }, { "epoch": 6.124394184168013, "grad_norm": 1.4220225480298094, "learning_rate": 1.6444180353773023e-06, "loss": 0.009, "step": 3791 }, { "epoch": 6.1260096930533114, "grad_norm": 1.3711808394780207, "learning_rate": 1.6432220814243582e-06, "loss": 0.0153, "step": 3792 }, { "epoch": 6.127625201938611, "grad_norm": 1.026223264375371, "learning_rate": 1.6420263495965328e-06, "loss": 0.0123, "step": 3793 }, { "epoch": 6.129240710823909, "grad_norm": 1.3529395230942098, "learning_rate": 1.640830840203827e-06, "loss": 0.0165, "step": 3794 }, { "epoch": 6.130856219709209, "grad_norm": 0.9146047364957252, "learning_rate": 1.6396355535561833e-06, "loss": 0.0108, "step": 3795 }, { "epoch": 6.132471728594507, "grad_norm": 1.4438941636304086, "learning_rate": 1.6384404899634899e-06, "loss": 0.0162, "step": 3796 }, { "epoch": 6.1340872374798066, "grad_norm": 1.406932410486601, "learning_rate": 1.6372456497355732e-06, "loss": 0.0152, "step": 3797 }, { "epoch": 6.135702746365105, "grad_norm": 1.3006457808865695, "learning_rate": 1.6360510331822027e-06, "loss": 0.0187, "step": 3798 }, { "epoch": 6.1373182552504035, "grad_norm": 0.9902023175874156, "learning_rate": 1.6348566406130912e-06, "loss": 0.0086, "step": 3799 }, { "epoch": 6.138933764135703, "grad_norm": 0.9624364910115876, "learning_rate": 1.6336624723378911e-06, "loss": 0.0099, "step": 3800 }, { "epoch": 6.138933764135703, "eval_loss": 1.6377763748168945, "eval_runtime": 2.3559, "eval_samples_per_second": 127.34, "eval_steps_per_second": 2.971, "step": 3800 }, { "epoch": 6.140549273021001, "grad_norm": 1.2314233059365849, "learning_rate": 1.6324685286662001e-06, "loss": 0.0137, "step": 3801 }, { "epoch": 6.142164781906301, "grad_norm": 1.499021856608979, "learning_rate": 1.6312748099075545e-06, "loss": 0.0103, "step": 3802 }, { "epoch": 6.143780290791599, "grad_norm": 1.2514123627439464, "learning_rate": 1.6300813163714342e-06, "loss": 0.0117, "step": 3803 }, { "epoch": 6.145395799676899, "grad_norm": 2.12444265706618, "learning_rate": 1.6288880483672587e-06, "loss": 0.0251, "step": 3804 }, { "epoch": 6.147011308562197, "grad_norm": 1.6328557197216715, "learning_rate": 1.6276950062043912e-06, "loss": 0.0173, "step": 3805 }, { "epoch": 6.148626817447496, "grad_norm": 1.158998936224715, "learning_rate": 1.6265021901921351e-06, "loss": 0.0112, "step": 3806 }, { "epoch": 6.150242326332795, "grad_norm": 1.5668274726037417, "learning_rate": 1.6253096006397365e-06, "loss": 0.0184, "step": 3807 }, { "epoch": 6.1518578352180935, "grad_norm": 1.5361075929597385, "learning_rate": 1.6241172378563808e-06, "loss": 0.0172, "step": 3808 }, { "epoch": 6.153473344103393, "grad_norm": 1.4189592418981738, "learning_rate": 1.6229251021511955e-06, "loss": 0.0232, "step": 3809 }, { "epoch": 6.155088852988691, "grad_norm": 1.182530002734368, "learning_rate": 1.6217331938332505e-06, "loss": 0.0153, "step": 3810 }, { "epoch": 6.156704361873991, "grad_norm": 1.667562262410916, "learning_rate": 1.6205415132115542e-06, "loss": 0.0168, "step": 3811 }, { "epoch": 6.158319870759289, "grad_norm": 1.44806784324572, "learning_rate": 1.6193500605950589e-06, "loss": 0.0175, "step": 3812 }, { "epoch": 6.159935379644588, "grad_norm": 0.966354370691736, "learning_rate": 1.6181588362926543e-06, "loss": 0.0111, "step": 3813 }, { "epoch": 6.161550888529887, "grad_norm": 1.060599805638232, "learning_rate": 1.616967840613175e-06, "loss": 0.012, "step": 3814 }, { "epoch": 6.163166397415186, "grad_norm": 1.1268733498261727, "learning_rate": 1.615777073865393e-06, "loss": 0.0132, "step": 3815 }, { "epoch": 6.164781906300485, "grad_norm": 1.0609693142821635, "learning_rate": 1.614586536358022e-06, "loss": 0.0153, "step": 3816 }, { "epoch": 6.166397415185783, "grad_norm": 1.098183805007306, "learning_rate": 1.6133962283997163e-06, "loss": 0.0159, "step": 3817 }, { "epoch": 6.168012924071083, "grad_norm": 1.1991470828539952, "learning_rate": 1.6122061502990722e-06, "loss": 0.017, "step": 3818 }, { "epoch": 6.169628432956381, "grad_norm": 1.1459931732721407, "learning_rate": 1.6110163023646235e-06, "loss": 0.0097, "step": 3819 }, { "epoch": 6.17124394184168, "grad_norm": 1.525278324394973, "learning_rate": 1.6098266849048468e-06, "loss": 0.0225, "step": 3820 }, { "epoch": 6.172859450726979, "grad_norm": 1.7222665913782738, "learning_rate": 1.6086372982281578e-06, "loss": 0.0176, "step": 3821 }, { "epoch": 6.174474959612278, "grad_norm": 1.2041828192925321, "learning_rate": 1.607448142642911e-06, "loss": 0.0109, "step": 3822 }, { "epoch": 6.176090468497577, "grad_norm": 1.212118044527225, "learning_rate": 1.606259218457405e-06, "loss": 0.013, "step": 3823 }, { "epoch": 6.1777059773828755, "grad_norm": 1.176044910882707, "learning_rate": 1.6050705259798744e-06, "loss": 0.0141, "step": 3824 }, { "epoch": 6.179321486268175, "grad_norm": 1.2330626997259437, "learning_rate": 1.603882065518496e-06, "loss": 0.0142, "step": 3825 }, { "epoch": 6.180936995153473, "grad_norm": 0.8229639933524889, "learning_rate": 1.6026938373813856e-06, "loss": 0.011, "step": 3826 }, { "epoch": 6.182552504038772, "grad_norm": 1.345886442165989, "learning_rate": 1.6015058418765984e-06, "loss": 0.0112, "step": 3827 }, { "epoch": 6.184168012924071, "grad_norm": 1.2324454026671374, "learning_rate": 1.6003180793121293e-06, "loss": 0.0134, "step": 3828 }, { "epoch": 6.18578352180937, "grad_norm": 1.3979344702169187, "learning_rate": 1.5991305499959147e-06, "loss": 0.0184, "step": 3829 }, { "epoch": 6.187399030694669, "grad_norm": 1.344589462076588, "learning_rate": 1.597943254235829e-06, "loss": 0.0161, "step": 3830 }, { "epoch": 6.189014539579968, "grad_norm": 1.5043431731201018, "learning_rate": 1.5967561923396854e-06, "loss": 0.0241, "step": 3831 }, { "epoch": 6.190630048465267, "grad_norm": 1.0984823879894088, "learning_rate": 1.5955693646152376e-06, "loss": 0.012, "step": 3832 }, { "epoch": 6.1922455573505655, "grad_norm": 1.4710597947996258, "learning_rate": 1.5943827713701771e-06, "loss": 0.0166, "step": 3833 }, { "epoch": 6.193861066235864, "grad_norm": 1.4747873187070688, "learning_rate": 1.5931964129121378e-06, "loss": 0.0146, "step": 3834 }, { "epoch": 6.195476575121163, "grad_norm": 1.5280925694067835, "learning_rate": 1.5920102895486894e-06, "loss": 0.0117, "step": 3835 }, { "epoch": 6.197092084006462, "grad_norm": 1.1898021366138012, "learning_rate": 1.5908244015873426e-06, "loss": 0.0152, "step": 3836 }, { "epoch": 6.198707592891761, "grad_norm": 0.997903171362714, "learning_rate": 1.5896387493355454e-06, "loss": 0.0116, "step": 3837 }, { "epoch": 6.20032310177706, "grad_norm": 1.9892032044337848, "learning_rate": 1.5884533331006863e-06, "loss": 0.0146, "step": 3838 }, { "epoch": 6.201938610662358, "grad_norm": 1.2453896083935527, "learning_rate": 1.5872681531900918e-06, "loss": 0.0101, "step": 3839 }, { "epoch": 6.203554119547658, "grad_norm": 1.0945691359650949, "learning_rate": 1.5860832099110277e-06, "loss": 0.0132, "step": 3840 }, { "epoch": 6.205169628432956, "grad_norm": 0.8038689721629918, "learning_rate": 1.5848985035706974e-06, "loss": 0.0075, "step": 3841 }, { "epoch": 6.206785137318255, "grad_norm": 1.149166057299419, "learning_rate": 1.5837140344762445e-06, "loss": 0.0123, "step": 3842 }, { "epoch": 6.208400646203554, "grad_norm": 1.545041333805319, "learning_rate": 1.5825298029347498e-06, "loss": 0.016, "step": 3843 }, { "epoch": 6.210016155088853, "grad_norm": 1.7288410507518317, "learning_rate": 1.5813458092532316e-06, "loss": 0.0219, "step": 3844 }, { "epoch": 6.211631663974152, "grad_norm": 0.9515882415077892, "learning_rate": 1.5801620537386492e-06, "loss": 0.0098, "step": 3845 }, { "epoch": 6.21324717285945, "grad_norm": 1.5308283661047206, "learning_rate": 1.5789785366978987e-06, "loss": 0.0149, "step": 3846 }, { "epoch": 6.21486268174475, "grad_norm": 1.350581587976531, "learning_rate": 1.5777952584378143e-06, "loss": 0.0139, "step": 3847 }, { "epoch": 6.216478190630048, "grad_norm": 1.5322996214494138, "learning_rate": 1.576612219265168e-06, "loss": 0.0153, "step": 3848 }, { "epoch": 6.2180936995153475, "grad_norm": 1.697346791039615, "learning_rate": 1.5754294194866696e-06, "loss": 0.0152, "step": 3849 }, { "epoch": 6.219709208400646, "grad_norm": 1.4246078681543184, "learning_rate": 1.574246859408969e-06, "loss": 0.0122, "step": 3850 }, { "epoch": 6.221324717285945, "grad_norm": 0.8936341254720206, "learning_rate": 1.5730645393386519e-06, "loss": 0.0131, "step": 3851 }, { "epoch": 6.222940226171244, "grad_norm": 1.630198513156465, "learning_rate": 1.571882459582242e-06, "loss": 0.0387, "step": 3852 }, { "epoch": 6.224555735056542, "grad_norm": 1.1732087759930727, "learning_rate": 1.570700620446201e-06, "loss": 0.0163, "step": 3853 }, { "epoch": 6.226171243941842, "grad_norm": 1.2464708964326423, "learning_rate": 1.5695190222369284e-06, "loss": 0.0159, "step": 3854 }, { "epoch": 6.22778675282714, "grad_norm": 1.3971192208798067, "learning_rate": 1.5683376652607595e-06, "loss": 0.0149, "step": 3855 }, { "epoch": 6.22940226171244, "grad_norm": 1.9467058003843052, "learning_rate": 1.5671565498239717e-06, "loss": 0.0184, "step": 3856 }, { "epoch": 6.231017770597738, "grad_norm": 1.352843343768791, "learning_rate": 1.5659756762327744e-06, "loss": 0.0134, "step": 3857 }, { "epoch": 6.2326332794830375, "grad_norm": 1.2933168999440783, "learning_rate": 1.5647950447933174e-06, "loss": 0.0119, "step": 3858 }, { "epoch": 6.234248788368336, "grad_norm": 1.4634482243259452, "learning_rate": 1.5636146558116865e-06, "loss": 0.0136, "step": 3859 }, { "epoch": 6.2358642972536344, "grad_norm": 1.421867534342686, "learning_rate": 1.5624345095939046e-06, "loss": 0.0145, "step": 3860 }, { "epoch": 6.237479806138934, "grad_norm": 1.3362216588063431, "learning_rate": 1.5612546064459333e-06, "loss": 0.0132, "step": 3861 }, { "epoch": 6.239095315024232, "grad_norm": 1.8002225791780193, "learning_rate": 1.5600749466736697e-06, "loss": 0.0192, "step": 3862 }, { "epoch": 6.240710823909532, "grad_norm": 2.305341229146634, "learning_rate": 1.5588955305829477e-06, "loss": 0.0283, "step": 3863 }, { "epoch": 6.24232633279483, "grad_norm": 1.190741391490641, "learning_rate": 1.5577163584795386e-06, "loss": 0.0078, "step": 3864 }, { "epoch": 6.24394184168013, "grad_norm": 1.4648551053479608, "learning_rate": 1.5565374306691493e-06, "loss": 0.0209, "step": 3865 }, { "epoch": 6.245557350565428, "grad_norm": 1.4263748501052687, "learning_rate": 1.5553587474574255e-06, "loss": 0.0158, "step": 3866 }, { "epoch": 6.2471728594507265, "grad_norm": 1.8074562577580398, "learning_rate": 1.5541803091499485e-06, "loss": 0.0181, "step": 3867 }, { "epoch": 6.248788368336026, "grad_norm": 1.076252830388492, "learning_rate": 1.553002116052235e-06, "loss": 0.0104, "step": 3868 }, { "epoch": 6.250403877221324, "grad_norm": 1.5646941414616198, "learning_rate": 1.5518241684697394e-06, "loss": 0.016, "step": 3869 }, { "epoch": 6.252019386106624, "grad_norm": 1.3543799160671348, "learning_rate": 1.5506464667078528e-06, "loss": 0.0151, "step": 3870 }, { "epoch": 6.253634894991922, "grad_norm": 0.8711208610186977, "learning_rate": 1.5494690110719005e-06, "loss": 0.0112, "step": 3871 }, { "epoch": 6.255250403877222, "grad_norm": 1.3848322746119304, "learning_rate": 1.5482918018671467e-06, "loss": 0.0164, "step": 3872 }, { "epoch": 6.25686591276252, "grad_norm": 1.1439984483544978, "learning_rate": 1.547114839398789e-06, "loss": 0.0095, "step": 3873 }, { "epoch": 6.258481421647819, "grad_norm": 0.7875691671979401, "learning_rate": 1.545938123971964e-06, "loss": 0.008, "step": 3874 }, { "epoch": 6.260096930533118, "grad_norm": 1.7097259115712444, "learning_rate": 1.5447616558917414e-06, "loss": 0.0225, "step": 3875 }, { "epoch": 6.2617124394184165, "grad_norm": 1.3671340165181243, "learning_rate": 1.5435854354631285e-06, "loss": 0.0127, "step": 3876 }, { "epoch": 6.263327948303716, "grad_norm": 1.259565719619627, "learning_rate": 1.542409462991067e-06, "loss": 0.0123, "step": 3877 }, { "epoch": 6.264943457189014, "grad_norm": 1.3466223835325088, "learning_rate": 1.541233738780437e-06, "loss": 0.0173, "step": 3878 }, { "epoch": 6.266558966074314, "grad_norm": 1.3428601632497073, "learning_rate": 1.5400582631360517e-06, "loss": 0.0207, "step": 3879 }, { "epoch": 6.268174474959612, "grad_norm": 1.0948794422531252, "learning_rate": 1.53888303636266e-06, "loss": 0.0126, "step": 3880 }, { "epoch": 6.269789983844911, "grad_norm": 2.1954873631581684, "learning_rate": 1.537708058764948e-06, "loss": 0.0219, "step": 3881 }, { "epoch": 6.27140549273021, "grad_norm": 1.6823276007226045, "learning_rate": 1.536533330647534e-06, "loss": 0.0177, "step": 3882 }, { "epoch": 6.273021001615509, "grad_norm": 1.6301369763800366, "learning_rate": 1.5353588523149766e-06, "loss": 0.0154, "step": 3883 }, { "epoch": 6.274636510500808, "grad_norm": 1.1184206728377442, "learning_rate": 1.534184624071765e-06, "loss": 0.0141, "step": 3884 }, { "epoch": 6.276252019386106, "grad_norm": 1.0352905536193693, "learning_rate": 1.533010646222326e-06, "loss": 0.0113, "step": 3885 }, { "epoch": 6.277867528271406, "grad_norm": 1.2706193070979683, "learning_rate": 1.5318369190710202e-06, "loss": 0.0162, "step": 3886 }, { "epoch": 6.279483037156704, "grad_norm": 1.5999230583843942, "learning_rate": 1.5306634429221446e-06, "loss": 0.0184, "step": 3887 }, { "epoch": 6.281098546042003, "grad_norm": 1.3374180045868658, "learning_rate": 1.5294902180799288e-06, "loss": 0.0158, "step": 3888 }, { "epoch": 6.282714054927302, "grad_norm": 1.2833865766182002, "learning_rate": 1.5283172448485406e-06, "loss": 0.0143, "step": 3889 }, { "epoch": 6.284329563812601, "grad_norm": 1.2185744283538609, "learning_rate": 1.5271445235320803e-06, "loss": 0.0112, "step": 3890 }, { "epoch": 6.2859450726979, "grad_norm": 1.1969832301867285, "learning_rate": 1.5259720544345829e-06, "loss": 0.0128, "step": 3891 }, { "epoch": 6.2875605815831985, "grad_norm": 0.9397899128046385, "learning_rate": 1.5247998378600194e-06, "loss": 0.0109, "step": 3892 }, { "epoch": 6.289176090468498, "grad_norm": 1.4888380368391272, "learning_rate": 1.5236278741122918e-06, "loss": 0.0172, "step": 3893 }, { "epoch": 6.290791599353796, "grad_norm": 1.7102451814150825, "learning_rate": 1.5224561634952423e-06, "loss": 0.0147, "step": 3894 }, { "epoch": 6.292407108239095, "grad_norm": 1.1205693967945387, "learning_rate": 1.521284706312643e-06, "loss": 0.0115, "step": 3895 }, { "epoch": 6.294022617124394, "grad_norm": 1.6554331742602562, "learning_rate": 1.520113502868202e-06, "loss": 0.0156, "step": 3896 }, { "epoch": 6.295638126009693, "grad_norm": 0.8181474326438833, "learning_rate": 1.5189425534655599e-06, "loss": 0.0137, "step": 3897 }, { "epoch": 6.297253634894992, "grad_norm": 1.2954962491193907, "learning_rate": 1.5177718584082945e-06, "loss": 0.0118, "step": 3898 }, { "epoch": 6.298869143780291, "grad_norm": 1.2183519042062285, "learning_rate": 1.5166014179999145e-06, "loss": 0.0159, "step": 3899 }, { "epoch": 6.30048465266559, "grad_norm": 1.2571262096083653, "learning_rate": 1.5154312325438652e-06, "loss": 0.0134, "step": 3900 }, { "epoch": 6.3021001615508885, "grad_norm": 1.711886368113088, "learning_rate": 1.5142613023435238e-06, "loss": 0.0113, "step": 3901 }, { "epoch": 6.303715670436187, "grad_norm": 1.2652678881707686, "learning_rate": 1.5130916277022028e-06, "loss": 0.0159, "step": 3902 }, { "epoch": 6.305331179321486, "grad_norm": 1.1738971574047132, "learning_rate": 1.5119222089231477e-06, "loss": 0.0152, "step": 3903 }, { "epoch": 6.306946688206785, "grad_norm": 1.400040257544045, "learning_rate": 1.5107530463095365e-06, "loss": 0.0146, "step": 3904 }, { "epoch": 6.308562197092084, "grad_norm": 1.203120104342889, "learning_rate": 1.509584140164484e-06, "loss": 0.012, "step": 3905 }, { "epoch": 6.310177705977383, "grad_norm": 1.2924220390360406, "learning_rate": 1.5084154907910353e-06, "loss": 0.0163, "step": 3906 }, { "epoch": 6.311793214862682, "grad_norm": 1.1446774485323263, "learning_rate": 1.5072470984921705e-06, "loss": 0.0159, "step": 3907 }, { "epoch": 6.313408723747981, "grad_norm": 1.4879493911476456, "learning_rate": 1.506078963570803e-06, "loss": 0.0161, "step": 3908 }, { "epoch": 6.315024232633279, "grad_norm": 1.1900947262636832, "learning_rate": 1.5049110863297772e-06, "loss": 0.0145, "step": 3909 }, { "epoch": 6.316639741518578, "grad_norm": 1.325717900374547, "learning_rate": 1.503743467071876e-06, "loss": 0.0157, "step": 3910 }, { "epoch": 6.318255250403877, "grad_norm": 1.2474055443402905, "learning_rate": 1.50257610609981e-06, "loss": 0.0141, "step": 3911 }, { "epoch": 6.319870759289176, "grad_norm": 1.0958452210730865, "learning_rate": 1.5014090037162255e-06, "loss": 0.0116, "step": 3912 }, { "epoch": 6.321486268174475, "grad_norm": 1.2487237550849006, "learning_rate": 1.5002421602237017e-06, "loss": 0.012, "step": 3913 }, { "epoch": 6.323101777059774, "grad_norm": 1.1765560104794637, "learning_rate": 1.4990755759247488e-06, "loss": 0.0136, "step": 3914 }, { "epoch": 6.324717285945073, "grad_norm": 1.323033734322898, "learning_rate": 1.4979092511218113e-06, "loss": 0.017, "step": 3915 }, { "epoch": 6.326332794830371, "grad_norm": 1.600065208976109, "learning_rate": 1.4967431861172683e-06, "loss": 0.0133, "step": 3916 }, { "epoch": 6.3279483037156705, "grad_norm": 1.2912639702243633, "learning_rate": 1.495577381213428e-06, "loss": 0.0139, "step": 3917 }, { "epoch": 6.329563812600969, "grad_norm": 1.1321163071602625, "learning_rate": 1.4944118367125332e-06, "loss": 0.0153, "step": 3918 }, { "epoch": 6.331179321486268, "grad_norm": 0.9004533554893062, "learning_rate": 1.4932465529167584e-06, "loss": 0.0081, "step": 3919 }, { "epoch": 6.332794830371567, "grad_norm": 1.3310865270189285, "learning_rate": 1.4920815301282099e-06, "loss": 0.0175, "step": 3920 }, { "epoch": 6.334410339256866, "grad_norm": 1.0191823133188913, "learning_rate": 1.4909167686489296e-06, "loss": 0.0099, "step": 3921 }, { "epoch": 6.336025848142165, "grad_norm": 1.346919139909777, "learning_rate": 1.4897522687808877e-06, "loss": 0.0206, "step": 3922 }, { "epoch": 6.337641357027463, "grad_norm": 1.7176498051370195, "learning_rate": 1.4885880308259886e-06, "loss": 0.0176, "step": 3923 }, { "epoch": 6.339256865912763, "grad_norm": 1.3103175383185222, "learning_rate": 1.4874240550860686e-06, "loss": 0.0149, "step": 3924 }, { "epoch": 6.340872374798061, "grad_norm": 1.2005981253934364, "learning_rate": 1.4862603418628946e-06, "loss": 0.0122, "step": 3925 }, { "epoch": 6.3424878836833605, "grad_norm": 1.6007827638835221, "learning_rate": 1.4850968914581676e-06, "loss": 0.0183, "step": 3926 }, { "epoch": 6.344103392568659, "grad_norm": 1.8628696716441204, "learning_rate": 1.4839337041735202e-06, "loss": 0.0191, "step": 3927 }, { "epoch": 6.345718901453958, "grad_norm": 0.967055433832531, "learning_rate": 1.4827707803105152e-06, "loss": 0.0128, "step": 3928 }, { "epoch": 6.347334410339257, "grad_norm": 1.3675991811440422, "learning_rate": 1.4816081201706479e-06, "loss": 0.013, "step": 3929 }, { "epoch": 6.348949919224555, "grad_norm": 1.3036228443696511, "learning_rate": 1.4804457240553457e-06, "loss": 0.0159, "step": 3930 }, { "epoch": 6.350565428109855, "grad_norm": 1.287936134966083, "learning_rate": 1.4792835922659668e-06, "loss": 0.0218, "step": 3931 }, { "epoch": 6.352180936995153, "grad_norm": 0.9724170383591842, "learning_rate": 1.4781217251038022e-06, "loss": 0.0092, "step": 3932 }, { "epoch": 6.353796445880453, "grad_norm": 1.1000855516124908, "learning_rate": 1.4769601228700716e-06, "loss": 0.0122, "step": 3933 }, { "epoch": 6.355411954765751, "grad_norm": 1.116202988198876, "learning_rate": 1.4757987858659299e-06, "loss": 0.0126, "step": 3934 }, { "epoch": 6.35702746365105, "grad_norm": 1.752550002191991, "learning_rate": 1.4746377143924594e-06, "loss": 0.0132, "step": 3935 }, { "epoch": 6.358642972536349, "grad_norm": 1.119555294865661, "learning_rate": 1.4734769087506764e-06, "loss": 0.012, "step": 3936 }, { "epoch": 6.360258481421647, "grad_norm": 1.3700690822936838, "learning_rate": 1.4723163692415254e-06, "loss": 0.0164, "step": 3937 }, { "epoch": 6.361873990306947, "grad_norm": 1.2910464246026145, "learning_rate": 1.471156096165886e-06, "loss": 0.0138, "step": 3938 }, { "epoch": 6.363489499192245, "grad_norm": 1.5507592017506793, "learning_rate": 1.4699960898245652e-06, "loss": 0.012, "step": 3939 }, { "epoch": 6.365105008077545, "grad_norm": 1.2427291610157944, "learning_rate": 1.4688363505183017e-06, "loss": 0.0152, "step": 3940 }, { "epoch": 6.366720516962843, "grad_norm": 1.6862278462745826, "learning_rate": 1.467676878547766e-06, "loss": 0.014, "step": 3941 }, { "epoch": 6.3683360258481425, "grad_norm": 1.1642863990488355, "learning_rate": 1.466517674213557e-06, "loss": 0.0146, "step": 3942 }, { "epoch": 6.369951534733441, "grad_norm": 2.4355182999555796, "learning_rate": 1.4653587378162083e-06, "loss": 0.0122, "step": 3943 }, { "epoch": 6.3715670436187395, "grad_norm": 1.7363226854812241, "learning_rate": 1.4642000696561797e-06, "loss": 0.0188, "step": 3944 }, { "epoch": 6.373182552504039, "grad_norm": 1.5781152728680259, "learning_rate": 1.463041670033864e-06, "loss": 0.0211, "step": 3945 }, { "epoch": 6.374798061389337, "grad_norm": 1.314373515824917, "learning_rate": 1.4618835392495834e-06, "loss": 0.0136, "step": 3946 }, { "epoch": 6.376413570274637, "grad_norm": 1.208124077843529, "learning_rate": 1.4607256776035905e-06, "loss": 0.0104, "step": 3947 }, { "epoch": 6.378029079159935, "grad_norm": 1.5186917837770744, "learning_rate": 1.4595680853960674e-06, "loss": 0.0197, "step": 3948 }, { "epoch": 6.379644588045235, "grad_norm": 1.2944247184161919, "learning_rate": 1.4584107629271302e-06, "loss": 0.017, "step": 3949 }, { "epoch": 6.381260096930533, "grad_norm": 1.223150367024821, "learning_rate": 1.4572537104968194e-06, "loss": 0.0124, "step": 3950 }, { "epoch": 6.382875605815832, "grad_norm": 1.6185788247859643, "learning_rate": 1.4560969284051096e-06, "loss": 0.017, "step": 3951 }, { "epoch": 6.384491114701131, "grad_norm": 1.3739978377436584, "learning_rate": 1.4549404169519027e-06, "loss": 0.0138, "step": 3952 }, { "epoch": 6.386106623586429, "grad_norm": 1.096465335188151, "learning_rate": 1.4537841764370319e-06, "loss": 0.0107, "step": 3953 }, { "epoch": 6.387722132471729, "grad_norm": 1.2013121902267134, "learning_rate": 1.4526282071602603e-06, "loss": 0.0169, "step": 3954 }, { "epoch": 6.389337641357027, "grad_norm": 1.3308552592503062, "learning_rate": 1.4514725094212817e-06, "loss": 0.0146, "step": 3955 }, { "epoch": 6.390953150242327, "grad_norm": 1.5068872850638642, "learning_rate": 1.4503170835197156e-06, "loss": 0.0175, "step": 3956 }, { "epoch": 6.392568659127625, "grad_norm": 1.4282260173643226, "learning_rate": 1.4491619297551152e-06, "loss": 0.0142, "step": 3957 }, { "epoch": 6.394184168012924, "grad_norm": 1.4956330125697683, "learning_rate": 1.4480070484269598e-06, "loss": 0.0233, "step": 3958 }, { "epoch": 6.395799676898223, "grad_norm": 1.3577241439482939, "learning_rate": 1.4468524398346612e-06, "loss": 0.0178, "step": 3959 }, { "epoch": 6.3974151857835215, "grad_norm": 1.470465205651135, "learning_rate": 1.4456981042775582e-06, "loss": 0.0148, "step": 3960 }, { "epoch": 6.399030694668821, "grad_norm": 0.8437482442153454, "learning_rate": 1.4445440420549205e-06, "loss": 0.01, "step": 3961 }, { "epoch": 6.400646203554119, "grad_norm": 1.249718497287616, "learning_rate": 1.443390253465945e-06, "loss": 0.0149, "step": 3962 }, { "epoch": 6.402261712439419, "grad_norm": 1.2698617586720087, "learning_rate": 1.4422367388097603e-06, "loss": 0.0118, "step": 3963 }, { "epoch": 6.403877221324717, "grad_norm": 1.440785758235375, "learning_rate": 1.4410834983854194e-06, "loss": 0.0156, "step": 3964 }, { "epoch": 6.405492730210016, "grad_norm": 0.9080842657903334, "learning_rate": 1.4399305324919108e-06, "loss": 0.0096, "step": 3965 }, { "epoch": 6.407108239095315, "grad_norm": 1.4984699365786285, "learning_rate": 1.4387778414281456e-06, "loss": 0.0139, "step": 3966 }, { "epoch": 6.408723747980614, "grad_norm": 1.2439767185497976, "learning_rate": 1.4376254254929685e-06, "loss": 0.0114, "step": 3967 }, { "epoch": 6.410339256865913, "grad_norm": 1.1335549300792558, "learning_rate": 1.4364732849851484e-06, "loss": 0.0138, "step": 3968 }, { "epoch": 6.4119547657512115, "grad_norm": 1.5550784728571545, "learning_rate": 1.435321420203386e-06, "loss": 0.0173, "step": 3969 }, { "epoch": 6.413570274636511, "grad_norm": 1.4805160274475162, "learning_rate": 1.4341698314463098e-06, "loss": 0.0167, "step": 3970 }, { "epoch": 6.415185783521809, "grad_norm": 1.4176967812123178, "learning_rate": 1.4330185190124768e-06, "loss": 0.0133, "step": 3971 }, { "epoch": 6.416801292407108, "grad_norm": 1.2018823611604745, "learning_rate": 1.431867483200371e-06, "loss": 0.011, "step": 3972 }, { "epoch": 6.418416801292407, "grad_norm": 1.704505314305979, "learning_rate": 1.4307167243084064e-06, "loss": 0.0195, "step": 3973 }, { "epoch": 6.420032310177706, "grad_norm": 1.2003167746879102, "learning_rate": 1.4295662426349255e-06, "loss": 0.0107, "step": 3974 }, { "epoch": 6.421647819063005, "grad_norm": 1.2465939685601108, "learning_rate": 1.428416038478196e-06, "loss": 0.011, "step": 3975 }, { "epoch": 6.423263327948304, "grad_norm": 1.204048058272269, "learning_rate": 1.4272661121364161e-06, "loss": 0.0126, "step": 3976 }, { "epoch": 6.424878836833603, "grad_norm": 1.486027101806917, "learning_rate": 1.4261164639077124e-06, "loss": 0.0196, "step": 3977 }, { "epoch": 6.426494345718901, "grad_norm": 1.2281060410856282, "learning_rate": 1.424967094090139e-06, "loss": 0.0182, "step": 3978 }, { "epoch": 6.4281098546042, "grad_norm": 1.3979084575306637, "learning_rate": 1.4238180029816755e-06, "loss": 0.0139, "step": 3979 }, { "epoch": 6.429725363489499, "grad_norm": 1.2860083041780808, "learning_rate": 1.4226691908802326e-06, "loss": 0.0104, "step": 3980 }, { "epoch": 6.431340872374798, "grad_norm": 0.8049973762035696, "learning_rate": 1.421520658083646e-06, "loss": 0.0105, "step": 3981 }, { "epoch": 6.432956381260097, "grad_norm": 1.227297686397315, "learning_rate": 1.4203724048896817e-06, "loss": 0.0151, "step": 3982 }, { "epoch": 6.434571890145396, "grad_norm": 1.044359216232768, "learning_rate": 1.4192244315960297e-06, "loss": 0.0126, "step": 3983 }, { "epoch": 6.436187399030695, "grad_norm": 1.4310752417411983, "learning_rate": 1.4180767385003112e-06, "loss": 0.0207, "step": 3984 }, { "epoch": 6.4378029079159935, "grad_norm": 1.0662272182508856, "learning_rate": 1.416929325900071e-06, "loss": 0.0128, "step": 3985 }, { "epoch": 6.439418416801292, "grad_norm": 1.5435124331482077, "learning_rate": 1.415782194092784e-06, "loss": 0.0171, "step": 3986 }, { "epoch": 6.441033925686591, "grad_norm": 1.439623724032713, "learning_rate": 1.414635343375852e-06, "loss": 0.0175, "step": 3987 }, { "epoch": 6.44264943457189, "grad_norm": 1.7027894316559762, "learning_rate": 1.4134887740466036e-06, "loss": 0.0163, "step": 3988 }, { "epoch": 6.444264943457189, "grad_norm": 1.4310414189201703, "learning_rate": 1.4123424864022926e-06, "loss": 0.0148, "step": 3989 }, { "epoch": 6.445880452342488, "grad_norm": 1.5228491211535127, "learning_rate": 1.411196480740103e-06, "loss": 0.0134, "step": 3990 }, { "epoch": 6.447495961227787, "grad_norm": 1.2877190426187546, "learning_rate": 1.4100507573571411e-06, "loss": 0.0156, "step": 3991 }, { "epoch": 6.449111470113086, "grad_norm": 2.0318537580451776, "learning_rate": 1.4089053165504473e-06, "loss": 0.0205, "step": 3992 }, { "epoch": 6.450726978998384, "grad_norm": 1.2704197777917798, "learning_rate": 1.4077601586169815e-06, "loss": 0.0139, "step": 3993 }, { "epoch": 6.4523424878836835, "grad_norm": 1.2339691632032064, "learning_rate": 1.4066152838536346e-06, "loss": 0.0122, "step": 3994 }, { "epoch": 6.453957996768982, "grad_norm": 1.8423476975262099, "learning_rate": 1.405470692557221e-06, "loss": 0.0133, "step": 3995 }, { "epoch": 6.455573505654281, "grad_norm": 1.116973137839592, "learning_rate": 1.4043263850244853e-06, "loss": 0.015, "step": 3996 }, { "epoch": 6.45718901453958, "grad_norm": 1.4988808983165345, "learning_rate": 1.4031823615520934e-06, "loss": 0.0133, "step": 3997 }, { "epoch": 6.458804523424879, "grad_norm": 0.9515594995854075, "learning_rate": 1.4020386224366444e-06, "loss": 0.0113, "step": 3998 }, { "epoch": 6.460420032310178, "grad_norm": 1.1422345586086677, "learning_rate": 1.4008951679746575e-06, "loss": 0.0116, "step": 3999 }, { "epoch": 6.462035541195476, "grad_norm": 1.1364658800615983, "learning_rate": 1.3997519984625823e-06, "loss": 0.011, "step": 4000 }, { "epoch": 6.462035541195476, "eval_loss": 1.6083439588546753, "eval_runtime": 2.3465, "eval_samples_per_second": 127.851, "eval_steps_per_second": 2.983, "step": 4000 }, { "epoch": 6.463651050080776, "grad_norm": 1.2559143169714853, "learning_rate": 1.3986091141967906e-06, "loss": 0.0175, "step": 4001 }, { "epoch": 6.465266558966074, "grad_norm": 1.2275820374716555, "learning_rate": 1.3974665154735833e-06, "loss": 0.0137, "step": 4002 }, { "epoch": 6.466882067851373, "grad_norm": 1.6204815661874779, "learning_rate": 1.3963242025891868e-06, "loss": 0.0236, "step": 4003 }, { "epoch": 6.468497576736672, "grad_norm": 1.5156190445581834, "learning_rate": 1.3951821758397538e-06, "loss": 0.0192, "step": 4004 }, { "epoch": 6.470113085621971, "grad_norm": 0.9198059555421682, "learning_rate": 1.39404043552136e-06, "loss": 0.0095, "step": 4005 }, { "epoch": 6.47172859450727, "grad_norm": 1.9575115926390563, "learning_rate": 1.3928989819300092e-06, "loss": 0.0184, "step": 4006 }, { "epoch": 6.473344103392568, "grad_norm": 1.3272502350273174, "learning_rate": 1.3917578153616328e-06, "loss": 0.0179, "step": 4007 }, { "epoch": 6.474959612277868, "grad_norm": 1.1892969717395923, "learning_rate": 1.3906169361120814e-06, "loss": 0.011, "step": 4008 }, { "epoch": 6.476575121163166, "grad_norm": 1.4727548018859227, "learning_rate": 1.3894763444771397e-06, "loss": 0.0152, "step": 4009 }, { "epoch": 6.4781906300484655, "grad_norm": 2.0012675763759353, "learning_rate": 1.3883360407525098e-06, "loss": 0.0113, "step": 4010 }, { "epoch": 6.479806138933764, "grad_norm": 1.3340211742186228, "learning_rate": 1.3871960252338257e-06, "loss": 0.0151, "step": 4011 }, { "epoch": 6.481421647819063, "grad_norm": 1.9308830144219553, "learning_rate": 1.3860562982166404e-06, "loss": 0.0159, "step": 4012 }, { "epoch": 6.483037156704362, "grad_norm": 1.3148495027359006, "learning_rate": 1.3849168599964374e-06, "loss": 0.012, "step": 4013 }, { "epoch": 6.48465266558966, "grad_norm": 1.3581099406350603, "learning_rate": 1.383777710868623e-06, "loss": 0.0112, "step": 4014 }, { "epoch": 6.48626817447496, "grad_norm": 0.9066216017681507, "learning_rate": 1.3826388511285293e-06, "loss": 0.0099, "step": 4015 }, { "epoch": 6.487883683360258, "grad_norm": 1.7889225121238053, "learning_rate": 1.3815002810714115e-06, "loss": 0.0217, "step": 4016 }, { "epoch": 6.489499192245558, "grad_norm": 1.468877012905173, "learning_rate": 1.3803620009924528e-06, "loss": 0.0162, "step": 4017 }, { "epoch": 6.491114701130856, "grad_norm": 1.6289316817817163, "learning_rate": 1.3792240111867572e-06, "loss": 0.0133, "step": 4018 }, { "epoch": 6.4927302100161555, "grad_norm": 1.1976027914965361, "learning_rate": 1.3780863119493592e-06, "loss": 0.0106, "step": 4019 }, { "epoch": 6.494345718901454, "grad_norm": 1.82459048371994, "learning_rate": 1.3769489035752115e-06, "loss": 0.0217, "step": 4020 }, { "epoch": 6.4959612277867524, "grad_norm": 1.3758872052085234, "learning_rate": 1.3758117863591968e-06, "loss": 0.016, "step": 4021 }, { "epoch": 6.497576736672052, "grad_norm": 1.7637275107385408, "learning_rate": 1.3746749605961174e-06, "loss": 0.0257, "step": 4022 }, { "epoch": 6.49919224555735, "grad_norm": 1.4660364516807616, "learning_rate": 1.3735384265807056e-06, "loss": 0.0185, "step": 4023 }, { "epoch": 6.50080775444265, "grad_norm": 1.3223564696216752, "learning_rate": 1.3724021846076107e-06, "loss": 0.0155, "step": 4024 }, { "epoch": 6.502423263327948, "grad_norm": 1.7287667511904787, "learning_rate": 1.3712662349714162e-06, "loss": 0.0169, "step": 4025 }, { "epoch": 6.5040387722132476, "grad_norm": 1.6865325518955416, "learning_rate": 1.3701305779666199e-06, "loss": 0.0134, "step": 4026 }, { "epoch": 6.505654281098546, "grad_norm": 1.0476327462250403, "learning_rate": 1.3689952138876508e-06, "loss": 0.0106, "step": 4027 }, { "epoch": 6.5072697899838445, "grad_norm": 1.602853848148325, "learning_rate": 1.3678601430288574e-06, "loss": 0.013, "step": 4028 }, { "epoch": 6.508885298869144, "grad_norm": 1.1586661483130212, "learning_rate": 1.3667253656845142e-06, "loss": 0.0151, "step": 4029 }, { "epoch": 6.510500807754442, "grad_norm": 1.7328203148672636, "learning_rate": 1.3655908821488204e-06, "loss": 0.0182, "step": 4030 }, { "epoch": 6.512116316639742, "grad_norm": 1.3720812529365176, "learning_rate": 1.3644566927158983e-06, "loss": 0.0119, "step": 4031 }, { "epoch": 6.51373182552504, "grad_norm": 1.2528657945486974, "learning_rate": 1.3633227976797926e-06, "loss": 0.0143, "step": 4032 }, { "epoch": 6.51534733441034, "grad_norm": 1.393451954362037, "learning_rate": 1.3621891973344725e-06, "loss": 0.0188, "step": 4033 }, { "epoch": 6.516962843295638, "grad_norm": 1.3821836594076824, "learning_rate": 1.361055891973833e-06, "loss": 0.0122, "step": 4034 }, { "epoch": 6.518578352180937, "grad_norm": 0.7955036912442786, "learning_rate": 1.359922881891689e-06, "loss": 0.01, "step": 4035 }, { "epoch": 6.520193861066236, "grad_norm": 1.4446031367675265, "learning_rate": 1.358790167381781e-06, "loss": 0.0128, "step": 4036 }, { "epoch": 6.5218093699515345, "grad_norm": 1.226229114224885, "learning_rate": 1.3576577487377722e-06, "loss": 0.0147, "step": 4037 }, { "epoch": 6.523424878836834, "grad_norm": 1.1223027795912446, "learning_rate": 1.356525626253251e-06, "loss": 0.0134, "step": 4038 }, { "epoch": 6.525040387722132, "grad_norm": 1.6414505467743574, "learning_rate": 1.3553938002217248e-06, "loss": 0.0132, "step": 4039 }, { "epoch": 6.526655896607432, "grad_norm": 1.3525125844789625, "learning_rate": 1.3542622709366283e-06, "loss": 0.0129, "step": 4040 }, { "epoch": 6.52827140549273, "grad_norm": 1.2873864538367124, "learning_rate": 1.353131038691317e-06, "loss": 0.0131, "step": 4041 }, { "epoch": 6.529886914378029, "grad_norm": 1.3432878849786702, "learning_rate": 1.3520001037790712e-06, "loss": 0.0148, "step": 4042 }, { "epoch": 6.531502423263328, "grad_norm": 1.2792758474517512, "learning_rate": 1.350869466493091e-06, "loss": 0.0124, "step": 4043 }, { "epoch": 6.533117932148627, "grad_norm": 1.4563193525419746, "learning_rate": 1.3497391271265037e-06, "loss": 0.0105, "step": 4044 }, { "epoch": 6.534733441033926, "grad_norm": 0.951239298164916, "learning_rate": 1.3486090859723544e-06, "loss": 0.0141, "step": 4045 }, { "epoch": 6.536348949919224, "grad_norm": 1.0347947860535205, "learning_rate": 1.347479343323615e-06, "loss": 0.0094, "step": 4046 }, { "epoch": 6.537964458804524, "grad_norm": 1.2776125176980184, "learning_rate": 1.3463498994731778e-06, "loss": 0.0141, "step": 4047 }, { "epoch": 6.539579967689822, "grad_norm": 2.126148096091444, "learning_rate": 1.3452207547138596e-06, "loss": 0.0223, "step": 4048 }, { "epoch": 6.541195476575121, "grad_norm": 1.5912519301647066, "learning_rate": 1.3440919093383966e-06, "loss": 0.0103, "step": 4049 }, { "epoch": 6.54281098546042, "grad_norm": 1.069160998311333, "learning_rate": 1.3429633636394509e-06, "loss": 0.0127, "step": 4050 }, { "epoch": 6.544426494345719, "grad_norm": 1.2986945893119666, "learning_rate": 1.3418351179096022e-06, "loss": 0.013, "step": 4051 }, { "epoch": 6.546042003231018, "grad_norm": 1.6046705499729352, "learning_rate": 1.3407071724413593e-06, "loss": 0.0122, "step": 4052 }, { "epoch": 6.5476575121163165, "grad_norm": 2.2571406907930607, "learning_rate": 1.339579527527146e-06, "loss": 0.0194, "step": 4053 }, { "epoch": 6.549273021001616, "grad_norm": 1.42822890243243, "learning_rate": 1.3384521834593138e-06, "loss": 0.0117, "step": 4054 }, { "epoch": 6.550888529886914, "grad_norm": 1.147411685978251, "learning_rate": 1.3373251405301321e-06, "loss": 0.0095, "step": 4055 }, { "epoch": 6.552504038772213, "grad_norm": 1.217039406756574, "learning_rate": 1.3361983990317953e-06, "loss": 0.0134, "step": 4056 }, { "epoch": 6.554119547657512, "grad_norm": 1.331064783735414, "learning_rate": 1.3350719592564155e-06, "loss": 0.0116, "step": 4057 }, { "epoch": 6.555735056542811, "grad_norm": 1.2469027374857358, "learning_rate": 1.3339458214960332e-06, "loss": 0.0159, "step": 4058 }, { "epoch": 6.55735056542811, "grad_norm": 1.4090716240625556, "learning_rate": 1.3328199860426045e-06, "loss": 0.0182, "step": 4059 }, { "epoch": 6.558966074313409, "grad_norm": 1.3062875663606437, "learning_rate": 1.3316944531880102e-06, "loss": 0.0149, "step": 4060 }, { "epoch": 6.560581583198708, "grad_norm": 1.5898870231434448, "learning_rate": 1.3305692232240509e-06, "loss": 0.0214, "step": 4061 }, { "epoch": 6.5621970920840065, "grad_norm": 1.3481867942159929, "learning_rate": 1.32944429644245e-06, "loss": 0.0137, "step": 4062 }, { "epoch": 6.563812600969305, "grad_norm": 1.3488518128390796, "learning_rate": 1.3283196731348525e-06, "loss": 0.0149, "step": 4063 }, { "epoch": 6.565428109854604, "grad_norm": 1.1438575351772406, "learning_rate": 1.3271953535928245e-06, "loss": 0.0148, "step": 4064 }, { "epoch": 6.567043618739903, "grad_norm": 1.1677898877921744, "learning_rate": 1.3260713381078511e-06, "loss": 0.0102, "step": 4065 }, { "epoch": 6.568659127625202, "grad_norm": 2.972204751178143, "learning_rate": 1.3249476269713418e-06, "loss": 0.016, "step": 4066 }, { "epoch": 6.570274636510501, "grad_norm": 1.8996328452588185, "learning_rate": 1.3238242204746266e-06, "loss": 0.0251, "step": 4067 }, { "epoch": 6.5718901453958, "grad_norm": 1.0083587904044902, "learning_rate": 1.3227011189089528e-06, "loss": 0.0122, "step": 4068 }, { "epoch": 6.573505654281099, "grad_norm": 1.457831439146694, "learning_rate": 1.3215783225654958e-06, "loss": 0.0096, "step": 4069 }, { "epoch": 6.575121163166397, "grad_norm": 1.377290114247536, "learning_rate": 1.3204558317353443e-06, "loss": 0.0123, "step": 4070 }, { "epoch": 6.576736672051696, "grad_norm": 1.1310124760409799, "learning_rate": 1.3193336467095136e-06, "loss": 0.01, "step": 4071 }, { "epoch": 6.578352180936995, "grad_norm": 1.339062135947998, "learning_rate": 1.318211767778935e-06, "loss": 0.0119, "step": 4072 }, { "epoch": 6.579967689822294, "grad_norm": 1.4939823377950971, "learning_rate": 1.317090195234464e-06, "loss": 0.0227, "step": 4073 }, { "epoch": 6.581583198707593, "grad_norm": 1.600953149732839, "learning_rate": 1.315968929366875e-06, "loss": 0.0158, "step": 4074 }, { "epoch": 6.583198707592892, "grad_norm": 1.1894693452127896, "learning_rate": 1.314847970466865e-06, "loss": 0.0133, "step": 4075 }, { "epoch": 6.584814216478191, "grad_norm": 1.5807385736662738, "learning_rate": 1.3137273188250465e-06, "loss": 0.0224, "step": 4076 }, { "epoch": 6.586429725363489, "grad_norm": 1.3998506978919727, "learning_rate": 1.3126069747319591e-06, "loss": 0.0137, "step": 4077 }, { "epoch": 6.5880452342487885, "grad_norm": 1.3224294945999737, "learning_rate": 1.311486938478055e-06, "loss": 0.0113, "step": 4078 }, { "epoch": 6.589660743134087, "grad_norm": 1.7065239944280548, "learning_rate": 1.310367210353715e-06, "loss": 0.0178, "step": 4079 }, { "epoch": 6.591276252019386, "grad_norm": 1.1881902990804794, "learning_rate": 1.309247790649233e-06, "loss": 0.0164, "step": 4080 }, { "epoch": 6.592891760904685, "grad_norm": 1.2315196036437217, "learning_rate": 1.3081286796548276e-06, "loss": 0.0161, "step": 4081 }, { "epoch": 6.594507269789984, "grad_norm": 1.3703909566225074, "learning_rate": 1.3070098776606333e-06, "loss": 0.0147, "step": 4082 }, { "epoch": 6.596122778675283, "grad_norm": 1.1312061034078793, "learning_rate": 1.3058913849567082e-06, "loss": 0.0134, "step": 4083 }, { "epoch": 6.597738287560581, "grad_norm": 1.2455374542290214, "learning_rate": 1.3047732018330265e-06, "loss": 0.0132, "step": 4084 }, { "epoch": 6.599353796445881, "grad_norm": 1.850799690065442, "learning_rate": 1.3036553285794875e-06, "loss": 0.0167, "step": 4085 }, { "epoch": 6.600969305331179, "grad_norm": 1.5014764075065379, "learning_rate": 1.3025377654859046e-06, "loss": 0.0174, "step": 4086 }, { "epoch": 6.6025848142164785, "grad_norm": 1.114290771765229, "learning_rate": 1.3014205128420145e-06, "loss": 0.0136, "step": 4087 }, { "epoch": 6.604200323101777, "grad_norm": 1.3682025573272794, "learning_rate": 1.3003035709374705e-06, "loss": 0.0172, "step": 4088 }, { "epoch": 6.605815831987076, "grad_norm": 1.3975581775797066, "learning_rate": 1.2991869400618479e-06, "loss": 0.0189, "step": 4089 }, { "epoch": 6.607431340872375, "grad_norm": 1.8275332947576526, "learning_rate": 1.29807062050464e-06, "loss": 0.0178, "step": 4090 }, { "epoch": 6.609046849757673, "grad_norm": 1.4640140431569821, "learning_rate": 1.2969546125552612e-06, "loss": 0.0245, "step": 4091 }, { "epoch": 6.610662358642973, "grad_norm": 1.2975329248187222, "learning_rate": 1.2958389165030413e-06, "loss": 0.0204, "step": 4092 }, { "epoch": 6.612277867528271, "grad_norm": 2.0155208738398573, "learning_rate": 1.2947235326372325e-06, "loss": 0.0198, "step": 4093 }, { "epoch": 6.613893376413571, "grad_norm": 1.482442895114955, "learning_rate": 1.2936084612470067e-06, "loss": 0.0187, "step": 4094 }, { "epoch": 6.615508885298869, "grad_norm": 1.2282619484178448, "learning_rate": 1.2924937026214506e-06, "loss": 0.0123, "step": 4095 }, { "epoch": 6.617124394184168, "grad_norm": 1.0745930966386879, "learning_rate": 1.2913792570495737e-06, "loss": 0.0112, "step": 4096 }, { "epoch": 6.618739903069467, "grad_norm": 1.9486309787412985, "learning_rate": 1.2902651248203036e-06, "loss": 0.0203, "step": 4097 }, { "epoch": 6.620355411954765, "grad_norm": 1.4361049146446967, "learning_rate": 1.2891513062224862e-06, "loss": 0.0156, "step": 4098 }, { "epoch": 6.621970920840065, "grad_norm": 1.3789002583364138, "learning_rate": 1.2880378015448847e-06, "loss": 0.0175, "step": 4099 }, { "epoch": 6.623586429725363, "grad_norm": 1.7250922010063239, "learning_rate": 1.286924611076183e-06, "loss": 0.0156, "step": 4100 }, { "epoch": 6.625201938610663, "grad_norm": 1.2054057706701158, "learning_rate": 1.2858117351049831e-06, "loss": 0.0155, "step": 4101 }, { "epoch": 6.626817447495961, "grad_norm": 1.19192453376194, "learning_rate": 1.2846991739198056e-06, "loss": 0.0141, "step": 4102 }, { "epoch": 6.6284329563812605, "grad_norm": 1.2696842849633219, "learning_rate": 1.283586927809088e-06, "loss": 0.017, "step": 4103 }, { "epoch": 6.630048465266559, "grad_norm": 1.148701265201184, "learning_rate": 1.282474997061188e-06, "loss": 0.0141, "step": 4104 }, { "epoch": 6.6316639741518575, "grad_norm": 1.1459670196274383, "learning_rate": 1.2813633819643796e-06, "loss": 0.0137, "step": 4105 }, { "epoch": 6.633279483037157, "grad_norm": 1.4606810893397317, "learning_rate": 1.2802520828068566e-06, "loss": 0.013, "step": 4106 }, { "epoch": 6.634894991922455, "grad_norm": 1.3809670893751531, "learning_rate": 1.2791410998767306e-06, "loss": 0.0155, "step": 4107 }, { "epoch": 6.636510500807755, "grad_norm": 1.3609322961448356, "learning_rate": 1.2780304334620322e-06, "loss": 0.015, "step": 4108 }, { "epoch": 6.638126009693053, "grad_norm": 1.006729055153144, "learning_rate": 1.2769200838507062e-06, "loss": 0.0113, "step": 4109 }, { "epoch": 6.639741518578353, "grad_norm": 2.552162229232247, "learning_rate": 1.2758100513306201e-06, "loss": 0.0136, "step": 4110 }, { "epoch": 6.641357027463651, "grad_norm": 0.8778790079784735, "learning_rate": 1.274700336189554e-06, "loss": 0.0115, "step": 4111 }, { "epoch": 6.64297253634895, "grad_norm": 1.0412968548450827, "learning_rate": 1.2735909387152123e-06, "loss": 0.012, "step": 4112 }, { "epoch": 6.644588045234249, "grad_norm": 1.3631274410140817, "learning_rate": 1.2724818591952104e-06, "loss": 0.0132, "step": 4113 }, { "epoch": 6.646203554119547, "grad_norm": 1.4765880497039883, "learning_rate": 1.2713730979170863e-06, "loss": 0.0152, "step": 4114 }, { "epoch": 6.647819063004847, "grad_norm": 1.7316460499019766, "learning_rate": 1.2702646551682912e-06, "loss": 0.0237, "step": 4115 }, { "epoch": 6.649434571890145, "grad_norm": 1.298269696155927, "learning_rate": 1.2691565312361986e-06, "loss": 0.0214, "step": 4116 }, { "epoch": 6.651050080775445, "grad_norm": 1.206834006665414, "learning_rate": 1.268048726408093e-06, "loss": 0.0166, "step": 4117 }, { "epoch": 6.652665589660743, "grad_norm": 1.3674651963327367, "learning_rate": 1.266941240971184e-06, "loss": 0.0145, "step": 4118 }, { "epoch": 6.654281098546042, "grad_norm": 1.3974986592224945, "learning_rate": 1.265834075212591e-06, "loss": 0.0207, "step": 4119 }, { "epoch": 6.655896607431341, "grad_norm": 1.7650579544034262, "learning_rate": 1.2647272294193564e-06, "loss": 0.0169, "step": 4120 }, { "epoch": 6.6575121163166395, "grad_norm": 1.3235988362745341, "learning_rate": 1.2636207038784342e-06, "loss": 0.0118, "step": 4121 }, { "epoch": 6.659127625201939, "grad_norm": 1.39139800449024, "learning_rate": 1.2625144988767e-06, "loss": 0.0191, "step": 4122 }, { "epoch": 6.660743134087237, "grad_norm": 2.0647158171081026, "learning_rate": 1.261408614700944e-06, "loss": 0.02, "step": 4123 }, { "epoch": 6.662358642972537, "grad_norm": 1.2097989295660603, "learning_rate": 1.2603030516378736e-06, "loss": 0.0112, "step": 4124 }, { "epoch": 6.663974151857835, "grad_norm": 1.350555455563874, "learning_rate": 1.2591978099741143e-06, "loss": 0.0203, "step": 4125 }, { "epoch": 6.665589660743134, "grad_norm": 1.3520818247734836, "learning_rate": 1.258092889996205e-06, "loss": 0.0135, "step": 4126 }, { "epoch": 6.667205169628433, "grad_norm": 0.9367478686937328, "learning_rate": 1.256988291990604e-06, "loss": 0.013, "step": 4127 }, { "epoch": 6.668820678513732, "grad_norm": 2.4183748384995662, "learning_rate": 1.2558840162436858e-06, "loss": 0.0284, "step": 4128 }, { "epoch": 6.670436187399031, "grad_norm": 1.2828930574930197, "learning_rate": 1.2547800630417414e-06, "loss": 0.0148, "step": 4129 }, { "epoch": 6.6720516962843295, "grad_norm": 1.224423531935882, "learning_rate": 1.253676432670976e-06, "loss": 0.0135, "step": 4130 }, { "epoch": 6.673667205169629, "grad_norm": 1.2376662648166996, "learning_rate": 1.2525731254175147e-06, "loss": 0.0122, "step": 4131 }, { "epoch": 6.675282714054927, "grad_norm": 1.3489060517559295, "learning_rate": 1.2514701415673946e-06, "loss": 0.0146, "step": 4132 }, { "epoch": 6.676898222940226, "grad_norm": 2.0959064959162212, "learning_rate": 1.250367481406573e-06, "loss": 0.0173, "step": 4133 }, { "epoch": 6.678513731825525, "grad_norm": 0.9593193590145652, "learning_rate": 1.2492651452209212e-06, "loss": 0.0078, "step": 4134 }, { "epoch": 6.680129240710824, "grad_norm": 1.2253858273982416, "learning_rate": 1.2481631332962275e-06, "loss": 0.018, "step": 4135 }, { "epoch": 6.681744749596123, "grad_norm": 1.406244812573593, "learning_rate": 1.2470614459181938e-06, "loss": 0.0132, "step": 4136 }, { "epoch": 6.683360258481422, "grad_norm": 1.2890681465809004, "learning_rate": 1.2459600833724414e-06, "loss": 0.015, "step": 4137 }, { "epoch": 6.684975767366721, "grad_norm": 1.1032927077491084, "learning_rate": 1.2448590459445028e-06, "loss": 0.0101, "step": 4138 }, { "epoch": 6.686591276252019, "grad_norm": 1.2344793699819927, "learning_rate": 1.243758333919833e-06, "loss": 0.0109, "step": 4139 }, { "epoch": 6.688206785137318, "grad_norm": 1.1709042041679683, "learning_rate": 1.2426579475837952e-06, "loss": 0.0133, "step": 4140 }, { "epoch": 6.689822294022617, "grad_norm": 1.4167671980236294, "learning_rate": 1.2415578872216737e-06, "loss": 0.0138, "step": 4141 }, { "epoch": 6.691437802907916, "grad_norm": 1.2134685572656663, "learning_rate": 1.2404581531186644e-06, "loss": 0.0136, "step": 4142 }, { "epoch": 6.693053311793215, "grad_norm": 1.1249420712903246, "learning_rate": 1.2393587455598821e-06, "loss": 0.0113, "step": 4143 }, { "epoch": 6.694668820678514, "grad_norm": 1.0962164957353937, "learning_rate": 1.2382596648303522e-06, "loss": 0.0124, "step": 4144 }, { "epoch": 6.696284329563813, "grad_norm": 1.3493739387157757, "learning_rate": 1.2371609112150225e-06, "loss": 0.0151, "step": 4145 }, { "epoch": 6.6978998384491115, "grad_norm": 1.1686416003785036, "learning_rate": 1.2360624849987484e-06, "loss": 0.0141, "step": 4146 }, { "epoch": 6.69951534733441, "grad_norm": 1.4673601481421485, "learning_rate": 1.2349643864663063e-06, "loss": 0.0204, "step": 4147 }, { "epoch": 6.701130856219709, "grad_norm": 1.2734893215565015, "learning_rate": 1.2338666159023835e-06, "loss": 0.0109, "step": 4148 }, { "epoch": 6.702746365105008, "grad_norm": 1.4027587145403304, "learning_rate": 1.2327691735915845e-06, "loss": 0.0178, "step": 4149 }, { "epoch": 6.704361873990307, "grad_norm": 1.126588943976932, "learning_rate": 1.2316720598184282e-06, "loss": 0.0118, "step": 4150 }, { "epoch": 6.705977382875606, "grad_norm": 1.1651327905010465, "learning_rate": 1.2305752748673495e-06, "loss": 0.0129, "step": 4151 }, { "epoch": 6.707592891760905, "grad_norm": 1.106327326215751, "learning_rate": 1.2294788190226947e-06, "loss": 0.0108, "step": 4152 }, { "epoch": 6.709208400646204, "grad_norm": 1.152951951744576, "learning_rate": 1.2283826925687285e-06, "loss": 0.0119, "step": 4153 }, { "epoch": 6.710823909531502, "grad_norm": 1.4241062292247855, "learning_rate": 1.2272868957896286e-06, "loss": 0.019, "step": 4154 }, { "epoch": 6.7124394184168015, "grad_norm": 1.5486897370266637, "learning_rate": 1.2261914289694862e-06, "loss": 0.0186, "step": 4155 }, { "epoch": 6.7140549273021, "grad_norm": 1.5296236384708737, "learning_rate": 1.2250962923923088e-06, "loss": 0.015, "step": 4156 }, { "epoch": 6.715670436187399, "grad_norm": 1.2370890649574386, "learning_rate": 1.2240014863420174e-06, "loss": 0.02, "step": 4157 }, { "epoch": 6.717285945072698, "grad_norm": 1.4279262442790823, "learning_rate": 1.2229070111024485e-06, "loss": 0.0128, "step": 4158 }, { "epoch": 6.718901453957997, "grad_norm": 1.0694097786968184, "learning_rate": 1.2218128669573494e-06, "loss": 0.0124, "step": 4159 }, { "epoch": 6.720516962843296, "grad_norm": 1.1408423423449492, "learning_rate": 1.2207190541903858e-06, "loss": 0.013, "step": 4160 }, { "epoch": 6.722132471728594, "grad_norm": 1.4603938811692336, "learning_rate": 1.2196255730851345e-06, "loss": 0.0237, "step": 4161 }, { "epoch": 6.723747980613894, "grad_norm": 1.3880371758166716, "learning_rate": 1.2185324239250892e-06, "loss": 0.0121, "step": 4162 }, { "epoch": 6.725363489499192, "grad_norm": 1.1490536070733974, "learning_rate": 1.2174396069936534e-06, "loss": 0.0114, "step": 4163 }, { "epoch": 6.726978998384491, "grad_norm": 1.425810168174086, "learning_rate": 1.2163471225741492e-06, "loss": 0.0175, "step": 4164 }, { "epoch": 6.72859450726979, "grad_norm": 1.4462513122753997, "learning_rate": 1.2152549709498076e-06, "loss": 0.021, "step": 4165 }, { "epoch": 6.730210016155089, "grad_norm": 1.3310276080951156, "learning_rate": 1.2141631524037775e-06, "loss": 0.0105, "step": 4166 }, { "epoch": 6.731825525040388, "grad_norm": 1.3206882971252685, "learning_rate": 1.2130716672191193e-06, "loss": 0.0132, "step": 4167 }, { "epoch": 6.733441033925686, "grad_norm": 2.383330111722514, "learning_rate": 1.2119805156788083e-06, "loss": 0.0234, "step": 4168 }, { "epoch": 6.735056542810986, "grad_norm": 1.1947333662556272, "learning_rate": 1.210889698065731e-06, "loss": 0.0128, "step": 4169 }, { "epoch": 6.736672051696284, "grad_norm": 1.1854198753990115, "learning_rate": 1.2097992146626908e-06, "loss": 0.0114, "step": 4170 }, { "epoch": 6.7382875605815835, "grad_norm": 1.2862246041182404, "learning_rate": 1.2087090657523992e-06, "loss": 0.0149, "step": 4171 }, { "epoch": 6.739903069466882, "grad_norm": 1.8597185608473032, "learning_rate": 1.207619251617488e-06, "loss": 0.0187, "step": 4172 }, { "epoch": 6.741518578352181, "grad_norm": 1.5744314387562017, "learning_rate": 1.2065297725404955e-06, "loss": 0.0151, "step": 4173 }, { "epoch": 6.74313408723748, "grad_norm": 1.272848863378519, "learning_rate": 1.2054406288038789e-06, "loss": 0.0135, "step": 4174 }, { "epoch": 6.744749596122778, "grad_norm": 1.3869903239862809, "learning_rate": 1.2043518206900026e-06, "loss": 0.0135, "step": 4175 }, { "epoch": 6.746365105008078, "grad_norm": 1.2892375123524542, "learning_rate": 1.2032633484811492e-06, "loss": 0.0128, "step": 4176 }, { "epoch": 6.747980613893376, "grad_norm": 1.1145032727206703, "learning_rate": 1.2021752124595098e-06, "loss": 0.0182, "step": 4177 }, { "epoch": 6.749596122778676, "grad_norm": 1.8658647779166369, "learning_rate": 1.2010874129071936e-06, "loss": 0.0149, "step": 4178 }, { "epoch": 6.751211631663974, "grad_norm": 1.3557882586614365, "learning_rate": 1.199999950106217e-06, "loss": 0.0147, "step": 4179 }, { "epoch": 6.7528271405492735, "grad_norm": 1.1367737393131554, "learning_rate": 1.198912824338514e-06, "loss": 0.0142, "step": 4180 }, { "epoch": 6.754442649434572, "grad_norm": 1.2255408749076222, "learning_rate": 1.1978260358859263e-06, "loss": 0.0132, "step": 4181 }, { "epoch": 6.75605815831987, "grad_norm": 1.5825746134282328, "learning_rate": 1.1967395850302116e-06, "loss": 0.0167, "step": 4182 }, { "epoch": 6.75767366720517, "grad_norm": 0.9716694127601899, "learning_rate": 1.1956534720530398e-06, "loss": 0.0126, "step": 4183 }, { "epoch": 6.759289176090468, "grad_norm": 1.6073272796752838, "learning_rate": 1.194567697235992e-06, "loss": 0.0171, "step": 4184 }, { "epoch": 6.760904684975768, "grad_norm": 1.4271001419819278, "learning_rate": 1.1934822608605638e-06, "loss": 0.0112, "step": 4185 }, { "epoch": 6.762520193861066, "grad_norm": 1.423837070104379, "learning_rate": 1.1923971632081588e-06, "loss": 0.0212, "step": 4186 }, { "epoch": 6.7641357027463656, "grad_norm": 1.816632358327087, "learning_rate": 1.1913124045600972e-06, "loss": 0.0202, "step": 4187 }, { "epoch": 6.765751211631664, "grad_norm": 1.1131074768063598, "learning_rate": 1.190227985197609e-06, "loss": 0.0126, "step": 4188 }, { "epoch": 6.7673667205169625, "grad_norm": 1.427694620141325, "learning_rate": 1.1891439054018378e-06, "loss": 0.0189, "step": 4189 }, { "epoch": 6.768982229402262, "grad_norm": 1.153455858239662, "learning_rate": 1.1880601654538366e-06, "loss": 0.0169, "step": 4190 }, { "epoch": 6.77059773828756, "grad_norm": 1.4721510137192861, "learning_rate": 1.1869767656345735e-06, "loss": 0.0151, "step": 4191 }, { "epoch": 6.77221324717286, "grad_norm": 1.1480902802974846, "learning_rate": 1.1858937062249251e-06, "loss": 0.0113, "step": 4192 }, { "epoch": 6.773828756058158, "grad_norm": 1.0701979166509648, "learning_rate": 1.1848109875056823e-06, "loss": 0.0119, "step": 4193 }, { "epoch": 6.775444264943458, "grad_norm": 3.0427964917785464, "learning_rate": 1.1837286097575468e-06, "loss": 0.0296, "step": 4194 }, { "epoch": 6.777059773828756, "grad_norm": 1.0190033303369925, "learning_rate": 1.1826465732611326e-06, "loss": 0.0125, "step": 4195 }, { "epoch": 6.778675282714055, "grad_norm": 1.3963186884655312, "learning_rate": 1.1815648782969629e-06, "loss": 0.0115, "step": 4196 }, { "epoch": 6.780290791599354, "grad_norm": 1.5765628411677421, "learning_rate": 1.180483525145476e-06, "loss": 0.0158, "step": 4197 }, { "epoch": 6.7819063004846525, "grad_norm": 1.5341570044130382, "learning_rate": 1.1794025140870166e-06, "loss": 0.0178, "step": 4198 }, { "epoch": 6.783521809369952, "grad_norm": 1.1483921207553525, "learning_rate": 1.1783218454018474e-06, "loss": 0.0106, "step": 4199 }, { "epoch": 6.78513731825525, "grad_norm": 1.5527226828978034, "learning_rate": 1.177241519370136e-06, "loss": 0.0165, "step": 4200 }, { "epoch": 6.78513731825525, "eval_loss": 1.62959885597229, "eval_runtime": 2.3462, "eval_samples_per_second": 127.866, "eval_steps_per_second": 2.984, "step": 4200 }, { "epoch": 6.78675282714055, "grad_norm": 1.075388687743619, "learning_rate": 1.1761615362719652e-06, "loss": 0.0126, "step": 4201 }, { "epoch": 6.788368336025848, "grad_norm": 1.636763709005901, "learning_rate": 1.175081896387326e-06, "loss": 0.017, "step": 4202 }, { "epoch": 6.789983844911147, "grad_norm": 1.3241206719575975, "learning_rate": 1.1740025999961238e-06, "loss": 0.0152, "step": 4203 }, { "epoch": 6.791599353796446, "grad_norm": 1.2956143636410404, "learning_rate": 1.1729236473781699e-06, "loss": 0.0139, "step": 4204 }, { "epoch": 6.793214862681745, "grad_norm": 1.19407580988512, "learning_rate": 1.1718450388131938e-06, "loss": 0.0107, "step": 4205 }, { "epoch": 6.794830371567044, "grad_norm": 1.294848624597509, "learning_rate": 1.1707667745808287e-06, "loss": 0.0103, "step": 4206 }, { "epoch": 6.796445880452342, "grad_norm": 1.4315107950338923, "learning_rate": 1.169688854960623e-06, "loss": 0.0171, "step": 4207 }, { "epoch": 6.798061389337642, "grad_norm": 1.2453817537511316, "learning_rate": 1.168611280232033e-06, "loss": 0.0119, "step": 4208 }, { "epoch": 6.79967689822294, "grad_norm": 1.2334628294382668, "learning_rate": 1.1675340506744268e-06, "loss": 0.0127, "step": 4209 }, { "epoch": 6.801292407108239, "grad_norm": 1.1356403500349408, "learning_rate": 1.1664571665670837e-06, "loss": 0.0156, "step": 4210 }, { "epoch": 6.802907915993538, "grad_norm": 1.081482315173443, "learning_rate": 1.1653806281891936e-06, "loss": 0.0096, "step": 4211 }, { "epoch": 6.804523424878837, "grad_norm": 1.2446194614857478, "learning_rate": 1.164304435819854e-06, "loss": 0.0115, "step": 4212 }, { "epoch": 6.806138933764136, "grad_norm": 1.6435399132602633, "learning_rate": 1.1632285897380753e-06, "loss": 0.0168, "step": 4213 }, { "epoch": 6.8077544426494345, "grad_norm": 1.2803034013672483, "learning_rate": 1.162153090222778e-06, "loss": 0.0122, "step": 4214 }, { "epoch": 6.809369951534734, "grad_norm": 1.3906947247519468, "learning_rate": 1.1610779375527916e-06, "loss": 0.0148, "step": 4215 }, { "epoch": 6.810985460420032, "grad_norm": 1.0877864784000562, "learning_rate": 1.1600031320068564e-06, "loss": 0.0099, "step": 4216 }, { "epoch": 6.812600969305331, "grad_norm": 1.1600659270288862, "learning_rate": 1.1589286738636225e-06, "loss": 0.0134, "step": 4217 }, { "epoch": 6.81421647819063, "grad_norm": 1.9528915938510334, "learning_rate": 1.1578545634016512e-06, "loss": 0.0223, "step": 4218 }, { "epoch": 6.815831987075929, "grad_norm": 1.3107111898468728, "learning_rate": 1.15678080089941e-06, "loss": 0.0173, "step": 4219 }, { "epoch": 6.817447495961228, "grad_norm": 1.621535160174055, "learning_rate": 1.1557073866352803e-06, "loss": 0.0243, "step": 4220 }, { "epoch": 6.819063004846527, "grad_norm": 1.2399689918110628, "learning_rate": 1.1546343208875513e-06, "loss": 0.0145, "step": 4221 }, { "epoch": 6.820678513731826, "grad_norm": 1.5056298280167328, "learning_rate": 1.153561603934423e-06, "loss": 0.0115, "step": 4222 }, { "epoch": 6.8222940226171245, "grad_norm": 1.6592102142328018, "learning_rate": 1.1524892360540025e-06, "loss": 0.0248, "step": 4223 }, { "epoch": 6.823909531502423, "grad_norm": 1.2882501271814861, "learning_rate": 1.1514172175243094e-06, "loss": 0.0154, "step": 4224 }, { "epoch": 6.825525040387722, "grad_norm": 1.518226190175879, "learning_rate": 1.1503455486232698e-06, "loss": 0.0152, "step": 4225 }, { "epoch": 6.827140549273021, "grad_norm": 1.4039680028676782, "learning_rate": 1.1492742296287212e-06, "loss": 0.0189, "step": 4226 }, { "epoch": 6.82875605815832, "grad_norm": 1.3952967144629638, "learning_rate": 1.1482032608184105e-06, "loss": 0.0112, "step": 4227 }, { "epoch": 6.830371567043619, "grad_norm": 1.6056156059200521, "learning_rate": 1.1471326424699932e-06, "loss": 0.0145, "step": 4228 }, { "epoch": 6.831987075928918, "grad_norm": 1.3205478889552296, "learning_rate": 1.1460623748610331e-06, "loss": 0.0187, "step": 4229 }, { "epoch": 6.833602584814217, "grad_norm": 1.374383788053718, "learning_rate": 1.144992458269005e-06, "loss": 0.0146, "step": 4230 }, { "epoch": 6.835218093699515, "grad_norm": 1.0276465271115633, "learning_rate": 1.1439228929712889e-06, "loss": 0.0096, "step": 4231 }, { "epoch": 6.836833602584814, "grad_norm": 1.6721824914934325, "learning_rate": 1.1428536792451804e-06, "loss": 0.0186, "step": 4232 }, { "epoch": 6.838449111470113, "grad_norm": 1.0593149853900374, "learning_rate": 1.1417848173678769e-06, "loss": 0.0121, "step": 4233 }, { "epoch": 6.840064620355412, "grad_norm": 1.6429232364603352, "learning_rate": 1.1407163076164895e-06, "loss": 0.0146, "step": 4234 }, { "epoch": 6.841680129240711, "grad_norm": 1.2162605811137477, "learning_rate": 1.1396481502680343e-06, "loss": 0.013, "step": 4235 }, { "epoch": 6.84329563812601, "grad_norm": 0.9868408066065564, "learning_rate": 1.138580345599439e-06, "loss": 0.0115, "step": 4236 }, { "epoch": 6.844911147011309, "grad_norm": 1.2277828699208395, "learning_rate": 1.1375128938875385e-06, "loss": 0.0086, "step": 4237 }, { "epoch": 6.846526655896607, "grad_norm": 1.5472556079035376, "learning_rate": 1.1364457954090778e-06, "loss": 0.018, "step": 4238 }, { "epoch": 6.8481421647819065, "grad_norm": 1.4008301452148995, "learning_rate": 1.1353790504407061e-06, "loss": 0.0151, "step": 4239 }, { "epoch": 6.849757673667205, "grad_norm": 1.7033372022133568, "learning_rate": 1.1343126592589868e-06, "loss": 0.0209, "step": 4240 }, { "epoch": 6.851373182552504, "grad_norm": 1.3104230207077905, "learning_rate": 1.1332466221403861e-06, "loss": 0.013, "step": 4241 }, { "epoch": 6.852988691437803, "grad_norm": 1.6334795331979757, "learning_rate": 1.1321809393612824e-06, "loss": 0.0163, "step": 4242 }, { "epoch": 6.854604200323102, "grad_norm": 1.5470580882505773, "learning_rate": 1.13111561119796e-06, "loss": 0.013, "step": 4243 }, { "epoch": 6.856219709208401, "grad_norm": 1.3837159629054008, "learning_rate": 1.130050637926612e-06, "loss": 0.0112, "step": 4244 }, { "epoch": 6.857835218093699, "grad_norm": 1.5329576166667411, "learning_rate": 1.1289860198233413e-06, "loss": 0.0067, "step": 4245 }, { "epoch": 6.859450726978999, "grad_norm": 1.126152085730755, "learning_rate": 1.1279217571641542e-06, "loss": 0.0127, "step": 4246 }, { "epoch": 6.861066235864297, "grad_norm": 1.702359525184899, "learning_rate": 1.126857850224969e-06, "loss": 0.0168, "step": 4247 }, { "epoch": 6.8626817447495965, "grad_norm": 1.5413366665200492, "learning_rate": 1.12579429928161e-06, "loss": 0.019, "step": 4248 }, { "epoch": 6.864297253634895, "grad_norm": 1.7389480014291463, "learning_rate": 1.1247311046098103e-06, "loss": 0.0213, "step": 4249 }, { "epoch": 6.865912762520194, "grad_norm": 1.435524007059291, "learning_rate": 1.1236682664852082e-06, "loss": 0.0131, "step": 4250 }, { "epoch": 6.867528271405493, "grad_norm": 1.3227050981354922, "learning_rate": 1.1226057851833536e-06, "loss": 0.0141, "step": 4251 }, { "epoch": 6.869143780290791, "grad_norm": 2.31289358485837, "learning_rate": 1.1215436609796992e-06, "loss": 0.0181, "step": 4252 }, { "epoch": 6.870759289176091, "grad_norm": 1.3298086868297454, "learning_rate": 1.1204818941496085e-06, "loss": 0.0142, "step": 4253 }, { "epoch": 6.872374798061389, "grad_norm": 1.9914485130603126, "learning_rate": 1.119420484968351e-06, "loss": 0.0185, "step": 4254 }, { "epoch": 6.8739903069466886, "grad_norm": 1.0459074502294765, "learning_rate": 1.1183594337111048e-06, "loss": 0.0112, "step": 4255 }, { "epoch": 6.875605815831987, "grad_norm": 4.478050408653852, "learning_rate": 1.1172987406529529e-06, "loss": 0.0158, "step": 4256 }, { "epoch": 6.877221324717286, "grad_norm": 1.1503921604572245, "learning_rate": 1.1162384060688877e-06, "loss": 0.0201, "step": 4257 }, { "epoch": 6.878836833602585, "grad_norm": 1.0463622593382196, "learning_rate": 1.1151784302338053e-06, "loss": 0.0124, "step": 4258 }, { "epoch": 6.880452342487883, "grad_norm": 0.9810580039398922, "learning_rate": 1.1141188134225153e-06, "loss": 0.0134, "step": 4259 }, { "epoch": 6.882067851373183, "grad_norm": 1.2348665656240834, "learning_rate": 1.1130595559097264e-06, "loss": 0.0131, "step": 4260 }, { "epoch": 6.883683360258481, "grad_norm": 1.4512696592981547, "learning_rate": 1.1120006579700607e-06, "loss": 0.0153, "step": 4261 }, { "epoch": 6.885298869143781, "grad_norm": 1.0738446225289497, "learning_rate": 1.1109421198780416e-06, "loss": 0.0124, "step": 4262 }, { "epoch": 6.886914378029079, "grad_norm": 1.8314051402135327, "learning_rate": 1.1098839419081038e-06, "loss": 0.0229, "step": 4263 }, { "epoch": 6.8885298869143785, "grad_norm": 1.360824978894255, "learning_rate": 1.1088261243345843e-06, "loss": 0.0162, "step": 4264 }, { "epoch": 6.890145395799677, "grad_norm": 1.1193340930209523, "learning_rate": 1.1077686674317322e-06, "loss": 0.0144, "step": 4265 }, { "epoch": 6.8917609046849755, "grad_norm": 1.2622787141721559, "learning_rate": 1.1067115714736977e-06, "loss": 0.0116, "step": 4266 }, { "epoch": 6.893376413570275, "grad_norm": 1.535150295512146, "learning_rate": 1.105654836734541e-06, "loss": 0.0148, "step": 4267 }, { "epoch": 6.894991922455573, "grad_norm": 1.4796777020034475, "learning_rate": 1.1045984634882256e-06, "loss": 0.0149, "step": 4268 }, { "epoch": 6.896607431340873, "grad_norm": 1.79484147030185, "learning_rate": 1.103542452008624e-06, "loss": 0.0155, "step": 4269 }, { "epoch": 6.898222940226171, "grad_norm": 1.1947846785904526, "learning_rate": 1.102486802569514e-06, "loss": 0.0123, "step": 4270 }, { "epoch": 6.899838449111471, "grad_norm": 1.5550855629358584, "learning_rate": 1.1014315154445799e-06, "loss": 0.0166, "step": 4271 }, { "epoch": 6.901453957996769, "grad_norm": 1.493595715706173, "learning_rate": 1.1003765909074102e-06, "loss": 0.019, "step": 4272 }, { "epoch": 6.903069466882068, "grad_norm": 1.4857696757412022, "learning_rate": 1.0993220292315013e-06, "loss": 0.0163, "step": 4273 }, { "epoch": 6.904684975767367, "grad_norm": 0.999862426950505, "learning_rate": 1.0982678306902564e-06, "loss": 0.0125, "step": 4274 }, { "epoch": 6.906300484652665, "grad_norm": 1.7383136056397621, "learning_rate": 1.0972139955569808e-06, "loss": 0.0192, "step": 4275 }, { "epoch": 6.907915993537965, "grad_norm": 1.1923842099585016, "learning_rate": 1.0961605241048895e-06, "loss": 0.0125, "step": 4276 }, { "epoch": 6.909531502423263, "grad_norm": 1.441505366779148, "learning_rate": 1.095107416607101e-06, "loss": 0.0102, "step": 4277 }, { "epoch": 6.911147011308563, "grad_norm": 1.9131197745293684, "learning_rate": 1.0940546733366414e-06, "loss": 0.0236, "step": 4278 }, { "epoch": 6.912762520193861, "grad_norm": 1.3927628729841868, "learning_rate": 1.0930022945664395e-06, "loss": 0.0165, "step": 4279 }, { "epoch": 6.91437802907916, "grad_norm": 1.3679967289546473, "learning_rate": 1.0919502805693317e-06, "loss": 0.0173, "step": 4280 }, { "epoch": 6.915993537964459, "grad_norm": 1.3888944521917028, "learning_rate": 1.0908986316180592e-06, "loss": 0.0141, "step": 4281 }, { "epoch": 6.9176090468497575, "grad_norm": 1.6550617301073534, "learning_rate": 1.08984734798527e-06, "loss": 0.0162, "step": 4282 }, { "epoch": 6.919224555735057, "grad_norm": 1.1775956427754728, "learning_rate": 1.088796429943514e-06, "loss": 0.0134, "step": 4283 }, { "epoch": 6.920840064620355, "grad_norm": 1.4531351180703391, "learning_rate": 1.08774587776525e-06, "loss": 0.0166, "step": 4284 }, { "epoch": 6.922455573505655, "grad_norm": 1.118532089389496, "learning_rate": 1.0866956917228391e-06, "loss": 0.0107, "step": 4285 }, { "epoch": 6.924071082390953, "grad_norm": 1.7019789387789712, "learning_rate": 1.085645872088549e-06, "loss": 0.0155, "step": 4286 }, { "epoch": 6.925686591276252, "grad_norm": 2.100131822126775, "learning_rate": 1.0845964191345527e-06, "loss": 0.0232, "step": 4287 }, { "epoch": 6.927302100161551, "grad_norm": 1.1767510079239019, "learning_rate": 1.0835473331329278e-06, "loss": 0.0124, "step": 4288 }, { "epoch": 6.92891760904685, "grad_norm": 1.337169847932837, "learning_rate": 1.0824986143556552e-06, "loss": 0.0123, "step": 4289 }, { "epoch": 6.930533117932149, "grad_norm": 1.5962826475773688, "learning_rate": 1.0814502630746232e-06, "loss": 0.0157, "step": 4290 }, { "epoch": 6.9321486268174475, "grad_norm": 0.8436373856074585, "learning_rate": 1.0804022795616216e-06, "loss": 0.0131, "step": 4291 }, { "epoch": 6.933764135702747, "grad_norm": 1.3970660114615614, "learning_rate": 1.0793546640883496e-06, "loss": 0.0167, "step": 4292 }, { "epoch": 6.935379644588045, "grad_norm": 1.7242232419937624, "learning_rate": 1.078307416926406e-06, "loss": 0.0248, "step": 4293 }, { "epoch": 6.936995153473344, "grad_norm": 1.0430251874882799, "learning_rate": 1.0772605383472977e-06, "loss": 0.0133, "step": 4294 }, { "epoch": 6.938610662358643, "grad_norm": 1.329274437203402, "learning_rate": 1.076214028622433e-06, "loss": 0.0157, "step": 4295 }, { "epoch": 6.940226171243942, "grad_norm": 1.5263411045869015, "learning_rate": 1.0751678880231275e-06, "loss": 0.0182, "step": 4296 }, { "epoch": 6.941841680129241, "grad_norm": 1.8792201958741133, "learning_rate": 1.0741221168205992e-06, "loss": 0.015, "step": 4297 }, { "epoch": 6.94345718901454, "grad_norm": 0.9773017044912429, "learning_rate": 1.073076715285972e-06, "loss": 0.0126, "step": 4298 }, { "epoch": 6.945072697899839, "grad_norm": 1.223802804416818, "learning_rate": 1.0720316836902712e-06, "loss": 0.0123, "step": 4299 }, { "epoch": 6.946688206785137, "grad_norm": 1.123250957161544, "learning_rate": 1.0709870223044287e-06, "loss": 0.0139, "step": 4300 }, { "epoch": 6.948303715670436, "grad_norm": 1.3759854081315581, "learning_rate": 1.0699427313992803e-06, "loss": 0.0091, "step": 4301 }, { "epoch": 6.949919224555735, "grad_norm": 1.1701646203306673, "learning_rate": 1.0688988112455636e-06, "loss": 0.0195, "step": 4302 }, { "epoch": 6.951534733441034, "grad_norm": 1.4180821057350463, "learning_rate": 1.067855262113922e-06, "loss": 0.0141, "step": 4303 }, { "epoch": 6.953150242326333, "grad_norm": 1.6641562339437936, "learning_rate": 1.066812084274903e-06, "loss": 0.014, "step": 4304 }, { "epoch": 6.954765751211632, "grad_norm": 1.0150911695212157, "learning_rate": 1.0657692779989573e-06, "loss": 0.0125, "step": 4305 }, { "epoch": 6.956381260096931, "grad_norm": 1.104612210547784, "learning_rate": 1.0647268435564376e-06, "loss": 0.0109, "step": 4306 }, { "epoch": 6.9579967689822295, "grad_norm": 0.8594477165374046, "learning_rate": 1.0636847812176022e-06, "loss": 0.0097, "step": 4307 }, { "epoch": 6.959612277867528, "grad_norm": 1.701258184231759, "learning_rate": 1.0626430912526129e-06, "loss": 0.0191, "step": 4308 }, { "epoch": 6.961227786752827, "grad_norm": 1.6806027935787864, "learning_rate": 1.0616017739315345e-06, "loss": 0.0164, "step": 4309 }, { "epoch": 6.962843295638126, "grad_norm": 1.7136351348841785, "learning_rate": 1.0605608295243345e-06, "loss": 0.0187, "step": 4310 }, { "epoch": 6.964458804523425, "grad_norm": 1.1756467354950024, "learning_rate": 1.0595202583008851e-06, "loss": 0.0153, "step": 4311 }, { "epoch": 6.966074313408724, "grad_norm": 1.475880305494317, "learning_rate": 1.0584800605309598e-06, "loss": 0.0132, "step": 4312 }, { "epoch": 6.967689822294023, "grad_norm": 1.685097826119062, "learning_rate": 1.0574402364842374e-06, "loss": 0.0189, "step": 4313 }, { "epoch": 6.969305331179322, "grad_norm": 1.9403730861762167, "learning_rate": 1.0564007864302983e-06, "loss": 0.0182, "step": 4314 }, { "epoch": 6.97092084006462, "grad_norm": 1.2195119176830427, "learning_rate": 1.055361710638628e-06, "loss": 0.0148, "step": 4315 }, { "epoch": 6.9725363489499195, "grad_norm": 1.2358042239770874, "learning_rate": 1.0543230093786116e-06, "loss": 0.0177, "step": 4316 }, { "epoch": 6.974151857835218, "grad_norm": 1.17061317533151, "learning_rate": 1.0532846829195405e-06, "loss": 0.0113, "step": 4317 }, { "epoch": 6.975767366720517, "grad_norm": 2.4184282364339804, "learning_rate": 1.052246731530605e-06, "loss": 0.0401, "step": 4318 }, { "epoch": 6.977382875605816, "grad_norm": 1.8558638303966124, "learning_rate": 1.0512091554809045e-06, "loss": 0.0205, "step": 4319 }, { "epoch": 6.978998384491114, "grad_norm": 1.1688701121868388, "learning_rate": 1.0501719550394338e-06, "loss": 0.0098, "step": 4320 }, { "epoch": 6.980613893376414, "grad_norm": 1.075759369171278, "learning_rate": 1.0491351304750961e-06, "loss": 0.0131, "step": 4321 }, { "epoch": 6.982229402261712, "grad_norm": 0.9374944795017639, "learning_rate": 1.0480986820566924e-06, "loss": 0.0103, "step": 4322 }, { "epoch": 6.983844911147012, "grad_norm": 1.4937460858646467, "learning_rate": 1.0470626100529307e-06, "loss": 0.0112, "step": 4323 }, { "epoch": 6.98546042003231, "grad_norm": 1.500529656443297, "learning_rate": 1.0460269147324167e-06, "loss": 0.0158, "step": 4324 }, { "epoch": 6.987075928917609, "grad_norm": 1.2491322412303187, "learning_rate": 1.0449915963636642e-06, "loss": 0.0151, "step": 4325 }, { "epoch": 6.988691437802908, "grad_norm": 0.9024965739524284, "learning_rate": 1.0439566552150833e-06, "loss": 0.0107, "step": 4326 }, { "epoch": 6.990306946688206, "grad_norm": 1.2739810364807886, "learning_rate": 1.0429220915549914e-06, "loss": 0.0145, "step": 4327 }, { "epoch": 6.991922455573506, "grad_norm": 1.3327621372144385, "learning_rate": 1.0418879056516038e-06, "loss": 0.0177, "step": 4328 }, { "epoch": 6.993537964458804, "grad_norm": 1.2415018989469082, "learning_rate": 1.0408540977730403e-06, "loss": 0.0144, "step": 4329 }, { "epoch": 6.995153473344104, "grad_norm": 1.117836154549479, "learning_rate": 1.0398206681873222e-06, "loss": 0.0167, "step": 4330 }, { "epoch": 6.996768982229402, "grad_norm": 0.9636718491354689, "learning_rate": 1.0387876171623737e-06, "loss": 0.0114, "step": 4331 }, { "epoch": 6.9983844911147015, "grad_norm": 1.0977097046820063, "learning_rate": 1.0377549449660183e-06, "loss": 0.0107, "step": 4332 }, { "epoch": 7.0, "grad_norm": 1.2451270972986748, "learning_rate": 1.036722651865983e-06, "loss": 0.0089, "step": 4333 }, { "epoch": 7.0016155088852985, "grad_norm": 1.1714984429090194, "learning_rate": 1.0356907381298984e-06, "loss": 0.0149, "step": 4334 }, { "epoch": 7.003231017770598, "grad_norm": 0.6688546445926131, "learning_rate": 1.0346592040252918e-06, "loss": 0.0052, "step": 4335 }, { "epoch": 7.004846526655896, "grad_norm": 0.5266751762545707, "learning_rate": 1.0336280498195964e-06, "loss": 0.0059, "step": 4336 }, { "epoch": 7.006462035541196, "grad_norm": 1.1244056048519346, "learning_rate": 1.0325972757801453e-06, "loss": 0.0116, "step": 4337 }, { "epoch": 7.008077544426494, "grad_norm": 0.840697059566125, "learning_rate": 1.0315668821741743e-06, "loss": 0.0072, "step": 4338 }, { "epoch": 7.009693053311794, "grad_norm": 0.6823441488196694, "learning_rate": 1.0305368692688175e-06, "loss": 0.0044, "step": 4339 }, { "epoch": 7.011308562197092, "grad_norm": 0.7870718048429807, "learning_rate": 1.0295072373311132e-06, "loss": 0.0113, "step": 4340 }, { "epoch": 7.012924071082391, "grad_norm": 0.8579151099785357, "learning_rate": 1.0284779866280003e-06, "loss": 0.0107, "step": 4341 }, { "epoch": 7.01453957996769, "grad_norm": 1.0249095871569038, "learning_rate": 1.027449117426319e-06, "loss": 0.007, "step": 4342 }, { "epoch": 7.016155088852988, "grad_norm": 0.8617960687305338, "learning_rate": 1.026420629992809e-06, "loss": 0.0061, "step": 4343 }, { "epoch": 7.017770597738288, "grad_norm": 1.598716075454915, "learning_rate": 1.0253925245941135e-06, "loss": 0.0048, "step": 4344 }, { "epoch": 7.019386106623586, "grad_norm": 1.1602995990006846, "learning_rate": 1.0243648014967727e-06, "loss": 0.0072, "step": 4345 }, { "epoch": 7.021001615508886, "grad_norm": 1.0132651487681765, "learning_rate": 1.0233374609672343e-06, "loss": 0.0072, "step": 4346 }, { "epoch": 7.022617124394184, "grad_norm": 0.9365544851441187, "learning_rate": 1.0223105032718398e-06, "loss": 0.0064, "step": 4347 }, { "epoch": 7.024232633279483, "grad_norm": 1.0318357417502686, "learning_rate": 1.021283928676836e-06, "loss": 0.0096, "step": 4348 }, { "epoch": 7.025848142164782, "grad_norm": 0.8226589057894564, "learning_rate": 1.0202577374483679e-06, "loss": 0.0095, "step": 4349 }, { "epoch": 7.0274636510500805, "grad_norm": 0.67782471411231, "learning_rate": 1.0192319298524836e-06, "loss": 0.0056, "step": 4350 }, { "epoch": 7.02907915993538, "grad_norm": 1.5136870740152764, "learning_rate": 1.0182065061551275e-06, "loss": 0.0113, "step": 4351 }, { "epoch": 7.030694668820678, "grad_norm": 0.6508466099466728, "learning_rate": 1.0171814666221504e-06, "loss": 0.0091, "step": 4352 }, { "epoch": 7.032310177705978, "grad_norm": 0.7308890761639099, "learning_rate": 1.0161568115192982e-06, "loss": 0.0049, "step": 4353 }, { "epoch": 7.033925686591276, "grad_norm": 0.8837333235798945, "learning_rate": 1.0151325411122207e-06, "loss": 0.0063, "step": 4354 }, { "epoch": 7.035541195476575, "grad_norm": 0.794804303038518, "learning_rate": 1.0141086556664651e-06, "loss": 0.0077, "step": 4355 }, { "epoch": 7.037156704361874, "grad_norm": 0.8361355483857118, "learning_rate": 1.0130851554474804e-06, "loss": 0.0067, "step": 4356 }, { "epoch": 7.038772213247173, "grad_norm": 1.1649456091108388, "learning_rate": 1.012062040720616e-06, "loss": 0.0115, "step": 4357 }, { "epoch": 7.040387722132472, "grad_norm": 0.560339007693639, "learning_rate": 1.0110393117511218e-06, "loss": 0.005, "step": 4358 }, { "epoch": 7.0420032310177705, "grad_norm": 1.0615983682054937, "learning_rate": 1.0100169688041453e-06, "loss": 0.0112, "step": 4359 }, { "epoch": 7.04361873990307, "grad_norm": 0.5607451584428813, "learning_rate": 1.0089950121447356e-06, "loss": 0.005, "step": 4360 }, { "epoch": 7.045234248788368, "grad_norm": 0.6500723196699517, "learning_rate": 1.0079734420378426e-06, "loss": 0.0069, "step": 4361 }, { "epoch": 7.046849757673667, "grad_norm": 0.7896924193182496, "learning_rate": 1.0069522587483133e-06, "loss": 0.0069, "step": 4362 }, { "epoch": 7.048465266558966, "grad_norm": 0.5929742963925961, "learning_rate": 1.0059314625408968e-06, "loss": 0.0055, "step": 4363 }, { "epoch": 7.050080775444265, "grad_norm": 1.0494835172268109, "learning_rate": 1.0049110536802406e-06, "loss": 0.0078, "step": 4364 }, { "epoch": 7.051696284329564, "grad_norm": 1.3703085105307493, "learning_rate": 1.0038910324308936e-06, "loss": 0.0099, "step": 4365 }, { "epoch": 7.053311793214863, "grad_norm": 1.031938790362115, "learning_rate": 1.0028713990573008e-06, "loss": 0.0103, "step": 4366 }, { "epoch": 7.054927302100162, "grad_norm": 0.622461157323788, "learning_rate": 1.0018521538238093e-06, "loss": 0.0043, "step": 4367 }, { "epoch": 7.05654281098546, "grad_norm": 0.818497765443752, "learning_rate": 1.0008332969946652e-06, "loss": 0.0063, "step": 4368 }, { "epoch": 7.058158319870759, "grad_norm": 0.7043934907120466, "learning_rate": 9.998148288340142e-07, "loss": 0.0054, "step": 4369 }, { "epoch": 7.059773828756058, "grad_norm": 0.7370254393074216, "learning_rate": 9.987967496058992e-07, "loss": 0.0063, "step": 4370 }, { "epoch": 7.061389337641357, "grad_norm": 0.754935493516322, "learning_rate": 9.977790595742654e-07, "loss": 0.0067, "step": 4371 }, { "epoch": 7.063004846526656, "grad_norm": 0.9900297284074169, "learning_rate": 9.967617590029536e-07, "loss": 0.0084, "step": 4372 }, { "epoch": 7.064620355411955, "grad_norm": 1.0488378865161263, "learning_rate": 9.957448481557062e-07, "loss": 0.0067, "step": 4373 }, { "epoch": 7.066235864297254, "grad_norm": 1.1426549744734205, "learning_rate": 9.94728327296164e-07, "loss": 0.0079, "step": 4374 }, { "epoch": 7.0678513731825525, "grad_norm": 0.9775282265584566, "learning_rate": 9.93712196687867e-07, "loss": 0.008, "step": 4375 }, { "epoch": 7.069466882067851, "grad_norm": 1.4683857485600929, "learning_rate": 9.926964565942522e-07, "loss": 0.0108, "step": 4376 }, { "epoch": 7.07108239095315, "grad_norm": 0.5241761943142785, "learning_rate": 9.91681107278658e-07, "loss": 0.0044, "step": 4377 }, { "epoch": 7.072697899838449, "grad_norm": 0.7066993230273837, "learning_rate": 9.906661490043182e-07, "loss": 0.0077, "step": 4378 }, { "epoch": 7.074313408723748, "grad_norm": 0.5765704918803444, "learning_rate": 9.8965158203437e-07, "loss": 0.0053, "step": 4379 }, { "epoch": 7.075928917609047, "grad_norm": 1.1500324634087522, "learning_rate": 9.886374066318441e-07, "loss": 0.0093, "step": 4380 }, { "epoch": 7.077544426494346, "grad_norm": 0.8937406347032919, "learning_rate": 9.876236230596734e-07, "loss": 0.0073, "step": 4381 }, { "epoch": 7.079159935379645, "grad_norm": 0.563051138706349, "learning_rate": 9.866102315806861e-07, "loss": 0.0051, "step": 4382 }, { "epoch": 7.080775444264943, "grad_norm": 0.901568789620386, "learning_rate": 9.855972324576124e-07, "loss": 0.0078, "step": 4383 }, { "epoch": 7.0823909531502425, "grad_norm": 0.3568532666386544, "learning_rate": 9.845846259530756e-07, "loss": 0.0048, "step": 4384 }, { "epoch": 7.084006462035541, "grad_norm": 0.5832817390032446, "learning_rate": 9.83572412329604e-07, "loss": 0.0058, "step": 4385 }, { "epoch": 7.08562197092084, "grad_norm": 1.07597984851606, "learning_rate": 9.825605918496183e-07, "loss": 0.009, "step": 4386 }, { "epoch": 7.087237479806139, "grad_norm": 0.9621249007838767, "learning_rate": 9.815491647754408e-07, "loss": 0.0113, "step": 4387 }, { "epoch": 7.088852988691438, "grad_norm": 0.9583251117417523, "learning_rate": 9.805381313692885e-07, "loss": 0.0082, "step": 4388 }, { "epoch": 7.090468497576737, "grad_norm": 0.7489909467411368, "learning_rate": 9.79527491893279e-07, "loss": 0.0076, "step": 4389 }, { "epoch": 7.092084006462035, "grad_norm": 1.2039510087090501, "learning_rate": 9.785172466094276e-07, "loss": 0.0075, "step": 4390 }, { "epoch": 7.093699515347335, "grad_norm": 0.8095351462669893, "learning_rate": 9.775073957796472e-07, "loss": 0.0062, "step": 4391 }, { "epoch": 7.095315024232633, "grad_norm": 0.8172362685232493, "learning_rate": 9.764979396657462e-07, "loss": 0.005, "step": 4392 }, { "epoch": 7.096930533117932, "grad_norm": 0.7482847112778287, "learning_rate": 9.754888785294338e-07, "loss": 0.0089, "step": 4393 }, { "epoch": 7.098546042003231, "grad_norm": 1.4259453414268966, "learning_rate": 9.74480212632316e-07, "loss": 0.0086, "step": 4394 }, { "epoch": 7.10016155088853, "grad_norm": 0.6162191439832418, "learning_rate": 9.734719422358946e-07, "loss": 0.008, "step": 4395 }, { "epoch": 7.101777059773829, "grad_norm": 0.6367843296915178, "learning_rate": 9.724640676015704e-07, "loss": 0.0051, "step": 4396 }, { "epoch": 7.103392568659127, "grad_norm": 0.7158131396627045, "learning_rate": 9.714565889906417e-07, "loss": 0.0077, "step": 4397 }, { "epoch": 7.105008077544427, "grad_norm": 0.6752388122752946, "learning_rate": 9.704495066643043e-07, "loss": 0.0061, "step": 4398 }, { "epoch": 7.106623586429725, "grad_norm": 0.6756361982922507, "learning_rate": 9.694428208836492e-07, "loss": 0.0055, "step": 4399 }, { "epoch": 7.1082390953150245, "grad_norm": 0.8753497619147792, "learning_rate": 9.68436531909667e-07, "loss": 0.0077, "step": 4400 }, { "epoch": 7.1082390953150245, "eval_loss": 1.7002532482147217, "eval_runtime": 2.3548, "eval_samples_per_second": 127.4, "eval_steps_per_second": 2.973, "step": 4400 }, { "epoch": 7.109854604200323, "grad_norm": 1.3507471718450743, "learning_rate": 9.67430640003244e-07, "loss": 0.0096, "step": 4401 }, { "epoch": 7.111470113085622, "grad_norm": 1.3653243719202202, "learning_rate": 9.664251454251659e-07, "loss": 0.01, "step": 4402 }, { "epoch": 7.113085621970921, "grad_norm": 0.8525742671915131, "learning_rate": 9.654200484361106e-07, "loss": 0.0063, "step": 4403 }, { "epoch": 7.114701130856219, "grad_norm": 0.906310597762557, "learning_rate": 9.644153492966584e-07, "loss": 0.0063, "step": 4404 }, { "epoch": 7.116316639741519, "grad_norm": 0.7877150473494295, "learning_rate": 9.634110482672815e-07, "loss": 0.0059, "step": 4405 }, { "epoch": 7.117932148626817, "grad_norm": 1.362660700396865, "learning_rate": 9.624071456083542e-07, "loss": 0.0107, "step": 4406 }, { "epoch": 7.119547657512117, "grad_norm": 0.8579433230134182, "learning_rate": 9.614036415801422e-07, "loss": 0.008, "step": 4407 }, { "epoch": 7.121163166397415, "grad_norm": 0.8571410514744134, "learning_rate": 9.604005364428127e-07, "loss": 0.0062, "step": 4408 }, { "epoch": 7.1227786752827145, "grad_norm": 0.4888329269232612, "learning_rate": 9.593978304564242e-07, "loss": 0.0061, "step": 4409 }, { "epoch": 7.124394184168013, "grad_norm": 0.9630318972426639, "learning_rate": 9.583955238809373e-07, "loss": 0.0069, "step": 4410 }, { "epoch": 7.1260096930533114, "grad_norm": 1.0547133140017568, "learning_rate": 9.57393616976203e-07, "loss": 0.0061, "step": 4411 }, { "epoch": 7.127625201938611, "grad_norm": 0.8807509567943625, "learning_rate": 9.56392110001976e-07, "loss": 0.0068, "step": 4412 }, { "epoch": 7.129240710823909, "grad_norm": 0.8193803323854906, "learning_rate": 9.553910032179006e-07, "loss": 0.006, "step": 4413 }, { "epoch": 7.130856219709209, "grad_norm": 0.8395523823888222, "learning_rate": 9.54390296883522e-07, "loss": 0.0075, "step": 4414 }, { "epoch": 7.132471728594507, "grad_norm": 0.709663956828818, "learning_rate": 9.533899912582778e-07, "loss": 0.0039, "step": 4415 }, { "epoch": 7.1340872374798066, "grad_norm": 0.949162159350048, "learning_rate": 9.523900866015043e-07, "loss": 0.0061, "step": 4416 }, { "epoch": 7.135702746365105, "grad_norm": 1.2768035480024345, "learning_rate": 9.513905831724332e-07, "loss": 0.009, "step": 4417 }, { "epoch": 7.1373182552504035, "grad_norm": 0.7596044458340604, "learning_rate": 9.503914812301931e-07, "loss": 0.0041, "step": 4418 }, { "epoch": 7.138933764135703, "grad_norm": 1.3752580571757747, "learning_rate": 9.493927810338058e-07, "loss": 0.01, "step": 4419 }, { "epoch": 7.140549273021001, "grad_norm": 0.8102618058527232, "learning_rate": 9.483944828421912e-07, "loss": 0.0124, "step": 4420 }, { "epoch": 7.142164781906301, "grad_norm": 0.5861769174033666, "learning_rate": 9.473965869141661e-07, "loss": 0.0058, "step": 4421 }, { "epoch": 7.143780290791599, "grad_norm": 0.857431846658482, "learning_rate": 9.463990935084389e-07, "loss": 0.0078, "step": 4422 }, { "epoch": 7.145395799676899, "grad_norm": 0.7263096897361652, "learning_rate": 9.45402002883617e-07, "loss": 0.0049, "step": 4423 }, { "epoch": 7.147011308562197, "grad_norm": 0.7788326596687675, "learning_rate": 9.444053152982027e-07, "loss": 0.0079, "step": 4424 }, { "epoch": 7.148626817447496, "grad_norm": 0.5407515611973491, "learning_rate": 9.434090310105945e-07, "loss": 0.0056, "step": 4425 }, { "epoch": 7.150242326332795, "grad_norm": 1.0802581453799356, "learning_rate": 9.424131502790837e-07, "loss": 0.0064, "step": 4426 }, { "epoch": 7.1518578352180935, "grad_norm": 0.9193780152494727, "learning_rate": 9.414176733618593e-07, "loss": 0.0094, "step": 4427 }, { "epoch": 7.153473344103393, "grad_norm": 0.8285552021358003, "learning_rate": 9.404226005170056e-07, "loss": 0.0108, "step": 4428 }, { "epoch": 7.155088852988691, "grad_norm": 0.6728480393533857, "learning_rate": 9.394279320025018e-07, "loss": 0.0049, "step": 4429 }, { "epoch": 7.156704361873991, "grad_norm": 1.653018989045413, "learning_rate": 9.384336680762204e-07, "loss": 0.0091, "step": 4430 }, { "epoch": 7.158319870759289, "grad_norm": 1.1848159407961507, "learning_rate": 9.374398089959328e-07, "loss": 0.0099, "step": 4431 }, { "epoch": 7.159935379644588, "grad_norm": 0.6460568783275562, "learning_rate": 9.364463550193009e-07, "loss": 0.0053, "step": 4432 }, { "epoch": 7.161550888529887, "grad_norm": 0.6116415303390198, "learning_rate": 9.354533064038853e-07, "loss": 0.0055, "step": 4433 }, { "epoch": 7.163166397415186, "grad_norm": 0.607817893344054, "learning_rate": 9.344606634071402e-07, "loss": 0.0057, "step": 4434 }, { "epoch": 7.164781906300485, "grad_norm": 0.9662818667163726, "learning_rate": 9.334684262864153e-07, "loss": 0.0079, "step": 4435 }, { "epoch": 7.166397415185783, "grad_norm": 0.6906496107612898, "learning_rate": 9.324765952989523e-07, "loss": 0.0093, "step": 4436 }, { "epoch": 7.168012924071083, "grad_norm": 0.7994014327088214, "learning_rate": 9.314851707018921e-07, "loss": 0.0073, "step": 4437 }, { "epoch": 7.169628432956381, "grad_norm": 1.009640523165479, "learning_rate": 9.304941527522649e-07, "loss": 0.007, "step": 4438 }, { "epoch": 7.17124394184168, "grad_norm": 1.305067575523542, "learning_rate": 9.295035417070017e-07, "loss": 0.0074, "step": 4439 }, { "epoch": 7.172859450726979, "grad_norm": 0.6968969563838973, "learning_rate": 9.285133378229224e-07, "loss": 0.0073, "step": 4440 }, { "epoch": 7.174474959612278, "grad_norm": 0.8661708466563581, "learning_rate": 9.275235413567451e-07, "loss": 0.0064, "step": 4441 }, { "epoch": 7.176090468497577, "grad_norm": 0.5334982118261351, "learning_rate": 9.265341525650796e-07, "loss": 0.0082, "step": 4442 }, { "epoch": 7.1777059773828755, "grad_norm": 0.5358387235325405, "learning_rate": 9.255451717044322e-07, "loss": 0.0069, "step": 4443 }, { "epoch": 7.179321486268175, "grad_norm": 1.3615588590893861, "learning_rate": 9.245565990312003e-07, "loss": 0.0059, "step": 4444 }, { "epoch": 7.180936995153473, "grad_norm": 0.7036721024927347, "learning_rate": 9.235684348016813e-07, "loss": 0.0064, "step": 4445 }, { "epoch": 7.182552504038772, "grad_norm": 0.9875907377277863, "learning_rate": 9.225806792720602e-07, "loss": 0.0076, "step": 4446 }, { "epoch": 7.184168012924071, "grad_norm": 0.8441238848713967, "learning_rate": 9.215933326984205e-07, "loss": 0.0096, "step": 4447 }, { "epoch": 7.18578352180937, "grad_norm": 0.6645866290894965, "learning_rate": 9.206063953367367e-07, "loss": 0.0074, "step": 4448 }, { "epoch": 7.187399030694669, "grad_norm": 0.5695496266724315, "learning_rate": 9.196198674428791e-07, "loss": 0.0068, "step": 4449 }, { "epoch": 7.189014539579968, "grad_norm": 1.0065568669290446, "learning_rate": 9.186337492726116e-07, "loss": 0.0069, "step": 4450 }, { "epoch": 7.190630048465267, "grad_norm": 1.0950897061382334, "learning_rate": 9.176480410815925e-07, "loss": 0.0112, "step": 4451 }, { "epoch": 7.1922455573505655, "grad_norm": 0.5080641386515168, "learning_rate": 9.166627431253708e-07, "loss": 0.0047, "step": 4452 }, { "epoch": 7.193861066235864, "grad_norm": 0.9306218327091, "learning_rate": 9.156778556593923e-07, "loss": 0.0104, "step": 4453 }, { "epoch": 7.195476575121163, "grad_norm": 1.360664791781904, "learning_rate": 9.146933789389964e-07, "loss": 0.0154, "step": 4454 }, { "epoch": 7.197092084006462, "grad_norm": 0.8336702190515836, "learning_rate": 9.13709313219413e-07, "loss": 0.0078, "step": 4455 }, { "epoch": 7.198707592891761, "grad_norm": 0.7444737268388318, "learning_rate": 9.127256587557684e-07, "loss": 0.0062, "step": 4456 }, { "epoch": 7.20032310177706, "grad_norm": 0.7569650266534006, "learning_rate": 9.11742415803081e-07, "loss": 0.0068, "step": 4457 }, { "epoch": 7.201938610662358, "grad_norm": 0.6297477190344973, "learning_rate": 9.10759584616264e-07, "loss": 0.007, "step": 4458 }, { "epoch": 7.203554119547658, "grad_norm": 0.6391733756739199, "learning_rate": 9.097771654501205e-07, "loss": 0.009, "step": 4459 }, { "epoch": 7.205169628432956, "grad_norm": 0.8158829156673506, "learning_rate": 9.087951585593502e-07, "loss": 0.0093, "step": 4460 }, { "epoch": 7.206785137318255, "grad_norm": 0.6915824306661431, "learning_rate": 9.078135641985444e-07, "loss": 0.0046, "step": 4461 }, { "epoch": 7.208400646203554, "grad_norm": 1.117639586974176, "learning_rate": 9.068323826221887e-07, "loss": 0.0095, "step": 4462 }, { "epoch": 7.210016155088853, "grad_norm": 0.6278059848077426, "learning_rate": 9.058516140846587e-07, "loss": 0.006, "step": 4463 }, { "epoch": 7.211631663974152, "grad_norm": 1.677561731706652, "learning_rate": 9.048712588402267e-07, "loss": 0.0081, "step": 4464 }, { "epoch": 7.21324717285945, "grad_norm": 1.679461486391822, "learning_rate": 9.038913171430536e-07, "loss": 0.0089, "step": 4465 }, { "epoch": 7.21486268174475, "grad_norm": 0.6807361104882224, "learning_rate": 9.029117892471984e-07, "loss": 0.0046, "step": 4466 }, { "epoch": 7.216478190630048, "grad_norm": 1.0754530186222186, "learning_rate": 9.019326754066079e-07, "loss": 0.0074, "step": 4467 }, { "epoch": 7.2180936995153475, "grad_norm": 1.0175441842245252, "learning_rate": 9.009539758751254e-07, "loss": 0.0099, "step": 4468 }, { "epoch": 7.219709208400646, "grad_norm": 0.9688958164073835, "learning_rate": 8.999756909064828e-07, "loss": 0.0085, "step": 4469 }, { "epoch": 7.221324717285945, "grad_norm": 1.0132973439886697, "learning_rate": 8.989978207543087e-07, "loss": 0.0101, "step": 4470 }, { "epoch": 7.222940226171244, "grad_norm": 1.555848588226797, "learning_rate": 8.980203656721193e-07, "loss": 0.0093, "step": 4471 }, { "epoch": 7.224555735056542, "grad_norm": 1.0289276813125414, "learning_rate": 8.970433259133299e-07, "loss": 0.0072, "step": 4472 }, { "epoch": 7.226171243941842, "grad_norm": 1.3540413651433754, "learning_rate": 8.960667017312413e-07, "loss": 0.0056, "step": 4473 }, { "epoch": 7.22778675282714, "grad_norm": 1.478552381269275, "learning_rate": 8.950904933790513e-07, "loss": 0.0139, "step": 4474 }, { "epoch": 7.22940226171244, "grad_norm": 1.2492899221680278, "learning_rate": 8.941147011098466e-07, "loss": 0.0084, "step": 4475 }, { "epoch": 7.231017770597738, "grad_norm": 0.7217142920841694, "learning_rate": 8.931393251766085e-07, "loss": 0.008, "step": 4476 }, { "epoch": 7.2326332794830375, "grad_norm": 0.651951800152884, "learning_rate": 8.92164365832209e-07, "loss": 0.0063, "step": 4477 }, { "epoch": 7.234248788368336, "grad_norm": 0.7973215544512714, "learning_rate": 8.911898233294136e-07, "loss": 0.0087, "step": 4478 }, { "epoch": 7.2358642972536344, "grad_norm": 1.0307270213395716, "learning_rate": 8.90215697920877e-07, "loss": 0.0097, "step": 4479 }, { "epoch": 7.237479806138934, "grad_norm": 1.095817066423349, "learning_rate": 8.892419898591478e-07, "loss": 0.0084, "step": 4480 }, { "epoch": 7.239095315024232, "grad_norm": 1.1367054573610857, "learning_rate": 8.882686993966677e-07, "loss": 0.0084, "step": 4481 }, { "epoch": 7.240710823909532, "grad_norm": 0.6815896948340743, "learning_rate": 8.87295826785766e-07, "loss": 0.0086, "step": 4482 }, { "epoch": 7.24232633279483, "grad_norm": 0.8900562366178525, "learning_rate": 8.863233722786671e-07, "loss": 0.0108, "step": 4483 }, { "epoch": 7.24394184168013, "grad_norm": 0.8920637826945561, "learning_rate": 8.85351336127486e-07, "loss": 0.0067, "step": 4484 }, { "epoch": 7.245557350565428, "grad_norm": 0.6041844408191005, "learning_rate": 8.843797185842304e-07, "loss": 0.0069, "step": 4485 }, { "epoch": 7.2471728594507265, "grad_norm": 1.0087077126924682, "learning_rate": 8.834085199007963e-07, "loss": 0.0088, "step": 4486 }, { "epoch": 7.248788368336026, "grad_norm": 0.6440736754968914, "learning_rate": 8.824377403289742e-07, "loss": 0.0075, "step": 4487 }, { "epoch": 7.250403877221324, "grad_norm": 0.8455319916332835, "learning_rate": 8.814673801204446e-07, "loss": 0.0077, "step": 4488 }, { "epoch": 7.252019386106624, "grad_norm": 0.6687106229327502, "learning_rate": 8.804974395267807e-07, "loss": 0.0092, "step": 4489 }, { "epoch": 7.253634894991922, "grad_norm": 0.5802887685737538, "learning_rate": 8.79527918799444e-07, "loss": 0.0081, "step": 4490 }, { "epoch": 7.255250403877222, "grad_norm": 0.7733985423016769, "learning_rate": 8.785588181897903e-07, "loss": 0.0055, "step": 4491 }, { "epoch": 7.25686591276252, "grad_norm": 0.8517804335424829, "learning_rate": 8.77590137949064e-07, "loss": 0.0064, "step": 4492 }, { "epoch": 7.258481421647819, "grad_norm": 0.7703709231366416, "learning_rate": 8.766218783284017e-07, "loss": 0.0053, "step": 4493 }, { "epoch": 7.260096930533118, "grad_norm": 0.8968483304998971, "learning_rate": 8.756540395788313e-07, "loss": 0.0084, "step": 4494 }, { "epoch": 7.2617124394184165, "grad_norm": 0.8800706564140044, "learning_rate": 8.74686621951272e-07, "loss": 0.0121, "step": 4495 }, { "epoch": 7.263327948303716, "grad_norm": 0.6705566788656356, "learning_rate": 8.737196256965313e-07, "loss": 0.0048, "step": 4496 }, { "epoch": 7.264943457189014, "grad_norm": 0.9448727916933066, "learning_rate": 8.727530510653107e-07, "loss": 0.0076, "step": 4497 }, { "epoch": 7.266558966074314, "grad_norm": 0.7219544954323194, "learning_rate": 8.717868983081981e-07, "loss": 0.0073, "step": 4498 }, { "epoch": 7.268174474959612, "grad_norm": 0.8526477672828147, "learning_rate": 8.708211676756786e-07, "loss": 0.0081, "step": 4499 }, { "epoch": 7.269789983844911, "grad_norm": 0.5980161722753634, "learning_rate": 8.698558594181209e-07, "loss": 0.0048, "step": 4500 }, { "epoch": 7.27140549273021, "grad_norm": 0.86442008334823, "learning_rate": 8.688909737857895e-07, "loss": 0.0103, "step": 4501 }, { "epoch": 7.273021001615509, "grad_norm": 0.6910546083929467, "learning_rate": 8.679265110288351e-07, "loss": 0.0066, "step": 4502 }, { "epoch": 7.274636510500808, "grad_norm": 0.692160130976952, "learning_rate": 8.669624713973026e-07, "loss": 0.0054, "step": 4503 }, { "epoch": 7.276252019386106, "grad_norm": 0.9730921490971955, "learning_rate": 8.65998855141123e-07, "loss": 0.0054, "step": 4504 }, { "epoch": 7.277867528271406, "grad_norm": 0.512774740622645, "learning_rate": 8.65035662510123e-07, "loss": 0.0072, "step": 4505 }, { "epoch": 7.279483037156704, "grad_norm": 0.9317402039489453, "learning_rate": 8.640728937540141e-07, "loss": 0.0075, "step": 4506 }, { "epoch": 7.281098546042003, "grad_norm": 0.47128513831075375, "learning_rate": 8.631105491224018e-07, "loss": 0.0042, "step": 4507 }, { "epoch": 7.282714054927302, "grad_norm": 0.7731606652689194, "learning_rate": 8.621486288647787e-07, "loss": 0.0057, "step": 4508 }, { "epoch": 7.284329563812601, "grad_norm": 0.6050451788918042, "learning_rate": 8.611871332305291e-07, "loss": 0.0082, "step": 4509 }, { "epoch": 7.2859450726979, "grad_norm": 0.9609092670157331, "learning_rate": 8.602260624689271e-07, "loss": 0.0097, "step": 4510 }, { "epoch": 7.2875605815831985, "grad_norm": 0.6993511642174471, "learning_rate": 8.592654168291372e-07, "loss": 0.0064, "step": 4511 }, { "epoch": 7.289176090468498, "grad_norm": 0.6501372465992873, "learning_rate": 8.583051965602113e-07, "loss": 0.0049, "step": 4512 }, { "epoch": 7.290791599353796, "grad_norm": 0.5283695216864808, "learning_rate": 8.573454019110933e-07, "loss": 0.0064, "step": 4513 }, { "epoch": 7.292407108239095, "grad_norm": 0.7250142270061044, "learning_rate": 8.563860331306159e-07, "loss": 0.0067, "step": 4514 }, { "epoch": 7.294022617124394, "grad_norm": 0.9428443972001387, "learning_rate": 8.55427090467503e-07, "loss": 0.0092, "step": 4515 }, { "epoch": 7.295638126009693, "grad_norm": 0.698322350069749, "learning_rate": 8.544685741703642e-07, "loss": 0.0082, "step": 4516 }, { "epoch": 7.297253634894992, "grad_norm": 0.6748932790065163, "learning_rate": 8.535104844877024e-07, "loss": 0.0091, "step": 4517 }, { "epoch": 7.298869143780291, "grad_norm": 1.3651360234953733, "learning_rate": 8.525528216679088e-07, "loss": 0.0071, "step": 4518 }, { "epoch": 7.30048465266559, "grad_norm": 0.51705003476252, "learning_rate": 8.51595585959262e-07, "loss": 0.0061, "step": 4519 }, { "epoch": 7.3021001615508885, "grad_norm": 0.6894638690371715, "learning_rate": 8.506387776099323e-07, "loss": 0.0082, "step": 4520 }, { "epoch": 7.303715670436187, "grad_norm": 1.0358013134102335, "learning_rate": 8.496823968679785e-07, "loss": 0.0073, "step": 4521 }, { "epoch": 7.305331179321486, "grad_norm": 0.9547213303745393, "learning_rate": 8.487264439813489e-07, "loss": 0.0068, "step": 4522 }, { "epoch": 7.306946688206785, "grad_norm": 0.7469406914936361, "learning_rate": 8.477709191978792e-07, "loss": 0.0086, "step": 4523 }, { "epoch": 7.308562197092084, "grad_norm": 1.175291260853779, "learning_rate": 8.468158227652965e-07, "loss": 0.0178, "step": 4524 }, { "epoch": 7.310177705977383, "grad_norm": 0.5876529533592829, "learning_rate": 8.458611549312131e-07, "loss": 0.0062, "step": 4525 }, { "epoch": 7.311793214862682, "grad_norm": 0.4955116843687238, "learning_rate": 8.449069159431364e-07, "loss": 0.005, "step": 4526 }, { "epoch": 7.313408723747981, "grad_norm": 1.250397060789131, "learning_rate": 8.43953106048456e-07, "loss": 0.0112, "step": 4527 }, { "epoch": 7.315024232633279, "grad_norm": 0.9220722453879018, "learning_rate": 8.429997254944552e-07, "loss": 0.0066, "step": 4528 }, { "epoch": 7.316639741518578, "grad_norm": 0.7648235046226987, "learning_rate": 8.420467745283021e-07, "loss": 0.007, "step": 4529 }, { "epoch": 7.318255250403877, "grad_norm": 0.524410583548185, "learning_rate": 8.410942533970571e-07, "loss": 0.0064, "step": 4530 }, { "epoch": 7.319870759289176, "grad_norm": 0.6514023180508753, "learning_rate": 8.401421623476644e-07, "loss": 0.0062, "step": 4531 }, { "epoch": 7.321486268174475, "grad_norm": 0.8756844130839606, "learning_rate": 8.391905016269636e-07, "loss": 0.0049, "step": 4532 }, { "epoch": 7.323101777059774, "grad_norm": 0.788428187849155, "learning_rate": 8.382392714816759e-07, "loss": 0.0083, "step": 4533 }, { "epoch": 7.324717285945073, "grad_norm": 0.8008618134186294, "learning_rate": 8.372884721584157e-07, "loss": 0.0051, "step": 4534 }, { "epoch": 7.326332794830371, "grad_norm": 0.988111293422866, "learning_rate": 8.36338103903682e-07, "loss": 0.0094, "step": 4535 }, { "epoch": 7.3279483037156705, "grad_norm": 0.9539095909787877, "learning_rate": 8.353881669638644e-07, "loss": 0.0104, "step": 4536 }, { "epoch": 7.329563812600969, "grad_norm": 0.8875262816651239, "learning_rate": 8.344386615852404e-07, "loss": 0.0067, "step": 4537 }, { "epoch": 7.331179321486268, "grad_norm": 1.0249342225379385, "learning_rate": 8.334895880139759e-07, "loss": 0.008, "step": 4538 }, { "epoch": 7.332794830371567, "grad_norm": 0.8528634614474003, "learning_rate": 8.325409464961229e-07, "loss": 0.0061, "step": 4539 }, { "epoch": 7.334410339256866, "grad_norm": 0.4830443237266528, "learning_rate": 8.315927372776233e-07, "loss": 0.0053, "step": 4540 }, { "epoch": 7.336025848142165, "grad_norm": 0.6999816409905716, "learning_rate": 8.30644960604307e-07, "loss": 0.007, "step": 4541 }, { "epoch": 7.337641357027463, "grad_norm": 0.5519989503427063, "learning_rate": 8.296976167218901e-07, "loss": 0.0064, "step": 4542 }, { "epoch": 7.339256865912763, "grad_norm": 0.5427342787269253, "learning_rate": 8.287507058759781e-07, "loss": 0.0072, "step": 4543 }, { "epoch": 7.340872374798061, "grad_norm": 1.0233982416634877, "learning_rate": 8.278042283120633e-07, "loss": 0.0094, "step": 4544 }, { "epoch": 7.3424878836833605, "grad_norm": 0.8265256340792863, "learning_rate": 8.268581842755269e-07, "loss": 0.0074, "step": 4545 }, { "epoch": 7.344103392568659, "grad_norm": 0.5763630442937202, "learning_rate": 8.259125740116356e-07, "loss": 0.0047, "step": 4546 }, { "epoch": 7.345718901453958, "grad_norm": 0.7071663660323586, "learning_rate": 8.249673977655453e-07, "loss": 0.0101, "step": 4547 }, { "epoch": 7.347334410339257, "grad_norm": 1.4883864074372184, "learning_rate": 8.240226557822992e-07, "loss": 0.0109, "step": 4548 }, { "epoch": 7.348949919224555, "grad_norm": 1.0187771885376051, "learning_rate": 8.230783483068283e-07, "loss": 0.0084, "step": 4549 }, { "epoch": 7.350565428109855, "grad_norm": 1.3563238273190945, "learning_rate": 8.221344755839489e-07, "loss": 0.0052, "step": 4550 }, { "epoch": 7.352180936995153, "grad_norm": 0.8211977327345578, "learning_rate": 8.211910378583676e-07, "loss": 0.0056, "step": 4551 }, { "epoch": 7.353796445880453, "grad_norm": 1.1145439038202107, "learning_rate": 8.202480353746747e-07, "loss": 0.0112, "step": 4552 }, { "epoch": 7.355411954765751, "grad_norm": 1.116839555950906, "learning_rate": 8.193054683773505e-07, "loss": 0.0054, "step": 4553 }, { "epoch": 7.35702746365105, "grad_norm": 0.9914934567705521, "learning_rate": 8.183633371107616e-07, "loss": 0.0076, "step": 4554 }, { "epoch": 7.358642972536349, "grad_norm": 0.6859515120056099, "learning_rate": 8.174216418191624e-07, "loss": 0.0087, "step": 4555 }, { "epoch": 7.360258481421647, "grad_norm": 0.9254967738838353, "learning_rate": 8.164803827466916e-07, "loss": 0.0036, "step": 4556 }, { "epoch": 7.361873990306947, "grad_norm": 0.6351939136543622, "learning_rate": 8.155395601373784e-07, "loss": 0.0062, "step": 4557 }, { "epoch": 7.363489499192245, "grad_norm": 0.45501409547514576, "learning_rate": 8.145991742351344e-07, "loss": 0.0051, "step": 4558 }, { "epoch": 7.365105008077545, "grad_norm": 0.822875556653965, "learning_rate": 8.136592252837638e-07, "loss": 0.0062, "step": 4559 }, { "epoch": 7.366720516962843, "grad_norm": 0.6512784800796521, "learning_rate": 8.127197135269524e-07, "loss": 0.0062, "step": 4560 }, { "epoch": 7.3683360258481425, "grad_norm": 1.0961667476296784, "learning_rate": 8.117806392082755e-07, "loss": 0.0104, "step": 4561 }, { "epoch": 7.369951534733441, "grad_norm": 0.7435254981252923, "learning_rate": 8.10842002571193e-07, "loss": 0.0065, "step": 4562 }, { "epoch": 7.3715670436187395, "grad_norm": 1.099734255942405, "learning_rate": 8.099038038590537e-07, "loss": 0.0094, "step": 4563 }, { "epoch": 7.373182552504039, "grad_norm": 0.8555206756518234, "learning_rate": 8.089660433150894e-07, "loss": 0.006, "step": 4564 }, { "epoch": 7.374798061389337, "grad_norm": 1.626581212033113, "learning_rate": 8.080287211824236e-07, "loss": 0.0094, "step": 4565 }, { "epoch": 7.376413570274637, "grad_norm": 0.8717529384341813, "learning_rate": 8.07091837704061e-07, "loss": 0.0079, "step": 4566 }, { "epoch": 7.378029079159935, "grad_norm": 1.077703000019972, "learning_rate": 8.061553931228955e-07, "loss": 0.0081, "step": 4567 }, { "epoch": 7.379644588045235, "grad_norm": 1.1220314590424376, "learning_rate": 8.05219387681705e-07, "loss": 0.0042, "step": 4568 }, { "epoch": 7.381260096930533, "grad_norm": 0.591327432987195, "learning_rate": 8.042838216231555e-07, "loss": 0.0059, "step": 4569 }, { "epoch": 7.382875605815832, "grad_norm": 1.030639541891543, "learning_rate": 8.033486951897989e-07, "loss": 0.0064, "step": 4570 }, { "epoch": 7.384491114701131, "grad_norm": 0.40019921290804483, "learning_rate": 8.02414008624073e-07, "loss": 0.0053, "step": 4571 }, { "epoch": 7.386106623586429, "grad_norm": 0.7533035822445464, "learning_rate": 8.014797621682999e-07, "loss": 0.0032, "step": 4572 }, { "epoch": 7.387722132471729, "grad_norm": 0.9025969669791462, "learning_rate": 8.005459560646898e-07, "loss": 0.0066, "step": 4573 }, { "epoch": 7.389337641357027, "grad_norm": 0.5607674037211267, "learning_rate": 7.996125905553375e-07, "loss": 0.005, "step": 4574 }, { "epoch": 7.390953150242327, "grad_norm": 0.776712890228543, "learning_rate": 7.986796658822252e-07, "loss": 0.0054, "step": 4575 }, { "epoch": 7.392568659127625, "grad_norm": 1.1676939117517686, "learning_rate": 7.977471822872176e-07, "loss": 0.0124, "step": 4576 }, { "epoch": 7.394184168012924, "grad_norm": 0.8926606132685808, "learning_rate": 7.968151400120677e-07, "loss": 0.0093, "step": 4577 }, { "epoch": 7.395799676898223, "grad_norm": 0.8248582784338999, "learning_rate": 7.958835392984147e-07, "loss": 0.0069, "step": 4578 }, { "epoch": 7.3974151857835215, "grad_norm": 0.786932725445839, "learning_rate": 7.949523803877798e-07, "loss": 0.0095, "step": 4579 }, { "epoch": 7.399030694668821, "grad_norm": 0.6296925572745912, "learning_rate": 7.940216635215733e-07, "loss": 0.006, "step": 4580 }, { "epoch": 7.400646203554119, "grad_norm": 0.9926182669764981, "learning_rate": 7.930913889410888e-07, "loss": 0.0081, "step": 4581 }, { "epoch": 7.402261712439419, "grad_norm": 0.9818859505677592, "learning_rate": 7.921615568875071e-07, "loss": 0.0093, "step": 4582 }, { "epoch": 7.403877221324717, "grad_norm": 1.028361787653331, "learning_rate": 7.912321676018914e-07, "loss": 0.0073, "step": 4583 }, { "epoch": 7.405492730210016, "grad_norm": 0.864045474196937, "learning_rate": 7.903032213251935e-07, "loss": 0.0073, "step": 4584 }, { "epoch": 7.407108239095315, "grad_norm": 0.5777768204808951, "learning_rate": 7.893747182982459e-07, "loss": 0.0055, "step": 4585 }, { "epoch": 7.408723747980614, "grad_norm": 1.3898217156292414, "learning_rate": 7.884466587617723e-07, "loss": 0.0072, "step": 4586 }, { "epoch": 7.410339256865913, "grad_norm": 0.43323834000515987, "learning_rate": 7.875190429563756e-07, "loss": 0.0044, "step": 4587 }, { "epoch": 7.4119547657512115, "grad_norm": 0.9828676809820981, "learning_rate": 7.865918711225476e-07, "loss": 0.0098, "step": 4588 }, { "epoch": 7.413570274636511, "grad_norm": 0.8308094145223104, "learning_rate": 7.85665143500662e-07, "loss": 0.0053, "step": 4589 }, { "epoch": 7.415185783521809, "grad_norm": 0.6483239650299042, "learning_rate": 7.847388603309808e-07, "loss": 0.0072, "step": 4590 }, { "epoch": 7.416801292407108, "grad_norm": 0.7454065515778594, "learning_rate": 7.838130218536454e-07, "loss": 0.0084, "step": 4591 }, { "epoch": 7.418416801292407, "grad_norm": 0.4624637379636952, "learning_rate": 7.828876283086898e-07, "loss": 0.0049, "step": 4592 }, { "epoch": 7.420032310177706, "grad_norm": 0.8423645807223623, "learning_rate": 7.819626799360247e-07, "loss": 0.0051, "step": 4593 }, { "epoch": 7.421647819063005, "grad_norm": 0.9928646138792788, "learning_rate": 7.810381769754508e-07, "loss": 0.0069, "step": 4594 }, { "epoch": 7.423263327948304, "grad_norm": 1.0369206655102934, "learning_rate": 7.801141196666498e-07, "loss": 0.0093, "step": 4595 }, { "epoch": 7.424878836833603, "grad_norm": 0.9286730198233611, "learning_rate": 7.791905082491902e-07, "loss": 0.0088, "step": 4596 }, { "epoch": 7.426494345718901, "grad_norm": 0.8657609886018731, "learning_rate": 7.782673429625242e-07, "loss": 0.0085, "step": 4597 }, { "epoch": 7.4281098546042, "grad_norm": 1.15021190290157, "learning_rate": 7.773446240459892e-07, "loss": 0.0083, "step": 4598 }, { "epoch": 7.429725363489499, "grad_norm": 0.814348037535528, "learning_rate": 7.764223517388039e-07, "loss": 0.0084, "step": 4599 }, { "epoch": 7.431340872374798, "grad_norm": 0.8806596440073972, "learning_rate": 7.755005262800743e-07, "loss": 0.0072, "step": 4600 }, { "epoch": 7.431340872374798, "eval_loss": 1.7089048624038696, "eval_runtime": 2.347, "eval_samples_per_second": 127.821, "eval_steps_per_second": 2.982, "step": 4600 }, { "epoch": 7.432956381260097, "grad_norm": 0.8257671663265624, "learning_rate": 7.745791479087906e-07, "loss": 0.0089, "step": 4601 }, { "epoch": 7.434571890145396, "grad_norm": 0.8986639763592152, "learning_rate": 7.736582168638237e-07, "loss": 0.0071, "step": 4602 }, { "epoch": 7.436187399030695, "grad_norm": 0.7390832379825633, "learning_rate": 7.727377333839323e-07, "loss": 0.0102, "step": 4603 }, { "epoch": 7.4378029079159935, "grad_norm": 0.8736327654908653, "learning_rate": 7.718176977077571e-07, "loss": 0.0067, "step": 4604 }, { "epoch": 7.439418416801292, "grad_norm": 0.740817088974561, "learning_rate": 7.708981100738245e-07, "loss": 0.0083, "step": 4605 }, { "epoch": 7.441033925686591, "grad_norm": 1.0458572192944247, "learning_rate": 7.699789707205416e-07, "loss": 0.0073, "step": 4606 }, { "epoch": 7.44264943457189, "grad_norm": 0.6102410636058061, "learning_rate": 7.690602798862015e-07, "loss": 0.0087, "step": 4607 }, { "epoch": 7.444264943457189, "grad_norm": 1.4402052480699719, "learning_rate": 7.681420378089813e-07, "loss": 0.0072, "step": 4608 }, { "epoch": 7.445880452342488, "grad_norm": 1.0322015238606788, "learning_rate": 7.672242447269415e-07, "loss": 0.009, "step": 4609 }, { "epoch": 7.447495961227787, "grad_norm": 0.7859962804814045, "learning_rate": 7.663069008780241e-07, "loss": 0.0066, "step": 4610 }, { "epoch": 7.449111470113086, "grad_norm": 1.0814267376987263, "learning_rate": 7.653900065000583e-07, "loss": 0.0095, "step": 4611 }, { "epoch": 7.450726978998384, "grad_norm": 1.3689140293692648, "learning_rate": 7.64473561830753e-07, "loss": 0.0118, "step": 4612 }, { "epoch": 7.4523424878836835, "grad_norm": 0.9294399565526886, "learning_rate": 7.635575671077031e-07, "loss": 0.0078, "step": 4613 }, { "epoch": 7.453957996768982, "grad_norm": 0.7554939530677417, "learning_rate": 7.626420225683862e-07, "loss": 0.007, "step": 4614 }, { "epoch": 7.455573505654281, "grad_norm": 0.7711007106922516, "learning_rate": 7.617269284501635e-07, "loss": 0.0051, "step": 4615 }, { "epoch": 7.45718901453958, "grad_norm": 0.6346208409434878, "learning_rate": 7.608122849902774e-07, "loss": 0.0054, "step": 4616 }, { "epoch": 7.458804523424879, "grad_norm": 0.8616601329596875, "learning_rate": 7.59898092425857e-07, "loss": 0.0055, "step": 4617 }, { "epoch": 7.460420032310178, "grad_norm": 0.8351250202687114, "learning_rate": 7.589843509939099e-07, "loss": 0.0085, "step": 4618 }, { "epoch": 7.462035541195476, "grad_norm": 0.6086099906113285, "learning_rate": 7.580710609313327e-07, "loss": 0.0077, "step": 4619 }, { "epoch": 7.463651050080776, "grad_norm": 0.5760453454216796, "learning_rate": 7.571582224748991e-07, "loss": 0.0054, "step": 4620 }, { "epoch": 7.465266558966074, "grad_norm": 1.1869078327436613, "learning_rate": 7.562458358612699e-07, "loss": 0.0125, "step": 4621 }, { "epoch": 7.466882067851373, "grad_norm": 0.6443576121841598, "learning_rate": 7.553339013269854e-07, "loss": 0.0068, "step": 4622 }, { "epoch": 7.468497576736672, "grad_norm": 0.9038858796550849, "learning_rate": 7.544224191084712e-07, "loss": 0.0099, "step": 4623 }, { "epoch": 7.470113085621971, "grad_norm": 0.902191735623091, "learning_rate": 7.535113894420354e-07, "loss": 0.0083, "step": 4624 }, { "epoch": 7.47172859450727, "grad_norm": 1.6168528742387727, "learning_rate": 7.526008125638684e-07, "loss": 0.0226, "step": 4625 }, { "epoch": 7.473344103392568, "grad_norm": 0.8259107122456508, "learning_rate": 7.516906887100417e-07, "loss": 0.0076, "step": 4626 }, { "epoch": 7.474959612277868, "grad_norm": 0.8835427392664367, "learning_rate": 7.507810181165123e-07, "loss": 0.0081, "step": 4627 }, { "epoch": 7.476575121163166, "grad_norm": 0.4677763351082363, "learning_rate": 7.498718010191167e-07, "loss": 0.0066, "step": 4628 }, { "epoch": 7.4781906300484655, "grad_norm": 1.0574764482264054, "learning_rate": 7.489630376535753e-07, "loss": 0.0073, "step": 4629 }, { "epoch": 7.479806138933764, "grad_norm": 0.9939751029394279, "learning_rate": 7.480547282554915e-07, "loss": 0.0106, "step": 4630 }, { "epoch": 7.481421647819063, "grad_norm": 0.5632094846324446, "learning_rate": 7.47146873060351e-07, "loss": 0.0036, "step": 4631 }, { "epoch": 7.483037156704362, "grad_norm": 0.831101305828787, "learning_rate": 7.462394723035188e-07, "loss": 0.0068, "step": 4632 }, { "epoch": 7.48465266558966, "grad_norm": 1.135695086268484, "learning_rate": 7.45332526220246e-07, "loss": 0.0066, "step": 4633 }, { "epoch": 7.48626817447496, "grad_norm": 0.450434310882283, "learning_rate": 7.444260350456634e-07, "loss": 0.0059, "step": 4634 }, { "epoch": 7.487883683360258, "grad_norm": 1.2581114639689974, "learning_rate": 7.435199990147857e-07, "loss": 0.0075, "step": 4635 }, { "epoch": 7.489499192245558, "grad_norm": 0.4867229248845719, "learning_rate": 7.42614418362507e-07, "loss": 0.0044, "step": 4636 }, { "epoch": 7.491114701130856, "grad_norm": 0.6349967961812968, "learning_rate": 7.417092933236056e-07, "loss": 0.0043, "step": 4637 }, { "epoch": 7.4927302100161555, "grad_norm": 0.6315647663102686, "learning_rate": 7.408046241327416e-07, "loss": 0.0052, "step": 4638 }, { "epoch": 7.494345718901454, "grad_norm": 1.3687194390319248, "learning_rate": 7.399004110244545e-07, "loss": 0.0113, "step": 4639 }, { "epoch": 7.4959612277867524, "grad_norm": 1.0123691461745141, "learning_rate": 7.389966542331683e-07, "loss": 0.0112, "step": 4640 }, { "epoch": 7.497576736672052, "grad_norm": 1.2964979481025798, "learning_rate": 7.380933539931878e-07, "loss": 0.0109, "step": 4641 }, { "epoch": 7.49919224555735, "grad_norm": 0.9365981207946159, "learning_rate": 7.371905105386998e-07, "loss": 0.0074, "step": 4642 }, { "epoch": 7.50080775444265, "grad_norm": 0.9010210250362723, "learning_rate": 7.362881241037711e-07, "loss": 0.0054, "step": 4643 }, { "epoch": 7.502423263327948, "grad_norm": 1.5888471209961306, "learning_rate": 7.35386194922352e-07, "loss": 0.0125, "step": 4644 }, { "epoch": 7.5040387722132476, "grad_norm": 1.1634964579213385, "learning_rate": 7.344847232282718e-07, "loss": 0.0089, "step": 4645 }, { "epoch": 7.505654281098546, "grad_norm": 0.8717009257865211, "learning_rate": 7.335837092552453e-07, "loss": 0.0072, "step": 4646 }, { "epoch": 7.5072697899838445, "grad_norm": 1.0342323755807648, "learning_rate": 7.326831532368639e-07, "loss": 0.013, "step": 4647 }, { "epoch": 7.508885298869144, "grad_norm": 0.6344175083623933, "learning_rate": 7.317830554066041e-07, "loss": 0.0062, "step": 4648 }, { "epoch": 7.510500807754442, "grad_norm": 0.5501003486537973, "learning_rate": 7.308834159978206e-07, "loss": 0.0037, "step": 4649 }, { "epoch": 7.512116316639742, "grad_norm": 1.349158952415588, "learning_rate": 7.299842352437517e-07, "loss": 0.0102, "step": 4650 }, { "epoch": 7.51373182552504, "grad_norm": 0.6592965264685234, "learning_rate": 7.29085513377514e-07, "loss": 0.0065, "step": 4651 }, { "epoch": 7.51534733441034, "grad_norm": 0.984575716525801, "learning_rate": 7.281872506321094e-07, "loss": 0.0078, "step": 4652 }, { "epoch": 7.516962843295638, "grad_norm": 0.6901091168616243, "learning_rate": 7.272894472404163e-07, "loss": 0.008, "step": 4653 }, { "epoch": 7.518578352180937, "grad_norm": 0.6331659663865151, "learning_rate": 7.263921034351973e-07, "loss": 0.0066, "step": 4654 }, { "epoch": 7.520193861066236, "grad_norm": 0.8952587841230804, "learning_rate": 7.254952194490927e-07, "loss": 0.0077, "step": 4655 }, { "epoch": 7.5218093699515345, "grad_norm": 0.56456132612054, "learning_rate": 7.245987955146266e-07, "loss": 0.0046, "step": 4656 }, { "epoch": 7.523424878836834, "grad_norm": 0.8151342947948619, "learning_rate": 7.237028318642026e-07, "loss": 0.0104, "step": 4657 }, { "epoch": 7.525040387722132, "grad_norm": 0.7048011466461184, "learning_rate": 7.228073287301054e-07, "loss": 0.0077, "step": 4658 }, { "epoch": 7.526655896607432, "grad_norm": 0.44699181392198756, "learning_rate": 7.219122863444986e-07, "loss": 0.0065, "step": 4659 }, { "epoch": 7.52827140549273, "grad_norm": 0.8812067265968372, "learning_rate": 7.21017704939428e-07, "loss": 0.0091, "step": 4660 }, { "epoch": 7.529886914378029, "grad_norm": 0.5175525838017483, "learning_rate": 7.201235847468208e-07, "loss": 0.0062, "step": 4661 }, { "epoch": 7.531502423263328, "grad_norm": 1.2256620879949567, "learning_rate": 7.192299259984814e-07, "loss": 0.0059, "step": 4662 }, { "epoch": 7.533117932148627, "grad_norm": 0.5643038227111696, "learning_rate": 7.183367289260973e-07, "loss": 0.0039, "step": 4663 }, { "epoch": 7.534733441033926, "grad_norm": 0.9506986226680997, "learning_rate": 7.17443993761236e-07, "loss": 0.0087, "step": 4664 }, { "epoch": 7.536348949919224, "grad_norm": 0.8683813254365325, "learning_rate": 7.16551720735345e-07, "loss": 0.0078, "step": 4665 }, { "epoch": 7.537964458804524, "grad_norm": 0.5362265135571896, "learning_rate": 7.156599100797504e-07, "loss": 0.0073, "step": 4666 }, { "epoch": 7.539579967689822, "grad_norm": 0.8887257431246485, "learning_rate": 7.147685620256606e-07, "loss": 0.0086, "step": 4667 }, { "epoch": 7.541195476575121, "grad_norm": 0.38097913683801715, "learning_rate": 7.138776768041633e-07, "loss": 0.0051, "step": 4668 }, { "epoch": 7.54281098546042, "grad_norm": 0.8562579599035072, "learning_rate": 7.129872546462266e-07, "loss": 0.0066, "step": 4669 }, { "epoch": 7.544426494345719, "grad_norm": 0.9354168590163267, "learning_rate": 7.120972957826968e-07, "loss": 0.0071, "step": 4670 }, { "epoch": 7.546042003231018, "grad_norm": 0.7226574265581377, "learning_rate": 7.11207800444303e-07, "loss": 0.0088, "step": 4671 }, { "epoch": 7.5476575121163165, "grad_norm": 0.8604846956848125, "learning_rate": 7.103187688616508e-07, "loss": 0.0091, "step": 4672 }, { "epoch": 7.549273021001616, "grad_norm": 0.8156598989689697, "learning_rate": 7.094302012652283e-07, "loss": 0.0095, "step": 4673 }, { "epoch": 7.550888529886914, "grad_norm": 0.8679184382506502, "learning_rate": 7.085420978854022e-07, "loss": 0.0068, "step": 4674 }, { "epoch": 7.552504038772213, "grad_norm": 0.7028932594508918, "learning_rate": 7.076544589524193e-07, "loss": 0.0087, "step": 4675 }, { "epoch": 7.554119547657512, "grad_norm": 0.9005871340740338, "learning_rate": 7.067672846964046e-07, "loss": 0.0106, "step": 4676 }, { "epoch": 7.555735056542811, "grad_norm": 0.8885295438369327, "learning_rate": 7.058805753473652e-07, "loss": 0.0072, "step": 4677 }, { "epoch": 7.55735056542811, "grad_norm": 0.9628318221250141, "learning_rate": 7.049943311351834e-07, "loss": 0.0067, "step": 4678 }, { "epoch": 7.558966074313409, "grad_norm": 0.8778213682977691, "learning_rate": 7.041085522896271e-07, "loss": 0.0086, "step": 4679 }, { "epoch": 7.560581583198708, "grad_norm": 0.9309576685004307, "learning_rate": 7.032232390403376e-07, "loss": 0.0077, "step": 4680 }, { "epoch": 7.5621970920840065, "grad_norm": 1.11412376183662, "learning_rate": 7.0233839161684e-07, "loss": 0.0122, "step": 4681 }, { "epoch": 7.563812600969305, "grad_norm": 0.7669924259288048, "learning_rate": 7.014540102485343e-07, "loss": 0.0095, "step": 4682 }, { "epoch": 7.565428109854604, "grad_norm": 0.9447603853088546, "learning_rate": 7.005700951647032e-07, "loss": 0.0108, "step": 4683 }, { "epoch": 7.567043618739903, "grad_norm": 0.578171528555278, "learning_rate": 6.996866465945071e-07, "loss": 0.0063, "step": 4684 }, { "epoch": 7.568659127625202, "grad_norm": 0.7542956833397565, "learning_rate": 6.988036647669869e-07, "loss": 0.0109, "step": 4685 }, { "epoch": 7.570274636510501, "grad_norm": 1.0846853108323482, "learning_rate": 6.979211499110591e-07, "loss": 0.0054, "step": 4686 }, { "epoch": 7.5718901453958, "grad_norm": 2.1491238922262834, "learning_rate": 6.970391022555231e-07, "loss": 0.0105, "step": 4687 }, { "epoch": 7.573505654281099, "grad_norm": 1.0221231906724495, "learning_rate": 6.961575220290539e-07, "loss": 0.0103, "step": 4688 }, { "epoch": 7.575121163166397, "grad_norm": 0.6133236762151012, "learning_rate": 6.952764094602074e-07, "loss": 0.0062, "step": 4689 }, { "epoch": 7.576736672051696, "grad_norm": 1.1100108339453048, "learning_rate": 6.943957647774177e-07, "loss": 0.0114, "step": 4690 }, { "epoch": 7.578352180936995, "grad_norm": 1.0868669501331358, "learning_rate": 6.935155882089983e-07, "loss": 0.0085, "step": 4691 }, { "epoch": 7.579967689822294, "grad_norm": 0.5160218280396468, "learning_rate": 6.926358799831389e-07, "loss": 0.006, "step": 4692 }, { "epoch": 7.581583198707593, "grad_norm": 1.127489155387339, "learning_rate": 6.917566403279105e-07, "loss": 0.0101, "step": 4693 }, { "epoch": 7.583198707592892, "grad_norm": 0.8682034723979016, "learning_rate": 6.908778694712612e-07, "loss": 0.0053, "step": 4694 }, { "epoch": 7.584814216478191, "grad_norm": 1.490611461432908, "learning_rate": 6.899995676410189e-07, "loss": 0.0096, "step": 4695 }, { "epoch": 7.586429725363489, "grad_norm": 0.8514329494394199, "learning_rate": 6.891217350648877e-07, "loss": 0.009, "step": 4696 }, { "epoch": 7.5880452342487885, "grad_norm": 0.6518097302827666, "learning_rate": 6.882443719704515e-07, "loss": 0.0072, "step": 4697 }, { "epoch": 7.589660743134087, "grad_norm": 0.9885773004200866, "learning_rate": 6.873674785851731e-07, "loss": 0.0076, "step": 4698 }, { "epoch": 7.591276252019386, "grad_norm": 1.8666133647398635, "learning_rate": 6.864910551363915e-07, "loss": 0.0113, "step": 4699 }, { "epoch": 7.592891760904685, "grad_norm": 0.6144170865945997, "learning_rate": 6.856151018513257e-07, "loss": 0.0106, "step": 4700 }, { "epoch": 7.594507269789984, "grad_norm": 0.9034004544157327, "learning_rate": 6.847396189570718e-07, "loss": 0.009, "step": 4701 }, { "epoch": 7.596122778675283, "grad_norm": 1.1678341589705608, "learning_rate": 6.838646066806054e-07, "loss": 0.0159, "step": 4702 }, { "epoch": 7.597738287560581, "grad_norm": 0.7482702696597954, "learning_rate": 6.829900652487775e-07, "loss": 0.0067, "step": 4703 }, { "epoch": 7.599353796445881, "grad_norm": 0.8416863537288105, "learning_rate": 6.821159948883199e-07, "loss": 0.0089, "step": 4704 }, { "epoch": 7.600969305331179, "grad_norm": 1.3585887747634344, "learning_rate": 6.812423958258388e-07, "loss": 0.0089, "step": 4705 }, { "epoch": 7.6025848142164785, "grad_norm": 0.5662755650657126, "learning_rate": 6.80369268287823e-07, "loss": 0.0059, "step": 4706 }, { "epoch": 7.604200323101777, "grad_norm": 0.970308711667628, "learning_rate": 6.794966125006342e-07, "loss": 0.0099, "step": 4707 }, { "epoch": 7.605815831987076, "grad_norm": 0.6132020623159714, "learning_rate": 6.786244286905158e-07, "loss": 0.0041, "step": 4708 }, { "epoch": 7.607431340872375, "grad_norm": 1.2559544096905781, "learning_rate": 6.777527170835851e-07, "loss": 0.0131, "step": 4709 }, { "epoch": 7.609046849757673, "grad_norm": 1.0143837852295214, "learning_rate": 6.768814779058405e-07, "loss": 0.0065, "step": 4710 }, { "epoch": 7.610662358642973, "grad_norm": 0.7467834233000717, "learning_rate": 6.76010711383154e-07, "loss": 0.0048, "step": 4711 }, { "epoch": 7.612277867528271, "grad_norm": 0.6483290220665068, "learning_rate": 6.751404177412806e-07, "loss": 0.0058, "step": 4712 }, { "epoch": 7.613893376413571, "grad_norm": 0.681087353280827, "learning_rate": 6.742705972058469e-07, "loss": 0.0055, "step": 4713 }, { "epoch": 7.615508885298869, "grad_norm": 1.165857591887925, "learning_rate": 6.73401250002361e-07, "loss": 0.0094, "step": 4714 }, { "epoch": 7.617124394184168, "grad_norm": 0.7727636445216326, "learning_rate": 6.725323763562052e-07, "loss": 0.008, "step": 4715 }, { "epoch": 7.618739903069467, "grad_norm": 0.6393360952877674, "learning_rate": 6.716639764926414e-07, "loss": 0.0065, "step": 4716 }, { "epoch": 7.620355411954765, "grad_norm": 0.7954623125014296, "learning_rate": 6.707960506368075e-07, "loss": 0.008, "step": 4717 }, { "epoch": 7.621970920840065, "grad_norm": 0.8281874887903241, "learning_rate": 6.699285990137197e-07, "loss": 0.0071, "step": 4718 }, { "epoch": 7.623586429725363, "grad_norm": 1.1804783772495877, "learning_rate": 6.69061621848269e-07, "loss": 0.0089, "step": 4719 }, { "epoch": 7.625201938610663, "grad_norm": 0.6229408538053738, "learning_rate": 6.681951193652252e-07, "loss": 0.0061, "step": 4720 }, { "epoch": 7.626817447495961, "grad_norm": 0.9961158001053505, "learning_rate": 6.673290917892356e-07, "loss": 0.0059, "step": 4721 }, { "epoch": 7.6284329563812605, "grad_norm": 1.6183028311913503, "learning_rate": 6.664635393448216e-07, "loss": 0.0134, "step": 4722 }, { "epoch": 7.630048465266559, "grad_norm": 0.5539364560220627, "learning_rate": 6.655984622563838e-07, "loss": 0.0042, "step": 4723 }, { "epoch": 7.6316639741518575, "grad_norm": 0.7361677155736555, "learning_rate": 6.647338607481993e-07, "loss": 0.0052, "step": 4724 }, { "epoch": 7.633279483037157, "grad_norm": 0.42703671178066166, "learning_rate": 6.638697350444223e-07, "loss": 0.0056, "step": 4725 }, { "epoch": 7.634894991922455, "grad_norm": 1.2265545028361708, "learning_rate": 6.630060853690809e-07, "loss": 0.0092, "step": 4726 }, { "epoch": 7.636510500807755, "grad_norm": 0.6504863285967369, "learning_rate": 6.621429119460832e-07, "loss": 0.0054, "step": 4727 }, { "epoch": 7.638126009693053, "grad_norm": 1.3667929579544145, "learning_rate": 6.612802149992118e-07, "loss": 0.0055, "step": 4728 }, { "epoch": 7.639741518578353, "grad_norm": 0.5448824974804829, "learning_rate": 6.604179947521272e-07, "loss": 0.0067, "step": 4729 }, { "epoch": 7.641357027463651, "grad_norm": 0.6382570994162245, "learning_rate": 6.595562514283643e-07, "loss": 0.0074, "step": 4730 }, { "epoch": 7.64297253634895, "grad_norm": 0.6229003998832275, "learning_rate": 6.586949852513369e-07, "loss": 0.0062, "step": 4731 }, { "epoch": 7.644588045234249, "grad_norm": 0.6057995933425678, "learning_rate": 6.578341964443313e-07, "loss": 0.0063, "step": 4732 }, { "epoch": 7.646203554119547, "grad_norm": 0.7654688354288398, "learning_rate": 6.569738852305155e-07, "loss": 0.005, "step": 4733 }, { "epoch": 7.647819063004847, "grad_norm": 0.5486805155619504, "learning_rate": 6.561140518329287e-07, "loss": 0.0049, "step": 4734 }, { "epoch": 7.649434571890145, "grad_norm": 1.1469882767526314, "learning_rate": 6.552546964744893e-07, "loss": 0.01, "step": 4735 }, { "epoch": 7.651050080775445, "grad_norm": 1.3460792976285123, "learning_rate": 6.543958193779895e-07, "loss": 0.0106, "step": 4736 }, { "epoch": 7.652665589660743, "grad_norm": 1.274533569325841, "learning_rate": 6.535374207660997e-07, "loss": 0.0062, "step": 4737 }, { "epoch": 7.654281098546042, "grad_norm": 0.714048587845881, "learning_rate": 6.526795008613635e-07, "loss": 0.0078, "step": 4738 }, { "epoch": 7.655896607431341, "grad_norm": 0.366608782960471, "learning_rate": 6.518220598862043e-07, "loss": 0.0028, "step": 4739 }, { "epoch": 7.6575121163166395, "grad_norm": 0.9206596856821271, "learning_rate": 6.509650980629176e-07, "loss": 0.0083, "step": 4740 }, { "epoch": 7.659127625201939, "grad_norm": 1.1811344294034554, "learning_rate": 6.501086156136776e-07, "loss": 0.0071, "step": 4741 }, { "epoch": 7.660743134087237, "grad_norm": 2.248164472019334, "learning_rate": 6.492526127605309e-07, "loss": 0.0124, "step": 4742 }, { "epoch": 7.662358642972537, "grad_norm": 0.8310926818018505, "learning_rate": 6.483970897254028e-07, "loss": 0.0051, "step": 4743 }, { "epoch": 7.663974151857835, "grad_norm": 1.2095262814301833, "learning_rate": 6.475420467300931e-07, "loss": 0.0057, "step": 4744 }, { "epoch": 7.665589660743134, "grad_norm": 0.7704668554142959, "learning_rate": 6.466874839962775e-07, "loss": 0.005, "step": 4745 }, { "epoch": 7.667205169628433, "grad_norm": 0.6868285473442085, "learning_rate": 6.458334017455056e-07, "loss": 0.0071, "step": 4746 }, { "epoch": 7.668820678513732, "grad_norm": 0.6039460949759996, "learning_rate": 6.44979800199205e-07, "loss": 0.0088, "step": 4747 }, { "epoch": 7.670436187399031, "grad_norm": 0.8078843922443685, "learning_rate": 6.441266795786763e-07, "loss": 0.0068, "step": 4748 }, { "epoch": 7.6720516962843295, "grad_norm": 1.0019489264758215, "learning_rate": 6.432740401050966e-07, "loss": 0.0071, "step": 4749 }, { "epoch": 7.673667205169629, "grad_norm": 1.373782657992833, "learning_rate": 6.424218819995182e-07, "loss": 0.0105, "step": 4750 }, { "epoch": 7.675282714054927, "grad_norm": 1.0787210838735906, "learning_rate": 6.415702054828696e-07, "loss": 0.0081, "step": 4751 }, { "epoch": 7.676898222940226, "grad_norm": 0.5376728211172278, "learning_rate": 6.407190107759514e-07, "loss": 0.004, "step": 4752 }, { "epoch": 7.678513731825525, "grad_norm": 1.473458600852253, "learning_rate": 6.398682980994425e-07, "loss": 0.0116, "step": 4753 }, { "epoch": 7.680129240710824, "grad_norm": 1.2442031569738776, "learning_rate": 6.39018067673895e-07, "loss": 0.0085, "step": 4754 }, { "epoch": 7.681744749596123, "grad_norm": 0.5234558069828706, "learning_rate": 6.381683197197377e-07, "loss": 0.0061, "step": 4755 }, { "epoch": 7.683360258481422, "grad_norm": 0.9195960024889297, "learning_rate": 6.373190544572716e-07, "loss": 0.0076, "step": 4756 }, { "epoch": 7.684975767366721, "grad_norm": 0.9835221459044272, "learning_rate": 6.364702721066748e-07, "loss": 0.0074, "step": 4757 }, { "epoch": 7.686591276252019, "grad_norm": 1.2233451913742581, "learning_rate": 6.35621972888e-07, "loss": 0.0079, "step": 4758 }, { "epoch": 7.688206785137318, "grad_norm": 0.8404491768864162, "learning_rate": 6.347741570211732e-07, "loss": 0.0104, "step": 4759 }, { "epoch": 7.689822294022617, "grad_norm": 0.9897877883357676, "learning_rate": 6.339268247259964e-07, "loss": 0.0064, "step": 4760 }, { "epoch": 7.691437802907916, "grad_norm": 0.7345730913583661, "learning_rate": 6.33079976222146e-07, "loss": 0.0064, "step": 4761 }, { "epoch": 7.693053311793215, "grad_norm": 0.946322106871707, "learning_rate": 6.322336117291736e-07, "loss": 0.0058, "step": 4762 }, { "epoch": 7.694668820678514, "grad_norm": 0.5361271788913886, "learning_rate": 6.313877314665032e-07, "loss": 0.008, "step": 4763 }, { "epoch": 7.696284329563813, "grad_norm": 1.3980886728075455, "learning_rate": 6.305423356534357e-07, "loss": 0.008, "step": 4764 }, { "epoch": 7.6978998384491115, "grad_norm": 0.461707307246933, "learning_rate": 6.296974245091436e-07, "loss": 0.0062, "step": 4765 }, { "epoch": 7.69951534733441, "grad_norm": 0.44329636003628475, "learning_rate": 6.288529982526784e-07, "loss": 0.0061, "step": 4766 }, { "epoch": 7.701130856219709, "grad_norm": 1.0502540901567352, "learning_rate": 6.280090571029604e-07, "loss": 0.01, "step": 4767 }, { "epoch": 7.702746365105008, "grad_norm": 0.828881926373165, "learning_rate": 6.271656012787882e-07, "loss": 0.0081, "step": 4768 }, { "epoch": 7.704361873990307, "grad_norm": 0.967760365106236, "learning_rate": 6.263226309988324e-07, "loss": 0.0066, "step": 4769 }, { "epoch": 7.705977382875606, "grad_norm": 0.6322934170689493, "learning_rate": 6.254801464816388e-07, "loss": 0.0092, "step": 4770 }, { "epoch": 7.707592891760905, "grad_norm": 0.5658540592945819, "learning_rate": 6.246381479456251e-07, "loss": 0.0054, "step": 4771 }, { "epoch": 7.709208400646204, "grad_norm": 0.6071325095459436, "learning_rate": 6.23796635609088e-07, "loss": 0.005, "step": 4772 }, { "epoch": 7.710823909531502, "grad_norm": 1.1427604855855906, "learning_rate": 6.229556096901923e-07, "loss": 0.0081, "step": 4773 }, { "epoch": 7.7124394184168015, "grad_norm": 1.0680753202779816, "learning_rate": 6.221150704069809e-07, "loss": 0.0115, "step": 4774 }, { "epoch": 7.7140549273021, "grad_norm": 0.44318560042803584, "learning_rate": 6.212750179773677e-07, "loss": 0.0052, "step": 4775 }, { "epoch": 7.715670436187399, "grad_norm": 0.9250670389321058, "learning_rate": 6.20435452619142e-07, "loss": 0.0085, "step": 4776 }, { "epoch": 7.717285945072698, "grad_norm": 0.5289391647879279, "learning_rate": 6.195963745499667e-07, "loss": 0.0063, "step": 4777 }, { "epoch": 7.718901453957997, "grad_norm": 0.6246662254713964, "learning_rate": 6.187577839873788e-07, "loss": 0.0076, "step": 4778 }, { "epoch": 7.720516962843296, "grad_norm": 0.8336260845354807, "learning_rate": 6.179196811487867e-07, "loss": 0.0073, "step": 4779 }, { "epoch": 7.722132471728594, "grad_norm": 0.7865373895123605, "learning_rate": 6.170820662514748e-07, "loss": 0.0063, "step": 4780 }, { "epoch": 7.723747980613894, "grad_norm": 0.8219212579978417, "learning_rate": 6.162449395126005e-07, "loss": 0.0066, "step": 4781 }, { "epoch": 7.725363489499192, "grad_norm": 0.7186574947313197, "learning_rate": 6.154083011491932e-07, "loss": 0.0049, "step": 4782 }, { "epoch": 7.726978998384491, "grad_norm": 1.1482749647877377, "learning_rate": 6.145721513781569e-07, "loss": 0.0076, "step": 4783 }, { "epoch": 7.72859450726979, "grad_norm": 0.7685022125415579, "learning_rate": 6.13736490416269e-07, "loss": 0.0097, "step": 4784 }, { "epoch": 7.730210016155089, "grad_norm": 0.41806419683551, "learning_rate": 6.129013184801811e-07, "loss": 0.0039, "step": 4785 }, { "epoch": 7.731825525040388, "grad_norm": 0.9656922710906526, "learning_rate": 6.120666357864144e-07, "loss": 0.0063, "step": 4786 }, { "epoch": 7.733441033925686, "grad_norm": 0.694982528571464, "learning_rate": 6.112324425513674e-07, "loss": 0.0101, "step": 4787 }, { "epoch": 7.735056542810986, "grad_norm": 1.112026070135489, "learning_rate": 6.103987389913093e-07, "loss": 0.011, "step": 4788 }, { "epoch": 7.736672051696284, "grad_norm": 0.5842731752734938, "learning_rate": 6.095655253223842e-07, "loss": 0.0049, "step": 4789 }, { "epoch": 7.7382875605815835, "grad_norm": 0.811332693072252, "learning_rate": 6.087328017606064e-07, "loss": 0.008, "step": 4790 }, { "epoch": 7.739903069466882, "grad_norm": 0.8398746066543724, "learning_rate": 6.079005685218661e-07, "loss": 0.0072, "step": 4791 }, { "epoch": 7.741518578352181, "grad_norm": 0.748667570512935, "learning_rate": 6.070688258219232e-07, "loss": 0.0054, "step": 4792 }, { "epoch": 7.74313408723748, "grad_norm": 1.366170190566409, "learning_rate": 6.062375738764148e-07, "loss": 0.0081, "step": 4793 }, { "epoch": 7.744749596122778, "grad_norm": 0.6339192700204129, "learning_rate": 6.054068129008464e-07, "loss": 0.0069, "step": 4794 }, { "epoch": 7.746365105008078, "grad_norm": 0.6100006055062719, "learning_rate": 6.045765431105994e-07, "loss": 0.0076, "step": 4795 }, { "epoch": 7.747980613893376, "grad_norm": 0.7382606622140343, "learning_rate": 6.037467647209249e-07, "loss": 0.0063, "step": 4796 }, { "epoch": 7.749596122778676, "grad_norm": 0.8937717273678533, "learning_rate": 6.029174779469502e-07, "loss": 0.0087, "step": 4797 }, { "epoch": 7.751211631663974, "grad_norm": 0.5165755141564986, "learning_rate": 6.020886830036701e-07, "loss": 0.0052, "step": 4798 }, { "epoch": 7.7528271405492735, "grad_norm": 0.8920715495273732, "learning_rate": 6.012603801059589e-07, "loss": 0.0056, "step": 4799 }, { "epoch": 7.754442649434572, "grad_norm": 0.5636093189295072, "learning_rate": 6.004325694685562e-07, "loss": 0.0072, "step": 4800 }, { "epoch": 7.754442649434572, "eval_loss": 1.7026395797729492, "eval_runtime": 2.3561, "eval_samples_per_second": 127.329, "eval_steps_per_second": 2.971, "step": 4800 }, { "epoch": 7.75605815831987, "grad_norm": 1.0265441128955604, "learning_rate": 5.996052513060793e-07, "loss": 0.0086, "step": 4801 }, { "epoch": 7.75767366720517, "grad_norm": 0.7902030297888721, "learning_rate": 5.98778425833014e-07, "loss": 0.0072, "step": 4802 }, { "epoch": 7.759289176090468, "grad_norm": 0.9328239919882839, "learning_rate": 5.979520932637208e-07, "loss": 0.0084, "step": 4803 }, { "epoch": 7.760904684975768, "grad_norm": 1.1260644230497385, "learning_rate": 5.97126253812432e-07, "loss": 0.0072, "step": 4804 }, { "epoch": 7.762520193861066, "grad_norm": 1.028404039735459, "learning_rate": 5.963009076932519e-07, "loss": 0.0062, "step": 4805 }, { "epoch": 7.7641357027463656, "grad_norm": 0.618780357130007, "learning_rate": 5.954760551201555e-07, "loss": 0.0061, "step": 4806 }, { "epoch": 7.765751211631664, "grad_norm": 1.109780776098144, "learning_rate": 5.946516963069923e-07, "loss": 0.0087, "step": 4807 }, { "epoch": 7.7673667205169625, "grad_norm": 0.5591399374230085, "learning_rate": 5.938278314674817e-07, "loss": 0.0055, "step": 4808 }, { "epoch": 7.768982229402262, "grad_norm": 0.78394209011005, "learning_rate": 5.93004460815216e-07, "loss": 0.0071, "step": 4809 }, { "epoch": 7.77059773828756, "grad_norm": 0.8555433850440141, "learning_rate": 5.9218158456366e-07, "loss": 0.0072, "step": 4810 }, { "epoch": 7.77221324717286, "grad_norm": 0.5137096291181186, "learning_rate": 5.913592029261494e-07, "loss": 0.0056, "step": 4811 }, { "epoch": 7.773828756058158, "grad_norm": 0.6051929025120605, "learning_rate": 5.905373161158909e-07, "loss": 0.0072, "step": 4812 }, { "epoch": 7.775444264943458, "grad_norm": 0.7290571226382823, "learning_rate": 5.897159243459647e-07, "loss": 0.0083, "step": 4813 }, { "epoch": 7.777059773828756, "grad_norm": 1.0644526534665009, "learning_rate": 5.888950278293218e-07, "loss": 0.0076, "step": 4814 }, { "epoch": 7.778675282714055, "grad_norm": 0.5840086856177646, "learning_rate": 5.880746267787852e-07, "loss": 0.0064, "step": 4815 }, { "epoch": 7.780290791599354, "grad_norm": 1.0103377959332285, "learning_rate": 5.872547214070482e-07, "loss": 0.014, "step": 4816 }, { "epoch": 7.7819063004846525, "grad_norm": 0.7000391805443025, "learning_rate": 5.864353119266766e-07, "loss": 0.0105, "step": 4817 }, { "epoch": 7.783521809369952, "grad_norm": 0.6116218790719368, "learning_rate": 5.856163985501088e-07, "loss": 0.0056, "step": 4818 }, { "epoch": 7.78513731825525, "grad_norm": 0.9320465678489889, "learning_rate": 5.847979814896515e-07, "loss": 0.0091, "step": 4819 }, { "epoch": 7.78675282714055, "grad_norm": 0.7002976712674153, "learning_rate": 5.839800609574852e-07, "loss": 0.0068, "step": 4820 }, { "epoch": 7.788368336025848, "grad_norm": 0.8698936160011635, "learning_rate": 5.831626371656612e-07, "loss": 0.0085, "step": 4821 }, { "epoch": 7.789983844911147, "grad_norm": 0.9384175596741522, "learning_rate": 5.823457103261026e-07, "loss": 0.0065, "step": 4822 }, { "epoch": 7.791599353796446, "grad_norm": 0.8605970528352948, "learning_rate": 5.815292806506012e-07, "loss": 0.0075, "step": 4823 }, { "epoch": 7.793214862681745, "grad_norm": 0.8108647226257524, "learning_rate": 5.807133483508232e-07, "loss": 0.0063, "step": 4824 }, { "epoch": 7.794830371567044, "grad_norm": 1.159405210741287, "learning_rate": 5.79897913638302e-07, "loss": 0.0081, "step": 4825 }, { "epoch": 7.796445880452342, "grad_norm": 1.3696952592269251, "learning_rate": 5.790829767244471e-07, "loss": 0.0075, "step": 4826 }, { "epoch": 7.798061389337642, "grad_norm": 1.0275637201515524, "learning_rate": 5.782685378205338e-07, "loss": 0.0083, "step": 4827 }, { "epoch": 7.79967689822294, "grad_norm": 0.5638880054973208, "learning_rate": 5.774545971377121e-07, "loss": 0.006, "step": 4828 }, { "epoch": 7.801292407108239, "grad_norm": 0.981098141250938, "learning_rate": 5.766411548869999e-07, "loss": 0.0104, "step": 4829 }, { "epoch": 7.802907915993538, "grad_norm": 0.5156082839650507, "learning_rate": 5.758282112792887e-07, "loss": 0.0053, "step": 4830 }, { "epoch": 7.804523424878837, "grad_norm": 1.120859902864875, "learning_rate": 5.750157665253373e-07, "loss": 0.0054, "step": 4831 }, { "epoch": 7.806138933764136, "grad_norm": 0.43710295471772254, "learning_rate": 5.742038208357795e-07, "loss": 0.0043, "step": 4832 }, { "epoch": 7.8077544426494345, "grad_norm": 1.4926709159347609, "learning_rate": 5.733923744211156e-07, "loss": 0.0094, "step": 4833 }, { "epoch": 7.809369951534734, "grad_norm": 0.9079895098510038, "learning_rate": 5.725814274917199e-07, "loss": 0.0096, "step": 4834 }, { "epoch": 7.810985460420032, "grad_norm": 0.9412506188799522, "learning_rate": 5.717709802578336e-07, "loss": 0.0054, "step": 4835 }, { "epoch": 7.812600969305331, "grad_norm": 0.6310337652834511, "learning_rate": 5.709610329295714e-07, "loss": 0.0092, "step": 4836 }, { "epoch": 7.81421647819063, "grad_norm": 0.602007458736948, "learning_rate": 5.701515857169173e-07, "loss": 0.0059, "step": 4837 }, { "epoch": 7.815831987075929, "grad_norm": 0.5357147953668913, "learning_rate": 5.69342638829726e-07, "loss": 0.0059, "step": 4838 }, { "epoch": 7.817447495961228, "grad_norm": 0.6008442870098958, "learning_rate": 5.685341924777211e-07, "loss": 0.0056, "step": 4839 }, { "epoch": 7.819063004846527, "grad_norm": 0.6183756893803682, "learning_rate": 5.677262468704981e-07, "loss": 0.0077, "step": 4840 }, { "epoch": 7.820678513731826, "grad_norm": 0.9392435221597193, "learning_rate": 5.669188022175215e-07, "loss": 0.0057, "step": 4841 }, { "epoch": 7.8222940226171245, "grad_norm": 0.9779699619012602, "learning_rate": 5.661118587281278e-07, "loss": 0.008, "step": 4842 }, { "epoch": 7.823909531502423, "grad_norm": 0.5349286903805575, "learning_rate": 5.653054166115205e-07, "loss": 0.0078, "step": 4843 }, { "epoch": 7.825525040387722, "grad_norm": 0.7278606594911192, "learning_rate": 5.644994760767756e-07, "loss": 0.0077, "step": 4844 }, { "epoch": 7.827140549273021, "grad_norm": 1.3230063760980522, "learning_rate": 5.636940373328389e-07, "loss": 0.0153, "step": 4845 }, { "epoch": 7.82875605815832, "grad_norm": 0.546900772493138, "learning_rate": 5.628891005885245e-07, "loss": 0.0057, "step": 4846 }, { "epoch": 7.830371567043619, "grad_norm": 1.0673241416911927, "learning_rate": 5.620846660525176e-07, "loss": 0.0098, "step": 4847 }, { "epoch": 7.831987075928918, "grad_norm": 0.3279289632611417, "learning_rate": 5.612807339333734e-07, "loss": 0.0035, "step": 4848 }, { "epoch": 7.833602584814217, "grad_norm": 0.42317082246871285, "learning_rate": 5.604773044395167e-07, "loss": 0.0053, "step": 4849 }, { "epoch": 7.835218093699515, "grad_norm": 1.1774338250358882, "learning_rate": 5.596743777792405e-07, "loss": 0.0096, "step": 4850 }, { "epoch": 7.836833602584814, "grad_norm": 0.6573326372161594, "learning_rate": 5.588719541607102e-07, "loss": 0.0091, "step": 4851 }, { "epoch": 7.838449111470113, "grad_norm": 0.6999108675824963, "learning_rate": 5.580700337919568e-07, "loss": 0.0065, "step": 4852 }, { "epoch": 7.840064620355412, "grad_norm": 0.9258222915166125, "learning_rate": 5.572686168808861e-07, "loss": 0.0081, "step": 4853 }, { "epoch": 7.841680129240711, "grad_norm": 0.8504168368463296, "learning_rate": 5.564677036352684e-07, "loss": 0.0064, "step": 4854 }, { "epoch": 7.84329563812601, "grad_norm": 0.6437972369376788, "learning_rate": 5.556672942627473e-07, "loss": 0.0081, "step": 4855 }, { "epoch": 7.844911147011309, "grad_norm": 0.5572493869999292, "learning_rate": 5.548673889708323e-07, "loss": 0.009, "step": 4856 }, { "epoch": 7.846526655896607, "grad_norm": 0.960788275537403, "learning_rate": 5.540679879669056e-07, "loss": 0.0084, "step": 4857 }, { "epoch": 7.8481421647819065, "grad_norm": 0.7120799773273585, "learning_rate": 5.532690914582145e-07, "loss": 0.0052, "step": 4858 }, { "epoch": 7.849757673667205, "grad_norm": 0.4757019650570028, "learning_rate": 5.524706996518808e-07, "loss": 0.0062, "step": 4859 }, { "epoch": 7.851373182552504, "grad_norm": 0.8204191166065733, "learning_rate": 5.516728127548909e-07, "loss": 0.0088, "step": 4860 }, { "epoch": 7.852988691437803, "grad_norm": 0.9813801943850262, "learning_rate": 5.508754309741035e-07, "loss": 0.0041, "step": 4861 }, { "epoch": 7.854604200323102, "grad_norm": 1.0028291499902027, "learning_rate": 5.500785545162432e-07, "loss": 0.0073, "step": 4862 }, { "epoch": 7.856219709208401, "grad_norm": 0.6092253228708648, "learning_rate": 5.492821835879059e-07, "loss": 0.0044, "step": 4863 }, { "epoch": 7.857835218093699, "grad_norm": 0.5312774119506108, "learning_rate": 5.484863183955561e-07, "loss": 0.0057, "step": 4864 }, { "epoch": 7.859450726978999, "grad_norm": 0.4410042133506114, "learning_rate": 5.476909591455273e-07, "loss": 0.0046, "step": 4865 }, { "epoch": 7.861066235864297, "grad_norm": 0.7781447652875854, "learning_rate": 5.468961060440206e-07, "loss": 0.0065, "step": 4866 }, { "epoch": 7.8626817447495965, "grad_norm": 0.9361883504871031, "learning_rate": 5.461017592971074e-07, "loss": 0.0074, "step": 4867 }, { "epoch": 7.864297253634895, "grad_norm": 0.7979168359104992, "learning_rate": 5.453079191107263e-07, "loss": 0.0058, "step": 4868 }, { "epoch": 7.865912762520194, "grad_norm": 0.8996202555628982, "learning_rate": 5.445145856906858e-07, "loss": 0.0062, "step": 4869 }, { "epoch": 7.867528271405493, "grad_norm": 0.960929501875921, "learning_rate": 5.437217592426627e-07, "loss": 0.009, "step": 4870 }, { "epoch": 7.869143780290791, "grad_norm": 0.955145155116222, "learning_rate": 5.429294399722032e-07, "loss": 0.0078, "step": 4871 }, { "epoch": 7.870759289176091, "grad_norm": 1.013865512327377, "learning_rate": 5.421376280847193e-07, "loss": 0.0073, "step": 4872 }, { "epoch": 7.872374798061389, "grad_norm": 0.6174757966817321, "learning_rate": 5.413463237854943e-07, "loss": 0.0058, "step": 4873 }, { "epoch": 7.8739903069466886, "grad_norm": 0.7782856164856824, "learning_rate": 5.405555272796789e-07, "loss": 0.006, "step": 4874 }, { "epoch": 7.875605815831987, "grad_norm": 0.6055871692755459, "learning_rate": 5.397652387722924e-07, "loss": 0.0043, "step": 4875 }, { "epoch": 7.877221324717286, "grad_norm": 0.7100773706644261, "learning_rate": 5.38975458468221e-07, "loss": 0.007, "step": 4876 }, { "epoch": 7.878836833602585, "grad_norm": 0.633499131501302, "learning_rate": 5.381861865722212e-07, "loss": 0.0065, "step": 4877 }, { "epoch": 7.880452342487883, "grad_norm": 0.7347568602080151, "learning_rate": 5.373974232889173e-07, "loss": 0.0071, "step": 4878 }, { "epoch": 7.882067851373183, "grad_norm": 0.8602711348674577, "learning_rate": 5.366091688227995e-07, "loss": 0.0064, "step": 4879 }, { "epoch": 7.883683360258481, "grad_norm": 0.5158244879644756, "learning_rate": 5.358214233782286e-07, "loss": 0.004, "step": 4880 }, { "epoch": 7.885298869143781, "grad_norm": 1.024595995317092, "learning_rate": 5.350341871594328e-07, "loss": 0.0089, "step": 4881 }, { "epoch": 7.886914378029079, "grad_norm": 0.7376034837012179, "learning_rate": 5.342474603705086e-07, "loss": 0.0062, "step": 4882 }, { "epoch": 7.8885298869143785, "grad_norm": 0.8662798620997652, "learning_rate": 5.334612432154188e-07, "loss": 0.0073, "step": 4883 }, { "epoch": 7.890145395799677, "grad_norm": 1.084475688398582, "learning_rate": 5.326755358979959e-07, "loss": 0.0063, "step": 4884 }, { "epoch": 7.8917609046849755, "grad_norm": 0.6930619223594584, "learning_rate": 5.318903386219384e-07, "loss": 0.0044, "step": 4885 }, { "epoch": 7.893376413570275, "grad_norm": 1.0198552207779599, "learning_rate": 5.311056515908158e-07, "loss": 0.0077, "step": 4886 }, { "epoch": 7.894991922455573, "grad_norm": 0.5183837615931103, "learning_rate": 5.303214750080612e-07, "loss": 0.0055, "step": 4887 }, { "epoch": 7.896607431340873, "grad_norm": 0.6474467969299582, "learning_rate": 5.295378090769793e-07, "loss": 0.0059, "step": 4888 }, { "epoch": 7.898222940226171, "grad_norm": 0.6267619557347537, "learning_rate": 5.287546540007383e-07, "loss": 0.0065, "step": 4889 }, { "epoch": 7.899838449111471, "grad_norm": 0.6292248387668229, "learning_rate": 5.279720099823777e-07, "loss": 0.0076, "step": 4890 }, { "epoch": 7.901453957996769, "grad_norm": 0.3612225578841197, "learning_rate": 5.27189877224801e-07, "loss": 0.0036, "step": 4891 }, { "epoch": 7.903069466882068, "grad_norm": 0.7306190681307081, "learning_rate": 5.26408255930784e-07, "loss": 0.0058, "step": 4892 }, { "epoch": 7.904684975767367, "grad_norm": 1.4451977505164133, "learning_rate": 5.256271463029649e-07, "loss": 0.0076, "step": 4893 }, { "epoch": 7.906300484652665, "grad_norm": 0.6743157522846907, "learning_rate": 5.248465485438522e-07, "loss": 0.0062, "step": 4894 }, { "epoch": 7.907915993537965, "grad_norm": 0.4973155898100177, "learning_rate": 5.2406646285582e-07, "loss": 0.0079, "step": 4895 }, { "epoch": 7.909531502423263, "grad_norm": 0.7068262091793831, "learning_rate": 5.232868894411114e-07, "loss": 0.0067, "step": 4896 }, { "epoch": 7.911147011308563, "grad_norm": 0.5727851309547429, "learning_rate": 5.225078285018348e-07, "loss": 0.0055, "step": 4897 }, { "epoch": 7.912762520193861, "grad_norm": 0.6880709595073252, "learning_rate": 5.217292802399682e-07, "loss": 0.0045, "step": 4898 }, { "epoch": 7.91437802907916, "grad_norm": 0.6107444798835735, "learning_rate": 5.209512448573533e-07, "loss": 0.0063, "step": 4899 }, { "epoch": 7.915993537964459, "grad_norm": 0.7248948256633505, "learning_rate": 5.201737225557019e-07, "loss": 0.0048, "step": 4900 }, { "epoch": 7.9176090468497575, "grad_norm": 1.0399872300973851, "learning_rate": 5.193967135365912e-07, "loss": 0.0058, "step": 4901 }, { "epoch": 7.919224555735057, "grad_norm": 0.7203034608731422, "learning_rate": 5.186202180014668e-07, "loss": 0.0056, "step": 4902 }, { "epoch": 7.920840064620355, "grad_norm": 0.8769215559437057, "learning_rate": 5.178442361516384e-07, "loss": 0.007, "step": 4903 }, { "epoch": 7.922455573505655, "grad_norm": 0.9083930356546839, "learning_rate": 5.170687681882846e-07, "loss": 0.0081, "step": 4904 }, { "epoch": 7.924071082390953, "grad_norm": 0.3556670501542986, "learning_rate": 5.162938143124519e-07, "loss": 0.0039, "step": 4905 }, { "epoch": 7.925686591276252, "grad_norm": 0.7435178210755821, "learning_rate": 5.1551937472505e-07, "loss": 0.0068, "step": 4906 }, { "epoch": 7.927302100161551, "grad_norm": 0.6114619323524749, "learning_rate": 5.147454496268583e-07, "loss": 0.0066, "step": 4907 }, { "epoch": 7.92891760904685, "grad_norm": 1.0129703825495158, "learning_rate": 5.139720392185219e-07, "loss": 0.0096, "step": 4908 }, { "epoch": 7.930533117932149, "grad_norm": 0.8017713612442239, "learning_rate": 5.131991437005523e-07, "loss": 0.009, "step": 4909 }, { "epoch": 7.9321486268174475, "grad_norm": 0.9131426150083222, "learning_rate": 5.124267632733271e-07, "loss": 0.0057, "step": 4910 }, { "epoch": 7.933764135702747, "grad_norm": 0.732108869793683, "learning_rate": 5.11654898137092e-07, "loss": 0.008, "step": 4911 }, { "epoch": 7.935379644588045, "grad_norm": 1.4044251281930855, "learning_rate": 5.108835484919552e-07, "loss": 0.0108, "step": 4912 }, { "epoch": 7.936995153473344, "grad_norm": 0.7294382374771569, "learning_rate": 5.101127145378978e-07, "loss": 0.0053, "step": 4913 }, { "epoch": 7.938610662358643, "grad_norm": 0.4390266870419709, "learning_rate": 5.093423964747604e-07, "loss": 0.0067, "step": 4914 }, { "epoch": 7.940226171243942, "grad_norm": 0.7384413393776721, "learning_rate": 5.085725945022549e-07, "loss": 0.0073, "step": 4915 }, { "epoch": 7.941841680129241, "grad_norm": 0.5519691722606095, "learning_rate": 5.078033088199555e-07, "loss": 0.007, "step": 4916 }, { "epoch": 7.94345718901454, "grad_norm": 0.9029677228544847, "learning_rate": 5.070345396273064e-07, "loss": 0.0116, "step": 4917 }, { "epoch": 7.945072697899839, "grad_norm": 0.8800512020147929, "learning_rate": 5.062662871236129e-07, "loss": 0.0069, "step": 4918 }, { "epoch": 7.946688206785137, "grad_norm": 0.47265232587413025, "learning_rate": 5.054985515080529e-07, "loss": 0.0053, "step": 4919 }, { "epoch": 7.948303715670436, "grad_norm": 0.7088296805706809, "learning_rate": 5.047313329796644e-07, "loss": 0.0068, "step": 4920 }, { "epoch": 7.949919224555735, "grad_norm": 1.1159448608534537, "learning_rate": 5.039646317373548e-07, "loss": 0.0137, "step": 4921 }, { "epoch": 7.951534733441034, "grad_norm": 0.4391005783823855, "learning_rate": 5.031984479798954e-07, "loss": 0.0048, "step": 4922 }, { "epoch": 7.953150242326333, "grad_norm": 0.976711815775905, "learning_rate": 5.024327819059244e-07, "loss": 0.0054, "step": 4923 }, { "epoch": 7.954765751211632, "grad_norm": 0.7696927021833446, "learning_rate": 5.016676337139459e-07, "loss": 0.0056, "step": 4924 }, { "epoch": 7.956381260096931, "grad_norm": 0.6300299547802273, "learning_rate": 5.009030036023301e-07, "loss": 0.0096, "step": 4925 }, { "epoch": 7.9579967689822295, "grad_norm": 0.6827628466108877, "learning_rate": 5.001388917693106e-07, "loss": 0.0064, "step": 4926 }, { "epoch": 7.959612277867528, "grad_norm": 0.5327101619622372, "learning_rate": 4.993752984129902e-07, "loss": 0.0065, "step": 4927 }, { "epoch": 7.961227786752827, "grad_norm": 0.29545864787325893, "learning_rate": 4.986122237313332e-07, "loss": 0.0037, "step": 4928 }, { "epoch": 7.962843295638126, "grad_norm": 0.481836062711605, "learning_rate": 4.978496679221726e-07, "loss": 0.0064, "step": 4929 }, { "epoch": 7.964458804523425, "grad_norm": 0.7243397027813128, "learning_rate": 4.970876311832059e-07, "loss": 0.0065, "step": 4930 }, { "epoch": 7.966074313408724, "grad_norm": 0.484876637270921, "learning_rate": 4.963261137119968e-07, "loss": 0.005, "step": 4931 }, { "epoch": 7.967689822294023, "grad_norm": 0.4332637072170807, "learning_rate": 4.955651157059718e-07, "loss": 0.0063, "step": 4932 }, { "epoch": 7.969305331179322, "grad_norm": 0.5295729607150099, "learning_rate": 4.948046373624252e-07, "loss": 0.0061, "step": 4933 }, { "epoch": 7.97092084006462, "grad_norm": 1.152154801147835, "learning_rate": 4.940446788785161e-07, "loss": 0.0083, "step": 4934 }, { "epoch": 7.9725363489499195, "grad_norm": 0.8033411369426783, "learning_rate": 4.932852404512689e-07, "loss": 0.0078, "step": 4935 }, { "epoch": 7.974151857835218, "grad_norm": 0.43446986275675414, "learning_rate": 4.92526322277572e-07, "loss": 0.0051, "step": 4936 }, { "epoch": 7.975767366720517, "grad_norm": 0.8583428053651121, "learning_rate": 4.917679245541798e-07, "loss": 0.006, "step": 4937 }, { "epoch": 7.977382875605816, "grad_norm": 0.6518162147674338, "learning_rate": 4.91010047477713e-07, "loss": 0.0065, "step": 4938 }, { "epoch": 7.978998384491114, "grad_norm": 0.6590748109702856, "learning_rate": 4.902526912446543e-07, "loss": 0.0061, "step": 4939 }, { "epoch": 7.980613893376414, "grad_norm": 0.6664691403444966, "learning_rate": 4.894958560513538e-07, "loss": 0.0076, "step": 4940 }, { "epoch": 7.982229402261712, "grad_norm": 0.904285321002074, "learning_rate": 4.887395420940261e-07, "loss": 0.0101, "step": 4941 }, { "epoch": 7.983844911147012, "grad_norm": 0.7594349008268265, "learning_rate": 4.879837495687509e-07, "loss": 0.0091, "step": 4942 }, { "epoch": 7.98546042003231, "grad_norm": 0.6753045735431386, "learning_rate": 4.872284786714707e-07, "loss": 0.0067, "step": 4943 }, { "epoch": 7.987075928917609, "grad_norm": 1.0058457387613147, "learning_rate": 4.86473729597996e-07, "loss": 0.0089, "step": 4944 }, { "epoch": 7.988691437802908, "grad_norm": 1.6211954491493998, "learning_rate": 4.857195025439981e-07, "loss": 0.0091, "step": 4945 }, { "epoch": 7.990306946688206, "grad_norm": 0.47227154131166815, "learning_rate": 4.849657977050176e-07, "loss": 0.0058, "step": 4946 }, { "epoch": 7.991922455573506, "grad_norm": 0.7081692709146415, "learning_rate": 4.842126152764556e-07, "loss": 0.0047, "step": 4947 }, { "epoch": 7.993537964458804, "grad_norm": 0.6771661552950011, "learning_rate": 4.834599554535808e-07, "loss": 0.0056, "step": 4948 }, { "epoch": 7.995153473344104, "grad_norm": 0.5984457395445814, "learning_rate": 4.827078184315234e-07, "loss": 0.0055, "step": 4949 }, { "epoch": 7.996768982229402, "grad_norm": 2.1058950726905503, "learning_rate": 4.819562044052811e-07, "loss": 0.0075, "step": 4950 }, { "epoch": 7.9983844911147015, "grad_norm": 0.9947126424858865, "learning_rate": 4.812051135697129e-07, "loss": 0.0081, "step": 4951 }, { "epoch": 8.0, "grad_norm": 0.45947563284051457, "learning_rate": 4.804545461195462e-07, "loss": 0.0048, "step": 4952 }, { "epoch": 8.0016155088853, "grad_norm": 0.35768361012313366, "learning_rate": 4.797045022493685e-07, "loss": 0.0046, "step": 4953 }, { "epoch": 8.003231017770597, "grad_norm": 0.3481785626482471, "learning_rate": 4.789549821536349e-07, "loss": 0.004, "step": 4954 }, { "epoch": 8.004846526655896, "grad_norm": 1.6264685902250984, "learning_rate": 4.782059860266617e-07, "loss": 0.0071, "step": 4955 }, { "epoch": 8.006462035541196, "grad_norm": 0.40740480576947236, "learning_rate": 4.774575140626317e-07, "loss": 0.0046, "step": 4956 }, { "epoch": 8.008077544426495, "grad_norm": 0.37102598004636, "learning_rate": 4.7670956645559096e-07, "loss": 0.004, "step": 4957 }, { "epoch": 8.009693053311793, "grad_norm": 0.354180428474573, "learning_rate": 4.759621433994502e-07, "loss": 0.0041, "step": 4958 }, { "epoch": 8.011308562197092, "grad_norm": 0.4152648200388243, "learning_rate": 4.752152450879827e-07, "loss": 0.0032, "step": 4959 }, { "epoch": 8.012924071082391, "grad_norm": 0.36757387186904894, "learning_rate": 4.7446887171482685e-07, "loss": 0.0039, "step": 4960 }, { "epoch": 8.014539579967689, "grad_norm": 0.3078550535247416, "learning_rate": 4.7372302347348463e-07, "loss": 0.003, "step": 4961 }, { "epoch": 8.016155088852988, "grad_norm": 0.28823461281597795, "learning_rate": 4.72977700557323e-07, "loss": 0.0051, "step": 4962 }, { "epoch": 8.017770597738288, "grad_norm": 0.47762873235671144, "learning_rate": 4.7223290315956987e-07, "loss": 0.0062, "step": 4963 }, { "epoch": 8.019386106623587, "grad_norm": 0.34884863637968966, "learning_rate": 4.7148863147331995e-07, "loss": 0.0037, "step": 4964 }, { "epoch": 8.021001615508885, "grad_norm": 0.2142776875745893, "learning_rate": 4.7074488569153033e-07, "loss": 0.0024, "step": 4965 }, { "epoch": 8.022617124394184, "grad_norm": 0.3484262003913781, "learning_rate": 4.700016660070211e-07, "loss": 0.0049, "step": 4966 }, { "epoch": 8.024232633279484, "grad_norm": 0.34659050146808756, "learning_rate": 4.692589726124772e-07, "loss": 0.0043, "step": 4967 }, { "epoch": 8.025848142164781, "grad_norm": 0.42412093592611394, "learning_rate": 4.6851680570044635e-07, "loss": 0.0042, "step": 4968 }, { "epoch": 8.02746365105008, "grad_norm": 0.23371244841622663, "learning_rate": 4.6777516546334085e-07, "loss": 0.0032, "step": 4969 }, { "epoch": 8.02907915993538, "grad_norm": 0.22750796570423862, "learning_rate": 4.6703405209343467e-07, "loss": 0.0027, "step": 4970 }, { "epoch": 8.03069466882068, "grad_norm": 0.3092607299561412, "learning_rate": 4.6629346578286686e-07, "loss": 0.004, "step": 4971 }, { "epoch": 8.032310177705977, "grad_norm": 0.5098933019249692, "learning_rate": 4.655534067236375e-07, "loss": 0.0029, "step": 4972 }, { "epoch": 8.033925686591276, "grad_norm": 0.6503823887712711, "learning_rate": 4.648138751076145e-07, "loss": 0.0051, "step": 4973 }, { "epoch": 8.035541195476576, "grad_norm": 0.7554512726794711, "learning_rate": 4.6407487112652363e-07, "loss": 0.0044, "step": 4974 }, { "epoch": 8.037156704361873, "grad_norm": 0.24790918914620208, "learning_rate": 4.633363949719577e-07, "loss": 0.0022, "step": 4975 }, { "epoch": 8.038772213247173, "grad_norm": 0.198291004464055, "learning_rate": 4.625984468353706e-07, "loss": 0.003, "step": 4976 }, { "epoch": 8.040387722132472, "grad_norm": 0.26011790002436225, "learning_rate": 4.618610269080809e-07, "loss": 0.0027, "step": 4977 }, { "epoch": 8.042003231017771, "grad_norm": 0.16959625225872846, "learning_rate": 4.611241353812673e-07, "loss": 0.0031, "step": 4978 }, { "epoch": 8.043618739903069, "grad_norm": 0.6528253597380227, "learning_rate": 4.603877724459768e-07, "loss": 0.0029, "step": 4979 }, { "epoch": 8.045234248788368, "grad_norm": 0.4814115218178998, "learning_rate": 4.596519382931139e-07, "loss": 0.006, "step": 4980 }, { "epoch": 8.046849757673668, "grad_norm": 0.18228373178121554, "learning_rate": 4.589166331134495e-07, "loss": 0.0027, "step": 4981 }, { "epoch": 8.048465266558965, "grad_norm": 0.3374614696235116, "learning_rate": 4.581818570976149e-07, "loss": 0.0038, "step": 4982 }, { "epoch": 8.050080775444265, "grad_norm": 0.7009293082449134, "learning_rate": 4.5744761043610616e-07, "loss": 0.005, "step": 4983 }, { "epoch": 8.051696284329564, "grad_norm": 0.6933396385420936, "learning_rate": 4.567138933192811e-07, "loss": 0.0049, "step": 4984 }, { "epoch": 8.053311793214863, "grad_norm": 0.5884070197682217, "learning_rate": 4.5598070593736165e-07, "loss": 0.0048, "step": 4985 }, { "epoch": 8.054927302100161, "grad_norm": 0.8008719294951903, "learning_rate": 4.5524804848042975e-07, "loss": 0.0054, "step": 4986 }, { "epoch": 8.05654281098546, "grad_norm": 0.32154294342117684, "learning_rate": 4.5451592113843287e-07, "loss": 0.0031, "step": 4987 }, { "epoch": 8.05815831987076, "grad_norm": 0.32593999922008327, "learning_rate": 4.537843241011783e-07, "loss": 0.0039, "step": 4988 }, { "epoch": 8.059773828756057, "grad_norm": 0.21303121384849635, "learning_rate": 4.5305325755833797e-07, "loss": 0.0024, "step": 4989 }, { "epoch": 8.061389337641357, "grad_norm": 0.47767580248722286, "learning_rate": 4.5232272169944554e-07, "loss": 0.0035, "step": 4990 }, { "epoch": 8.063004846526656, "grad_norm": 0.1987812924831376, "learning_rate": 4.515927167138975e-07, "loss": 0.0027, "step": 4991 }, { "epoch": 8.064620355411956, "grad_norm": 0.2814410290885687, "learning_rate": 4.508632427909512e-07, "loss": 0.0035, "step": 4992 }, { "epoch": 8.066235864297253, "grad_norm": 0.21550017488325804, "learning_rate": 4.501343001197281e-07, "loss": 0.0024, "step": 4993 }, { "epoch": 8.067851373182553, "grad_norm": 0.4670814578610595, "learning_rate": 4.494058888892108e-07, "loss": 0.0033, "step": 4994 }, { "epoch": 8.069466882067852, "grad_norm": 0.37462019319209294, "learning_rate": 4.486780092882456e-07, "loss": 0.0034, "step": 4995 }, { "epoch": 8.07108239095315, "grad_norm": 0.3564067097025944, "learning_rate": 4.4795066150553875e-07, "loss": 0.0031, "step": 4996 }, { "epoch": 8.072697899838449, "grad_norm": 0.26513887081699133, "learning_rate": 4.472238457296599e-07, "loss": 0.0039, "step": 4997 }, { "epoch": 8.074313408723748, "grad_norm": 0.44270829446316035, "learning_rate": 4.4649756214904167e-07, "loss": 0.0045, "step": 4998 }, { "epoch": 8.075928917609048, "grad_norm": 0.5008075717744261, "learning_rate": 4.4577181095197625e-07, "loss": 0.0056, "step": 4999 }, { "epoch": 8.077544426494345, "grad_norm": 0.30769832123060575, "learning_rate": 4.4504659232661977e-07, "loss": 0.0032, "step": 5000 }, { "epoch": 8.077544426494345, "eval_loss": 1.7820653915405273, "eval_runtime": 2.3566, "eval_samples_per_second": 127.3, "eval_steps_per_second": 2.97, "step": 5000 } ], "logging_steps": 1.0, "max_steps": 6190, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 285923766435840.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }