diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13755 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9994236311239195, + "eval_steps": 500, + "global_step": 3903, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001536983669548511, + "grad_norm": 25.51269023102587, + "learning_rate": 3.5805626598465474e-08, + "loss": 1.1796, + "step": 2 + }, + { + "epoch": 0.003073967339097022, + "grad_norm": 35.249053162120475, + "learning_rate": 7.161125319693095e-08, + "loss": 1.176, + "step": 4 + }, + { + "epoch": 0.004610951008645533, + "grad_norm": 40.978012392510365, + "learning_rate": 1.0741687979539642e-07, + "loss": 1.1898, + "step": 6 + }, + { + "epoch": 0.006147934678194044, + "grad_norm": 37.791685318747554, + "learning_rate": 1.432225063938619e-07, + "loss": 1.1456, + "step": 8 + }, + { + "epoch": 0.007684918347742555, + "grad_norm": 42.35883366195493, + "learning_rate": 1.7902813299232735e-07, + "loss": 1.2175, + "step": 10 + }, + { + "epoch": 0.009221902017291067, + "grad_norm": 48.99379222065678, + "learning_rate": 2.1483375959079283e-07, + "loss": 1.192, + "step": 12 + }, + { + "epoch": 0.010758885686839578, + "grad_norm": 44.18044117170443, + "learning_rate": 2.506393861892583e-07, + "loss": 1.1534, + "step": 14 + }, + { + "epoch": 0.012295869356388088, + "grad_norm": 53.20894364313595, + "learning_rate": 2.864450127877238e-07, + "loss": 1.1731, + "step": 16 + }, + { + "epoch": 0.0138328530259366, + "grad_norm": 24.63444222442823, + "learning_rate": 3.2225063938618927e-07, + "loss": 1.1526, + "step": 18 + }, + { + "epoch": 0.01536983669548511, + "grad_norm": 26.218955749561335, + "learning_rate": 3.580562659846547e-07, + "loss": 1.1429, + "step": 20 + }, + { + "epoch": 0.01690682036503362, + "grad_norm": 24.488025247790667, + "learning_rate": 3.938618925831202e-07, + "loss": 1.103, + "step": 22 + }, + { + "epoch": 0.018443804034582133, + "grad_norm": 36.781877179954705, + "learning_rate": 4.2966751918158566e-07, + "loss": 1.119, + "step": 24 + }, + { + "epoch": 0.019980787704130643, + "grad_norm": 20.069997876411783, + "learning_rate": 4.654731457800512e-07, + "loss": 1.0916, + "step": 26 + }, + { + "epoch": 0.021517771373679155, + "grad_norm": 18.663749580298237, + "learning_rate": 5.012787723785166e-07, + "loss": 1.0403, + "step": 28 + }, + { + "epoch": 0.023054755043227664, + "grad_norm": 30.587706942463424, + "learning_rate": 5.37084398976982e-07, + "loss": 0.9996, + "step": 30 + }, + { + "epoch": 0.024591738712776177, + "grad_norm": 25.127131893569143, + "learning_rate": 5.728900255754476e-07, + "loss": 0.9013, + "step": 32 + }, + { + "epoch": 0.02612872238232469, + "grad_norm": 24.505390983983318, + "learning_rate": 6.08695652173913e-07, + "loss": 0.8338, + "step": 34 + }, + { + "epoch": 0.0276657060518732, + "grad_norm": 28.03624180091526, + "learning_rate": 6.445012787723785e-07, + "loss": 0.7863, + "step": 36 + }, + { + "epoch": 0.02920268972142171, + "grad_norm": 18.04703186938572, + "learning_rate": 6.80306905370844e-07, + "loss": 0.8265, + "step": 38 + }, + { + "epoch": 0.03073967339097022, + "grad_norm": 19.463425683379242, + "learning_rate": 7.161125319693094e-07, + "loss": 0.775, + "step": 40 + }, + { + "epoch": 0.03227665706051873, + "grad_norm": 17.677748750715033, + "learning_rate": 7.519181585677749e-07, + "loss": 0.6703, + "step": 42 + }, + { + "epoch": 0.03381364073006724, + "grad_norm": 10.700820532554927, + "learning_rate": 7.877237851662404e-07, + "loss": 0.5609, + "step": 44 + }, + { + "epoch": 0.035350624399615754, + "grad_norm": 8.475129109313396, + "learning_rate": 8.235294117647059e-07, + "loss": 0.4812, + "step": 46 + }, + { + "epoch": 0.03688760806916427, + "grad_norm": 8.038325889019223, + "learning_rate": 8.593350383631713e-07, + "loss": 0.5001, + "step": 48 + }, + { + "epoch": 0.03842459173871278, + "grad_norm": 5.218235069976644, + "learning_rate": 8.951406649616369e-07, + "loss": 0.4479, + "step": 50 + }, + { + "epoch": 0.039961575408261285, + "grad_norm": 4.835373589811063, + "learning_rate": 9.309462915601024e-07, + "loss": 0.4323, + "step": 52 + }, + { + "epoch": 0.0414985590778098, + "grad_norm": 6.306114301931914, + "learning_rate": 9.667519181585676e-07, + "loss": 0.4035, + "step": 54 + }, + { + "epoch": 0.04303554274735831, + "grad_norm": 4.0054619844059, + "learning_rate": 1.0025575447570332e-06, + "loss": 0.3893, + "step": 56 + }, + { + "epoch": 0.04457252641690682, + "grad_norm": 6.071833866129103, + "learning_rate": 1.0383631713554987e-06, + "loss": 0.3929, + "step": 58 + }, + { + "epoch": 0.04610951008645533, + "grad_norm": 6.484184007577541, + "learning_rate": 1.074168797953964e-06, + "loss": 0.3561, + "step": 60 + }, + { + "epoch": 0.04764649375600384, + "grad_norm": 3.3031873501656173, + "learning_rate": 1.1099744245524297e-06, + "loss": 0.3598, + "step": 62 + }, + { + "epoch": 0.049183477425552354, + "grad_norm": 3.1979226953240896, + "learning_rate": 1.1457800511508952e-06, + "loss": 0.3505, + "step": 64 + }, + { + "epoch": 0.050720461095100866, + "grad_norm": 3.2686119354900667, + "learning_rate": 1.1815856777493606e-06, + "loss": 0.3402, + "step": 66 + }, + { + "epoch": 0.05225744476464938, + "grad_norm": 3.4755981271030425, + "learning_rate": 1.217391304347826e-06, + "loss": 0.3354, + "step": 68 + }, + { + "epoch": 0.053794428434197884, + "grad_norm": 3.7786309539549294, + "learning_rate": 1.2531969309462915e-06, + "loss": 0.3222, + "step": 70 + }, + { + "epoch": 0.0553314121037464, + "grad_norm": 3.189215314687861, + "learning_rate": 1.289002557544757e-06, + "loss": 0.3269, + "step": 72 + }, + { + "epoch": 0.05686839577329491, + "grad_norm": 3.231319202267947, + "learning_rate": 1.3248081841432225e-06, + "loss": 0.317, + "step": 74 + }, + { + "epoch": 0.05840537944284342, + "grad_norm": 3.4059003373513526, + "learning_rate": 1.360613810741688e-06, + "loss": 0.3116, + "step": 76 + }, + { + "epoch": 0.05994236311239193, + "grad_norm": 3.118126356582086, + "learning_rate": 1.3964194373401534e-06, + "loss": 0.3138, + "step": 78 + }, + { + "epoch": 0.06147934678194044, + "grad_norm": 3.4693555336883417, + "learning_rate": 1.4322250639386188e-06, + "loss": 0.3019, + "step": 80 + }, + { + "epoch": 0.06301633045148895, + "grad_norm": 2.970297866234449, + "learning_rate": 1.4680306905370844e-06, + "loss": 0.293, + "step": 82 + }, + { + "epoch": 0.06455331412103746, + "grad_norm": 2.9935089318553216, + "learning_rate": 1.5038363171355499e-06, + "loss": 0.2807, + "step": 84 + }, + { + "epoch": 0.06609029779058598, + "grad_norm": 2.9320835508202747, + "learning_rate": 1.5396419437340153e-06, + "loss": 0.2981, + "step": 86 + }, + { + "epoch": 0.06762728146013448, + "grad_norm": 2.7902350790145958, + "learning_rate": 1.5754475703324807e-06, + "loss": 0.2882, + "step": 88 + }, + { + "epoch": 0.069164265129683, + "grad_norm": 2.913110266507068, + "learning_rate": 1.6112531969309462e-06, + "loss": 0.2963, + "step": 90 + }, + { + "epoch": 0.07070124879923151, + "grad_norm": 3.0624887135891905, + "learning_rate": 1.6470588235294118e-06, + "loss": 0.2841, + "step": 92 + }, + { + "epoch": 0.07223823246878001, + "grad_norm": 2.9694480736928193, + "learning_rate": 1.6828644501278772e-06, + "loss": 0.2745, + "step": 94 + }, + { + "epoch": 0.07377521613832853, + "grad_norm": 2.9922295547415168, + "learning_rate": 1.7186700767263426e-06, + "loss": 0.2702, + "step": 96 + }, + { + "epoch": 0.07531219980787704, + "grad_norm": 2.909619857546067, + "learning_rate": 1.754475703324808e-06, + "loss": 0.2731, + "step": 98 + }, + { + "epoch": 0.07684918347742556, + "grad_norm": 2.894052835979119, + "learning_rate": 1.7902813299232737e-06, + "loss": 0.28, + "step": 100 + }, + { + "epoch": 0.07838616714697406, + "grad_norm": 3.0298785356363256, + "learning_rate": 1.8260869565217391e-06, + "loss": 0.2604, + "step": 102 + }, + { + "epoch": 0.07992315081652257, + "grad_norm": 2.7510733379017367, + "learning_rate": 1.8618925831202048e-06, + "loss": 0.2745, + "step": 104 + }, + { + "epoch": 0.08146013448607109, + "grad_norm": 2.8755196777892267, + "learning_rate": 1.89769820971867e-06, + "loss": 0.2649, + "step": 106 + }, + { + "epoch": 0.0829971181556196, + "grad_norm": 2.7823559727995715, + "learning_rate": 1.933503836317135e-06, + "loss": 0.2617, + "step": 108 + }, + { + "epoch": 0.0845341018251681, + "grad_norm": 2.777608095721294, + "learning_rate": 1.9693094629156013e-06, + "loss": 0.2421, + "step": 110 + }, + { + "epoch": 0.08607108549471662, + "grad_norm": 2.8340487973855075, + "learning_rate": 2.0051150895140665e-06, + "loss": 0.2725, + "step": 112 + }, + { + "epoch": 0.08760806916426513, + "grad_norm": 2.7460631775841913, + "learning_rate": 2.040920716112532e-06, + "loss": 0.2553, + "step": 114 + }, + { + "epoch": 0.08914505283381365, + "grad_norm": 2.930465300765411, + "learning_rate": 2.0767263427109973e-06, + "loss": 0.2563, + "step": 116 + }, + { + "epoch": 0.09068203650336215, + "grad_norm": 2.9408506186054297, + "learning_rate": 2.1125319693094626e-06, + "loss": 0.2522, + "step": 118 + }, + { + "epoch": 0.09221902017291066, + "grad_norm": 2.9075494103912516, + "learning_rate": 2.148337595907928e-06, + "loss": 0.252, + "step": 120 + }, + { + "epoch": 0.09375600384245918, + "grad_norm": 2.772712298254247, + "learning_rate": 2.184143222506394e-06, + "loss": 0.2443, + "step": 122 + }, + { + "epoch": 0.09529298751200768, + "grad_norm": 2.9572827571923654, + "learning_rate": 2.2199488491048595e-06, + "loss": 0.2371, + "step": 124 + }, + { + "epoch": 0.0968299711815562, + "grad_norm": 2.939257511565131, + "learning_rate": 2.2557544757033247e-06, + "loss": 0.2553, + "step": 126 + }, + { + "epoch": 0.09836695485110471, + "grad_norm": 2.8729850658382055, + "learning_rate": 2.2915601023017903e-06, + "loss": 0.2468, + "step": 128 + }, + { + "epoch": 0.09990393852065321, + "grad_norm": 2.7047070980502474, + "learning_rate": 2.3273657289002556e-06, + "loss": 0.2314, + "step": 130 + }, + { + "epoch": 0.10144092219020173, + "grad_norm": 2.7576396912344876, + "learning_rate": 2.363171355498721e-06, + "loss": 0.231, + "step": 132 + }, + { + "epoch": 0.10297790585975024, + "grad_norm": 2.9300704934262054, + "learning_rate": 2.398976982097187e-06, + "loss": 0.2386, + "step": 134 + }, + { + "epoch": 0.10451488952929876, + "grad_norm": 2.630373862177306, + "learning_rate": 2.434782608695652e-06, + "loss": 0.2259, + "step": 136 + }, + { + "epoch": 0.10605187319884726, + "grad_norm": 2.7320707506261286, + "learning_rate": 2.4705882352941177e-06, + "loss": 0.2393, + "step": 138 + }, + { + "epoch": 0.10758885686839577, + "grad_norm": 2.7386829967371384, + "learning_rate": 2.506393861892583e-06, + "loss": 0.2391, + "step": 140 + }, + { + "epoch": 0.10912584053794429, + "grad_norm": 2.5434808295139653, + "learning_rate": 2.5421994884910485e-06, + "loss": 0.2254, + "step": 142 + }, + { + "epoch": 0.1106628242074928, + "grad_norm": 2.59627848497483, + "learning_rate": 2.578005115089514e-06, + "loss": 0.2323, + "step": 144 + }, + { + "epoch": 0.11219980787704131, + "grad_norm": 2.6102361814153836, + "learning_rate": 2.6138107416879794e-06, + "loss": 0.2212, + "step": 146 + }, + { + "epoch": 0.11373679154658982, + "grad_norm": 2.4837198822431157, + "learning_rate": 2.649616368286445e-06, + "loss": 0.2174, + "step": 148 + }, + { + "epoch": 0.11527377521613832, + "grad_norm": 2.487984443326589, + "learning_rate": 2.6854219948849103e-06, + "loss": 0.2171, + "step": 150 + }, + { + "epoch": 0.11681075888568684, + "grad_norm": 2.491267047584632, + "learning_rate": 2.721227621483376e-06, + "loss": 0.2303, + "step": 152 + }, + { + "epoch": 0.11834774255523535, + "grad_norm": 2.2658939217150302, + "learning_rate": 2.7570332480818415e-06, + "loss": 0.211, + "step": 154 + }, + { + "epoch": 0.11988472622478386, + "grad_norm": 2.2884352286474727, + "learning_rate": 2.7928388746803067e-06, + "loss": 0.2155, + "step": 156 + }, + { + "epoch": 0.12142170989433237, + "grad_norm": 2.175477310058195, + "learning_rate": 2.8286445012787724e-06, + "loss": 0.2101, + "step": 158 + }, + { + "epoch": 0.12295869356388088, + "grad_norm": 2.04586024150522, + "learning_rate": 2.8644501278772376e-06, + "loss": 0.2072, + "step": 160 + }, + { + "epoch": 0.1244956772334294, + "grad_norm": 1.937221714698271, + "learning_rate": 2.9002557544757037e-06, + "loss": 0.2128, + "step": 162 + }, + { + "epoch": 0.1260326609029779, + "grad_norm": 1.9127369194130952, + "learning_rate": 2.936061381074169e-06, + "loss": 0.2164, + "step": 164 + }, + { + "epoch": 0.12756964457252642, + "grad_norm": 1.9467054527515646, + "learning_rate": 2.971867007672634e-06, + "loss": 0.216, + "step": 166 + }, + { + "epoch": 0.12910662824207492, + "grad_norm": 1.7444615717696514, + "learning_rate": 3.0076726342710997e-06, + "loss": 0.185, + "step": 168 + }, + { + "epoch": 0.13064361191162344, + "grad_norm": 1.721455823281585, + "learning_rate": 3.043478260869565e-06, + "loss": 0.2041, + "step": 170 + }, + { + "epoch": 0.13218059558117196, + "grad_norm": 1.6944968403589584, + "learning_rate": 3.0792838874680306e-06, + "loss": 0.2001, + "step": 172 + }, + { + "epoch": 0.13371757925072045, + "grad_norm": 1.5902843900468329, + "learning_rate": 3.1150895140664962e-06, + "loss": 0.1969, + "step": 174 + }, + { + "epoch": 0.13525456292026897, + "grad_norm": 1.564694037006993, + "learning_rate": 3.1508951406649614e-06, + "loss": 0.1945, + "step": 176 + }, + { + "epoch": 0.1367915465898175, + "grad_norm": 1.5404270105425055, + "learning_rate": 3.186700767263427e-06, + "loss": 0.1919, + "step": 178 + }, + { + "epoch": 0.138328530259366, + "grad_norm": 1.4600068175795704, + "learning_rate": 3.2225063938618923e-06, + "loss": 0.2144, + "step": 180 + }, + { + "epoch": 0.1398655139289145, + "grad_norm": 1.4972009484455417, + "learning_rate": 3.258312020460358e-06, + "loss": 0.2029, + "step": 182 + }, + { + "epoch": 0.14140249759846302, + "grad_norm": 1.5216950664348157, + "learning_rate": 3.2941176470588236e-06, + "loss": 0.1997, + "step": 184 + }, + { + "epoch": 0.14293948126801154, + "grad_norm": 1.4985918972641317, + "learning_rate": 3.3299232736572892e-06, + "loss": 0.2036, + "step": 186 + }, + { + "epoch": 0.14447646493756003, + "grad_norm": 1.4780907494983677, + "learning_rate": 3.3657289002557544e-06, + "loss": 0.1981, + "step": 188 + }, + { + "epoch": 0.14601344860710855, + "grad_norm": 1.3886567101627987, + "learning_rate": 3.4015345268542197e-06, + "loss": 0.185, + "step": 190 + }, + { + "epoch": 0.14755043227665707, + "grad_norm": 1.398832954652927, + "learning_rate": 3.4373401534526853e-06, + "loss": 0.2029, + "step": 192 + }, + { + "epoch": 0.14908741594620556, + "grad_norm": 1.4597381544671573, + "learning_rate": 3.473145780051151e-06, + "loss": 0.1881, + "step": 194 + }, + { + "epoch": 0.15062439961575408, + "grad_norm": 1.3840318868108508, + "learning_rate": 3.508951406649616e-06, + "loss": 0.2078, + "step": 196 + }, + { + "epoch": 0.1521613832853026, + "grad_norm": 1.4070697917261799, + "learning_rate": 3.544757033248082e-06, + "loss": 0.1974, + "step": 198 + }, + { + "epoch": 0.15369836695485112, + "grad_norm": 1.361393353227654, + "learning_rate": 3.5805626598465474e-06, + "loss": 0.197, + "step": 200 + }, + { + "epoch": 0.1552353506243996, + "grad_norm": 1.434775426937055, + "learning_rate": 3.6163682864450126e-06, + "loss": 0.2038, + "step": 202 + }, + { + "epoch": 0.15677233429394813, + "grad_norm": 1.5065947786249383, + "learning_rate": 3.6521739130434783e-06, + "loss": 0.2037, + "step": 204 + }, + { + "epoch": 0.15830931796349665, + "grad_norm": 1.3442194224119404, + "learning_rate": 3.6879795396419435e-06, + "loss": 0.1964, + "step": 206 + }, + { + "epoch": 0.15984630163304514, + "grad_norm": 1.3670203145371058, + "learning_rate": 3.7237851662404096e-06, + "loss": 0.195, + "step": 208 + }, + { + "epoch": 0.16138328530259366, + "grad_norm": 1.4035980721723869, + "learning_rate": 3.7595907928388748e-06, + "loss": 0.1961, + "step": 210 + }, + { + "epoch": 0.16292026897214218, + "grad_norm": 1.30606746520172, + "learning_rate": 3.79539641943734e-06, + "loss": 0.1877, + "step": 212 + }, + { + "epoch": 0.16445725264169067, + "grad_norm": 1.4103433650854988, + "learning_rate": 3.831202046035806e-06, + "loss": 0.1941, + "step": 214 + }, + { + "epoch": 0.1659942363112392, + "grad_norm": 1.394026552172561, + "learning_rate": 3.86700767263427e-06, + "loss": 0.1922, + "step": 216 + }, + { + "epoch": 0.1675312199807877, + "grad_norm": 1.3829946302657223, + "learning_rate": 3.902813299232737e-06, + "loss": 0.2048, + "step": 218 + }, + { + "epoch": 0.1690682036503362, + "grad_norm": 1.3276260104745685, + "learning_rate": 3.9386189258312025e-06, + "loss": 0.1892, + "step": 220 + }, + { + "epoch": 0.17060518731988472, + "grad_norm": 1.3432125610119714, + "learning_rate": 3.974424552429667e-06, + "loss": 0.1892, + "step": 222 + }, + { + "epoch": 0.17214217098943324, + "grad_norm": 1.3597738268768664, + "learning_rate": 4.010230179028133e-06, + "loss": 0.1933, + "step": 224 + }, + { + "epoch": 0.17367915465898176, + "grad_norm": 1.3784514898304991, + "learning_rate": 4.046035805626598e-06, + "loss": 0.187, + "step": 226 + }, + { + "epoch": 0.17521613832853025, + "grad_norm": 1.283311780182546, + "learning_rate": 4.081841432225064e-06, + "loss": 0.1889, + "step": 228 + }, + { + "epoch": 0.17675312199807877, + "grad_norm": 1.3048663711388646, + "learning_rate": 4.11764705882353e-06, + "loss": 0.1958, + "step": 230 + }, + { + "epoch": 0.1782901056676273, + "grad_norm": 1.3644400089186173, + "learning_rate": 4.153452685421995e-06, + "loss": 0.1939, + "step": 232 + }, + { + "epoch": 0.17982708933717578, + "grad_norm": 1.2592600425837408, + "learning_rate": 4.18925831202046e-06, + "loss": 0.2031, + "step": 234 + }, + { + "epoch": 0.1813640730067243, + "grad_norm": 30.200615105516786, + "learning_rate": 4.225063938618925e-06, + "loss": 0.1891, + "step": 236 + }, + { + "epoch": 0.18290105667627282, + "grad_norm": 1.4965751336406534, + "learning_rate": 4.260869565217392e-06, + "loss": 0.1878, + "step": 238 + }, + { + "epoch": 0.1844380403458213, + "grad_norm": 5.563286529626895, + "learning_rate": 4.296675191815856e-06, + "loss": 0.1982, + "step": 240 + }, + { + "epoch": 0.18597502401536983, + "grad_norm": 1.4805149402757245, + "learning_rate": 4.332480818414322e-06, + "loss": 0.1942, + "step": 242 + }, + { + "epoch": 0.18751200768491835, + "grad_norm": 1.6629859442097377, + "learning_rate": 4.368286445012788e-06, + "loss": 0.2067, + "step": 244 + }, + { + "epoch": 0.18904899135446687, + "grad_norm": 9.155421397230691, + "learning_rate": 4.4040920716112525e-06, + "loss": 0.1872, + "step": 246 + }, + { + "epoch": 0.19058597502401536, + "grad_norm": 1.32114090472292, + "learning_rate": 4.439897698209719e-06, + "loss": 0.1844, + "step": 248 + }, + { + "epoch": 0.19212295869356388, + "grad_norm": 1.2320314387569729, + "learning_rate": 4.475703324808184e-06, + "loss": 0.1791, + "step": 250 + }, + { + "epoch": 0.1936599423631124, + "grad_norm": 1.2291776990139922, + "learning_rate": 4.511508951406649e-06, + "loss": 0.1799, + "step": 252 + }, + { + "epoch": 0.1951969260326609, + "grad_norm": 1.270813260985118, + "learning_rate": 4.547314578005115e-06, + "loss": 0.1877, + "step": 254 + }, + { + "epoch": 0.19673390970220941, + "grad_norm": 1.3202011345679392, + "learning_rate": 4.583120204603581e-06, + "loss": 0.2013, + "step": 256 + }, + { + "epoch": 0.19827089337175793, + "grad_norm": 1.3290195976171792, + "learning_rate": 4.618925831202046e-06, + "loss": 0.1915, + "step": 258 + }, + { + "epoch": 0.19980787704130643, + "grad_norm": 1.2091766956810812, + "learning_rate": 4.654731457800511e-06, + "loss": 0.1847, + "step": 260 + }, + { + "epoch": 0.20134486071085494, + "grad_norm": 1.3107091320741244, + "learning_rate": 4.690537084398977e-06, + "loss": 0.1844, + "step": 262 + }, + { + "epoch": 0.20288184438040346, + "grad_norm": 1.448237078882646, + "learning_rate": 4.726342710997442e-06, + "loss": 0.1948, + "step": 264 + }, + { + "epoch": 0.20441882804995196, + "grad_norm": 1.2336655516702122, + "learning_rate": 4.762148337595908e-06, + "loss": 0.1949, + "step": 266 + }, + { + "epoch": 0.20595581171950048, + "grad_norm": 1.3658099392493064, + "learning_rate": 4.797953964194374e-06, + "loss": 0.1804, + "step": 268 + }, + { + "epoch": 0.207492795389049, + "grad_norm": 1.373403969669977, + "learning_rate": 4.8337595907928385e-06, + "loss": 0.1905, + "step": 270 + }, + { + "epoch": 0.20902977905859751, + "grad_norm": 1.277467955296089, + "learning_rate": 4.869565217391304e-06, + "loss": 0.1987, + "step": 272 + }, + { + "epoch": 0.210566762728146, + "grad_norm": 1.3159132642953701, + "learning_rate": 4.90537084398977e-06, + "loss": 0.1957, + "step": 274 + }, + { + "epoch": 0.21210374639769453, + "grad_norm": 1.278799705973987, + "learning_rate": 4.941176470588235e-06, + "loss": 0.1903, + "step": 276 + }, + { + "epoch": 0.21364073006724305, + "grad_norm": 1.363154755717981, + "learning_rate": 4.976982097186701e-06, + "loss": 0.181, + "step": 278 + }, + { + "epoch": 0.21517771373679154, + "grad_norm": 1.226652497702008, + "learning_rate": 5.012787723785166e-06, + "loss": 0.185, + "step": 280 + }, + { + "epoch": 0.21671469740634006, + "grad_norm": 1.2660471639800883, + "learning_rate": 5.0485933503836314e-06, + "loss": 0.184, + "step": 282 + }, + { + "epoch": 0.21825168107588858, + "grad_norm": 1.204935258599295, + "learning_rate": 5.084398976982097e-06, + "loss": 0.1816, + "step": 284 + }, + { + "epoch": 0.21978866474543707, + "grad_norm": 1.2808963980429877, + "learning_rate": 5.120204603580563e-06, + "loss": 0.1911, + "step": 286 + }, + { + "epoch": 0.2213256484149856, + "grad_norm": 1.194046158945937, + "learning_rate": 5.156010230179028e-06, + "loss": 0.1838, + "step": 288 + }, + { + "epoch": 0.2228626320845341, + "grad_norm": 1.3438720779184878, + "learning_rate": 5.191815856777493e-06, + "loss": 0.1925, + "step": 290 + }, + { + "epoch": 0.22439961575408263, + "grad_norm": 1.1388013545101945, + "learning_rate": 5.227621483375959e-06, + "loss": 0.1732, + "step": 292 + }, + { + "epoch": 0.22593659942363112, + "grad_norm": 1.151418121106973, + "learning_rate": 5.2634271099744244e-06, + "loss": 0.1936, + "step": 294 + }, + { + "epoch": 0.22747358309317964, + "grad_norm": 1.205495348613184, + "learning_rate": 5.29923273657289e-06, + "loss": 0.1752, + "step": 296 + }, + { + "epoch": 0.22901056676272816, + "grad_norm": 1.2040139295608776, + "learning_rate": 5.335038363171356e-06, + "loss": 0.1873, + "step": 298 + }, + { + "epoch": 0.23054755043227665, + "grad_norm": 1.2348832685626474, + "learning_rate": 5.3708439897698205e-06, + "loss": 0.1898, + "step": 300 + }, + { + "epoch": 0.23208453410182517, + "grad_norm": 1.1372824585247818, + "learning_rate": 5.406649616368286e-06, + "loss": 0.1757, + "step": 302 + }, + { + "epoch": 0.2336215177713737, + "grad_norm": 1.231905059549878, + "learning_rate": 5.442455242966752e-06, + "loss": 0.1952, + "step": 304 + }, + { + "epoch": 0.23515850144092218, + "grad_norm": 1.1803349744715428, + "learning_rate": 5.478260869565217e-06, + "loss": 0.2001, + "step": 306 + }, + { + "epoch": 0.2366954851104707, + "grad_norm": 1.277265628143273, + "learning_rate": 5.514066496163683e-06, + "loss": 0.2008, + "step": 308 + }, + { + "epoch": 0.23823246878001922, + "grad_norm": 1.3927848001575907, + "learning_rate": 5.549872122762148e-06, + "loss": 0.1963, + "step": 310 + }, + { + "epoch": 0.2397694524495677, + "grad_norm": 1.2403730899787904, + "learning_rate": 5.5856777493606135e-06, + "loss": 0.1961, + "step": 312 + }, + { + "epoch": 0.24130643611911623, + "grad_norm": 1.163059501211908, + "learning_rate": 5.62148337595908e-06, + "loss": 0.175, + "step": 314 + }, + { + "epoch": 0.24284341978866475, + "grad_norm": 1.1639642316766559, + "learning_rate": 5.657289002557545e-06, + "loss": 0.1886, + "step": 316 + }, + { + "epoch": 0.24438040345821327, + "grad_norm": 1.2382118437766167, + "learning_rate": 5.69309462915601e-06, + "loss": 0.1973, + "step": 318 + }, + { + "epoch": 0.24591738712776176, + "grad_norm": 1.2074114040937993, + "learning_rate": 5.728900255754475e-06, + "loss": 0.1837, + "step": 320 + }, + { + "epoch": 0.24745437079731028, + "grad_norm": 1.295214887261411, + "learning_rate": 5.764705882352941e-06, + "loss": 0.1979, + "step": 322 + }, + { + "epoch": 0.2489913544668588, + "grad_norm": 1.2581192996807238, + "learning_rate": 5.800511508951407e-06, + "loss": 0.1789, + "step": 324 + }, + { + "epoch": 0.2505283381364073, + "grad_norm": 1.254789252666287, + "learning_rate": 5.836317135549872e-06, + "loss": 0.1941, + "step": 326 + }, + { + "epoch": 0.2520653218059558, + "grad_norm": 1.251310029743626, + "learning_rate": 5.872122762148338e-06, + "loss": 0.189, + "step": 328 + }, + { + "epoch": 0.25360230547550433, + "grad_norm": 1.1550514720366614, + "learning_rate": 5.9079283887468026e-06, + "loss": 0.1968, + "step": 330 + }, + { + "epoch": 0.25513928914505285, + "grad_norm": 1.1535871144055385, + "learning_rate": 5.943734015345268e-06, + "loss": 0.1844, + "step": 332 + }, + { + "epoch": 0.25667627281460137, + "grad_norm": 1.2208131864363314, + "learning_rate": 5.979539641943734e-06, + "loss": 0.1919, + "step": 334 + }, + { + "epoch": 0.25821325648414983, + "grad_norm": 1.1683405652224148, + "learning_rate": 6.0153452685421995e-06, + "loss": 0.1759, + "step": 336 + }, + { + "epoch": 0.25975024015369835, + "grad_norm": 1.1952164336575553, + "learning_rate": 6.051150895140665e-06, + "loss": 0.1795, + "step": 338 + }, + { + "epoch": 0.2612872238232469, + "grad_norm": 1.1820897323143373, + "learning_rate": 6.08695652173913e-06, + "loss": 0.19, + "step": 340 + }, + { + "epoch": 0.2628242074927954, + "grad_norm": 1.1800564956613375, + "learning_rate": 6.1227621483375955e-06, + "loss": 0.1978, + "step": 342 + }, + { + "epoch": 0.2643611911623439, + "grad_norm": 1.1467821141624814, + "learning_rate": 6.158567774936061e-06, + "loss": 0.1911, + "step": 344 + }, + { + "epoch": 0.26589817483189243, + "grad_norm": 1.1627264267097417, + "learning_rate": 6.194373401534527e-06, + "loss": 0.1921, + "step": 346 + }, + { + "epoch": 0.2674351585014409, + "grad_norm": 1.195438253587923, + "learning_rate": 6.2301790281329925e-06, + "loss": 0.1898, + "step": 348 + }, + { + "epoch": 0.2689721421709894, + "grad_norm": 1.1780466341960676, + "learning_rate": 6.265984654731457e-06, + "loss": 0.1792, + "step": 350 + }, + { + "epoch": 0.27050912584053793, + "grad_norm": 1.1731330647661304, + "learning_rate": 6.301790281329923e-06, + "loss": 0.1955, + "step": 352 + }, + { + "epoch": 0.27204610951008645, + "grad_norm": 1.1745568091917384, + "learning_rate": 6.3375959079283885e-06, + "loss": 0.1852, + "step": 354 + }, + { + "epoch": 0.273583093179635, + "grad_norm": 1.1783143021627482, + "learning_rate": 6.373401534526854e-06, + "loss": 0.1845, + "step": 356 + }, + { + "epoch": 0.2751200768491835, + "grad_norm": 1.1827170581568967, + "learning_rate": 6.40920716112532e-06, + "loss": 0.201, + "step": 358 + }, + { + "epoch": 0.276657060518732, + "grad_norm": 1.1115484126704642, + "learning_rate": 6.445012787723785e-06, + "loss": 0.17, + "step": 360 + }, + { + "epoch": 0.2781940441882805, + "grad_norm": 1.0326502834598414, + "learning_rate": 6.48081841432225e-06, + "loss": 0.186, + "step": 362 + }, + { + "epoch": 0.279731027857829, + "grad_norm": 1.259244016304707, + "learning_rate": 6.516624040920716e-06, + "loss": 0.1885, + "step": 364 + }, + { + "epoch": 0.2812680115273775, + "grad_norm": 1.1678776079796356, + "learning_rate": 6.5524296675191815e-06, + "loss": 0.1921, + "step": 366 + }, + { + "epoch": 0.28280499519692603, + "grad_norm": 1.0738638308920176, + "learning_rate": 6.588235294117647e-06, + "loss": 0.1707, + "step": 368 + }, + { + "epoch": 0.28434197886647455, + "grad_norm": 1.1594023002547749, + "learning_rate": 6.624040920716112e-06, + "loss": 0.1903, + "step": 370 + }, + { + "epoch": 0.2858789625360231, + "grad_norm": 1.1471065906349844, + "learning_rate": 6.6598465473145784e-06, + "loss": 0.194, + "step": 372 + }, + { + "epoch": 0.2874159462055716, + "grad_norm": 1.1901453575596241, + "learning_rate": 6.695652173913043e-06, + "loss": 0.1854, + "step": 374 + }, + { + "epoch": 0.28895292987512006, + "grad_norm": 1.1928859018793583, + "learning_rate": 6.731457800511509e-06, + "loss": 0.1906, + "step": 376 + }, + { + "epoch": 0.2904899135446686, + "grad_norm": 1.2778835898977514, + "learning_rate": 6.7672634271099745e-06, + "loss": 0.1871, + "step": 378 + }, + { + "epoch": 0.2920268972142171, + "grad_norm": 1.2992357301881277, + "learning_rate": 6.803069053708439e-06, + "loss": 0.2, + "step": 380 + }, + { + "epoch": 0.2935638808837656, + "grad_norm": 1.0621001143169835, + "learning_rate": 6.838874680306906e-06, + "loss": 0.1735, + "step": 382 + }, + { + "epoch": 0.29510086455331414, + "grad_norm": 1.1201679749220157, + "learning_rate": 6.874680306905371e-06, + "loss": 0.1736, + "step": 384 + }, + { + "epoch": 0.29663784822286265, + "grad_norm": 1.1884134035644247, + "learning_rate": 6.910485933503836e-06, + "loss": 0.1765, + "step": 386 + }, + { + "epoch": 0.2981748318924111, + "grad_norm": 1.0331590850791008, + "learning_rate": 6.946291560102302e-06, + "loss": 0.1751, + "step": 388 + }, + { + "epoch": 0.29971181556195964, + "grad_norm": 1.0474474582479285, + "learning_rate": 6.982097186700767e-06, + "loss": 0.1899, + "step": 390 + }, + { + "epoch": 0.30124879923150816, + "grad_norm": 1.1045197145968615, + "learning_rate": 6.999998599675296e-06, + "loss": 0.1873, + "step": 392 + }, + { + "epoch": 0.3027857829010567, + "grad_norm": 1.0461247560001137, + "learning_rate": 6.999987397084384e-06, + "loss": 0.1844, + "step": 394 + }, + { + "epoch": 0.3043227665706052, + "grad_norm": 1.074110109904979, + "learning_rate": 6.999964991938417e-06, + "loss": 0.1867, + "step": 396 + }, + { + "epoch": 0.3058597502401537, + "grad_norm": 1.1795081212239336, + "learning_rate": 6.999931384309108e-06, + "loss": 0.1927, + "step": 398 + }, + { + "epoch": 0.30739673390970224, + "grad_norm": 1.0767670064958226, + "learning_rate": 6.999886574304027e-06, + "loss": 0.1756, + "step": 400 + }, + { + "epoch": 0.3089337175792507, + "grad_norm": 1.1180599613087716, + "learning_rate": 6.999830562066599e-06, + "loss": 0.1777, + "step": 402 + }, + { + "epoch": 0.3104707012487992, + "grad_norm": 1.1283551526882591, + "learning_rate": 6.999763347776102e-06, + "loss": 0.1806, + "step": 404 + }, + { + "epoch": 0.31200768491834774, + "grad_norm": 1.2128389787316596, + "learning_rate": 6.999684931647677e-06, + "loss": 0.1989, + "step": 406 + }, + { + "epoch": 0.31354466858789626, + "grad_norm": 1.0231317030587057, + "learning_rate": 6.999595313932308e-06, + "loss": 0.1737, + "step": 408 + }, + { + "epoch": 0.3150816522574448, + "grad_norm": 1.2500344807446935, + "learning_rate": 6.999494494916842e-06, + "loss": 0.1906, + "step": 410 + }, + { + "epoch": 0.3166186359269933, + "grad_norm": 1.1142795967528607, + "learning_rate": 6.999382474923973e-06, + "loss": 0.1754, + "step": 412 + }, + { + "epoch": 0.31815561959654176, + "grad_norm": 1.0688175628422532, + "learning_rate": 6.999259254312248e-06, + "loss": 0.1738, + "step": 414 + }, + { + "epoch": 0.3196926032660903, + "grad_norm": 1.075624317236362, + "learning_rate": 6.999124833476066e-06, + "loss": 0.1785, + "step": 416 + }, + { + "epoch": 0.3212295869356388, + "grad_norm": 1.0898379845370674, + "learning_rate": 6.9989792128456716e-06, + "loss": 0.1992, + "step": 418 + }, + { + "epoch": 0.3227665706051873, + "grad_norm": 1.027575992474251, + "learning_rate": 6.998822392887159e-06, + "loss": 0.1773, + "step": 420 + }, + { + "epoch": 0.32430355427473584, + "grad_norm": 1.1417631076293755, + "learning_rate": 6.9986543741024684e-06, + "loss": 0.1905, + "step": 422 + }, + { + "epoch": 0.32584053794428436, + "grad_norm": 1.0666179275230614, + "learning_rate": 6.998475157029385e-06, + "loss": 0.1771, + "step": 424 + }, + { + "epoch": 0.3273775216138329, + "grad_norm": 1.178184842738573, + "learning_rate": 6.998284742241536e-06, + "loss": 0.2027, + "step": 426 + }, + { + "epoch": 0.32891450528338134, + "grad_norm": 1.1389042968070178, + "learning_rate": 6.99808313034839e-06, + "loss": 0.1761, + "step": 428 + }, + { + "epoch": 0.33045148895292986, + "grad_norm": 1.1267197030584097, + "learning_rate": 6.997870321995255e-06, + "loss": 0.1907, + "step": 430 + }, + { + "epoch": 0.3319884726224784, + "grad_norm": 1.07793801111443, + "learning_rate": 6.9976463178632756e-06, + "loss": 0.1826, + "step": 432 + }, + { + "epoch": 0.3335254562920269, + "grad_norm": 1.0872443642223155, + "learning_rate": 6.99741111866943e-06, + "loss": 0.1913, + "step": 434 + }, + { + "epoch": 0.3350624399615754, + "grad_norm": 1.1636846292387626, + "learning_rate": 6.997164725166531e-06, + "loss": 0.1891, + "step": 436 + }, + { + "epoch": 0.33659942363112394, + "grad_norm": 1.1189017028456263, + "learning_rate": 6.996907138143219e-06, + "loss": 0.178, + "step": 438 + }, + { + "epoch": 0.3381364073006724, + "grad_norm": 1.0292145655593512, + "learning_rate": 6.996638358423965e-06, + "loss": 0.1803, + "step": 440 + }, + { + "epoch": 0.3396733909702209, + "grad_norm": 1.0390558930376024, + "learning_rate": 6.996358386869064e-06, + "loss": 0.1881, + "step": 442 + }, + { + "epoch": 0.34121037463976944, + "grad_norm": 1.1559870900026559, + "learning_rate": 6.996067224374631e-06, + "loss": 0.1825, + "step": 444 + }, + { + "epoch": 0.34274735830931796, + "grad_norm": 1.1284033103165994, + "learning_rate": 6.995764871872603e-06, + "loss": 0.1985, + "step": 446 + }, + { + "epoch": 0.3442843419788665, + "grad_norm": 1.0182449603206558, + "learning_rate": 6.995451330330732e-06, + "loss": 0.1963, + "step": 448 + }, + { + "epoch": 0.345821325648415, + "grad_norm": 1.0080327819510069, + "learning_rate": 6.995126600752583e-06, + "loss": 0.189, + "step": 450 + }, + { + "epoch": 0.3473583093179635, + "grad_norm": 1.0876359081678704, + "learning_rate": 6.994790684177531e-06, + "loss": 0.1672, + "step": 452 + }, + { + "epoch": 0.348895292987512, + "grad_norm": 1.0950900793628788, + "learning_rate": 6.99444358168076e-06, + "loss": 0.196, + "step": 454 + }, + { + "epoch": 0.3504322766570605, + "grad_norm": 1.0087724624388277, + "learning_rate": 6.9940852943732534e-06, + "loss": 0.1779, + "step": 456 + }, + { + "epoch": 0.351969260326609, + "grad_norm": 1.0777984809693466, + "learning_rate": 6.993715823401798e-06, + "loss": 0.1867, + "step": 458 + }, + { + "epoch": 0.35350624399615754, + "grad_norm": 1.0121221411630168, + "learning_rate": 6.993335169948972e-06, + "loss": 0.1715, + "step": 460 + }, + { + "epoch": 0.35504322766570606, + "grad_norm": 1.011168047129122, + "learning_rate": 6.992943335233152e-06, + "loss": 0.1707, + "step": 462 + }, + { + "epoch": 0.3565802113352546, + "grad_norm": 1.0120465143177686, + "learning_rate": 6.992540320508498e-06, + "loss": 0.1875, + "step": 464 + }, + { + "epoch": 0.3581171950048031, + "grad_norm": 0.9550620727028469, + "learning_rate": 6.992126127064956e-06, + "loss": 0.1635, + "step": 466 + }, + { + "epoch": 0.35965417867435157, + "grad_norm": 1.1350686459542934, + "learning_rate": 6.9917007562282535e-06, + "loss": 0.1751, + "step": 468 + }, + { + "epoch": 0.3611911623439001, + "grad_norm": 1.1572256160083796, + "learning_rate": 6.991264209359891e-06, + "loss": 0.1883, + "step": 470 + }, + { + "epoch": 0.3627281460134486, + "grad_norm": 1.021485377857385, + "learning_rate": 6.9908164878571425e-06, + "loss": 0.1798, + "step": 472 + }, + { + "epoch": 0.3642651296829971, + "grad_norm": 0.9632560183914399, + "learning_rate": 6.99035759315305e-06, + "loss": 0.1655, + "step": 474 + }, + { + "epoch": 0.36580211335254564, + "grad_norm": 1.0013270641998204, + "learning_rate": 6.989887526716415e-06, + "loss": 0.1927, + "step": 476 + }, + { + "epoch": 0.36733909702209416, + "grad_norm": 0.9745686321374288, + "learning_rate": 6.9894062900517996e-06, + "loss": 0.1674, + "step": 478 + }, + { + "epoch": 0.3688760806916426, + "grad_norm": 1.0407042923473835, + "learning_rate": 6.988913884699518e-06, + "loss": 0.1925, + "step": 480 + }, + { + "epoch": 0.37041306436119115, + "grad_norm": 1.0062661091753247, + "learning_rate": 6.988410312235632e-06, + "loss": 0.1965, + "step": 482 + }, + { + "epoch": 0.37195004803073967, + "grad_norm": 1.1013102934828658, + "learning_rate": 6.987895574271948e-06, + "loss": 0.1998, + "step": 484 + }, + { + "epoch": 0.3734870317002882, + "grad_norm": 1.0597643936100356, + "learning_rate": 6.987369672456009e-06, + "loss": 0.1772, + "step": 486 + }, + { + "epoch": 0.3750240153698367, + "grad_norm": 0.9866253310752731, + "learning_rate": 6.986832608471089e-06, + "loss": 0.1673, + "step": 488 + }, + { + "epoch": 0.3765609990393852, + "grad_norm": 1.0602452001077722, + "learning_rate": 6.986284384036193e-06, + "loss": 0.1739, + "step": 490 + }, + { + "epoch": 0.37809798270893374, + "grad_norm": 1.041644897151479, + "learning_rate": 6.985725000906045e-06, + "loss": 0.1695, + "step": 492 + }, + { + "epoch": 0.3796349663784822, + "grad_norm": 0.952300652923444, + "learning_rate": 6.985154460871086e-06, + "loss": 0.1801, + "step": 494 + }, + { + "epoch": 0.3811719500480307, + "grad_norm": 0.93795302994956, + "learning_rate": 6.984572765757467e-06, + "loss": 0.1691, + "step": 496 + }, + { + "epoch": 0.38270893371757925, + "grad_norm": 0.9322450383167038, + "learning_rate": 6.983979917427043e-06, + "loss": 0.1789, + "step": 498 + }, + { + "epoch": 0.38424591738712777, + "grad_norm": 1.0051761272259416, + "learning_rate": 6.98337591777737e-06, + "loss": 0.1713, + "step": 500 + }, + { + "epoch": 0.38424591738712777, + "eval_loss": 0.16142325103282928, + "eval_runtime": 365.2501, + "eval_samples_per_second": 50.664, + "eval_steps_per_second": 6.335, + "step": 500 + }, + { + "epoch": 0.3857829010566763, + "grad_norm": 1.008256178397373, + "learning_rate": 6.982760768741694e-06, + "loss": 0.178, + "step": 502 + }, + { + "epoch": 0.3873198847262248, + "grad_norm": 0.9324814287025068, + "learning_rate": 6.982134472288947e-06, + "loss": 0.1633, + "step": 504 + }, + { + "epoch": 0.38885686839577327, + "grad_norm": 1.0289506080987343, + "learning_rate": 6.981497030423744e-06, + "loss": 0.1805, + "step": 506 + }, + { + "epoch": 0.3903938520653218, + "grad_norm": 0.9347810671889286, + "learning_rate": 6.980848445186369e-06, + "loss": 0.1727, + "step": 508 + }, + { + "epoch": 0.3919308357348703, + "grad_norm": 1.052017862885602, + "learning_rate": 6.980188718652778e-06, + "loss": 0.1797, + "step": 510 + }, + { + "epoch": 0.39346781940441883, + "grad_norm": 1.056395915175445, + "learning_rate": 6.9795178529345855e-06, + "loss": 0.186, + "step": 512 + }, + { + "epoch": 0.39500480307396735, + "grad_norm": 1.0283503364288544, + "learning_rate": 6.978835850179057e-06, + "loss": 0.1844, + "step": 514 + }, + { + "epoch": 0.39654178674351587, + "grad_norm": 0.9433928178661478, + "learning_rate": 6.978142712569109e-06, + "loss": 0.1719, + "step": 516 + }, + { + "epoch": 0.3980787704130644, + "grad_norm": 0.9623302440193999, + "learning_rate": 6.9774384423232945e-06, + "loss": 0.1613, + "step": 518 + }, + { + "epoch": 0.39961575408261285, + "grad_norm": 1.0414121844166375, + "learning_rate": 6.976723041695802e-06, + "loss": 0.1841, + "step": 520 + }, + { + "epoch": 0.40115273775216137, + "grad_norm": 1.0155405045344752, + "learning_rate": 6.9759965129764425e-06, + "loss": 0.1706, + "step": 522 + }, + { + "epoch": 0.4026897214217099, + "grad_norm": 1.0347436462429636, + "learning_rate": 6.975258858490648e-06, + "loss": 0.1898, + "step": 524 + }, + { + "epoch": 0.4042267050912584, + "grad_norm": 0.9905491545072838, + "learning_rate": 6.974510080599458e-06, + "loss": 0.1701, + "step": 526 + }, + { + "epoch": 0.40576368876080693, + "grad_norm": 1.0065784769120174, + "learning_rate": 6.973750181699518e-06, + "loss": 0.173, + "step": 528 + }, + { + "epoch": 0.40730067243035545, + "grad_norm": 1.0028920218026682, + "learning_rate": 6.972979164223069e-06, + "loss": 0.1815, + "step": 530 + }, + { + "epoch": 0.4088376560999039, + "grad_norm": 0.9926648764778461, + "learning_rate": 6.972197030637938e-06, + "loss": 0.1701, + "step": 532 + }, + { + "epoch": 0.41037463976945243, + "grad_norm": 0.9362580434331931, + "learning_rate": 6.971403783447532e-06, + "loss": 0.162, + "step": 534 + }, + { + "epoch": 0.41191162343900095, + "grad_norm": 1.0313345974784995, + "learning_rate": 6.97059942519083e-06, + "loss": 0.1793, + "step": 536 + }, + { + "epoch": 0.41344860710854947, + "grad_norm": 1.053038519408416, + "learning_rate": 6.969783958442376e-06, + "loss": 0.1744, + "step": 538 + }, + { + "epoch": 0.414985590778098, + "grad_norm": 1.09503010697989, + "learning_rate": 6.968957385812268e-06, + "loss": 0.1837, + "step": 540 + }, + { + "epoch": 0.4165225744476465, + "grad_norm": 0.9963387156020179, + "learning_rate": 6.968119709946151e-06, + "loss": 0.1824, + "step": 542 + }, + { + "epoch": 0.41805955811719503, + "grad_norm": 0.9881607706879252, + "learning_rate": 6.9672709335252075e-06, + "loss": 0.1864, + "step": 544 + }, + { + "epoch": 0.4195965417867435, + "grad_norm": 0.9849697365361947, + "learning_rate": 6.966411059266153e-06, + "loss": 0.1676, + "step": 546 + }, + { + "epoch": 0.421133525456292, + "grad_norm": 0.99285568761607, + "learning_rate": 6.965540089921224e-06, + "loss": 0.1788, + "step": 548 + }, + { + "epoch": 0.42267050912584053, + "grad_norm": 1.0020197625846368, + "learning_rate": 6.964658028278167e-06, + "loss": 0.1717, + "step": 550 + }, + { + "epoch": 0.42420749279538905, + "grad_norm": 1.0446372613390171, + "learning_rate": 6.963764877160232e-06, + "loss": 0.1724, + "step": 552 + }, + { + "epoch": 0.42574447646493757, + "grad_norm": 1.0489345870285778, + "learning_rate": 6.962860639426168e-06, + "loss": 0.1751, + "step": 554 + }, + { + "epoch": 0.4272814601344861, + "grad_norm": 0.9913965985343988, + "learning_rate": 6.9619453179702036e-06, + "loss": 0.1753, + "step": 556 + }, + { + "epoch": 0.42881844380403455, + "grad_norm": 0.9746372697415868, + "learning_rate": 6.9610189157220465e-06, + "loss": 0.1849, + "step": 558 + }, + { + "epoch": 0.4303554274735831, + "grad_norm": 1.0035507982106044, + "learning_rate": 6.960081435646872e-06, + "loss": 0.1887, + "step": 560 + }, + { + "epoch": 0.4318924111431316, + "grad_norm": 1.0189535936717342, + "learning_rate": 6.95913288074531e-06, + "loss": 0.1761, + "step": 562 + }, + { + "epoch": 0.4334293948126801, + "grad_norm": 1.0604859650996308, + "learning_rate": 6.958173254053442e-06, + "loss": 0.1698, + "step": 564 + }, + { + "epoch": 0.43496637848222863, + "grad_norm": 1.04513958829485, + "learning_rate": 6.957202558642782e-06, + "loss": 0.1842, + "step": 566 + }, + { + "epoch": 0.43650336215177715, + "grad_norm": 0.9420876517823673, + "learning_rate": 6.9562207976202775e-06, + "loss": 0.1614, + "step": 568 + }, + { + "epoch": 0.43804034582132567, + "grad_norm": 1.0001567101709503, + "learning_rate": 6.9552279741282916e-06, + "loss": 0.1692, + "step": 570 + }, + { + "epoch": 0.43957732949087414, + "grad_norm": 1.0163931461175468, + "learning_rate": 6.954224091344593e-06, + "loss": 0.1687, + "step": 572 + }, + { + "epoch": 0.44111431316042266, + "grad_norm": 0.98643009928601, + "learning_rate": 6.953209152482355e-06, + "loss": 0.1853, + "step": 574 + }, + { + "epoch": 0.4426512968299712, + "grad_norm": 0.9353513380661882, + "learning_rate": 6.952183160790133e-06, + "loss": 0.1675, + "step": 576 + }, + { + "epoch": 0.4441882804995197, + "grad_norm": 0.9020207899109111, + "learning_rate": 6.951146119551859e-06, + "loss": 0.1679, + "step": 578 + }, + { + "epoch": 0.4457252641690682, + "grad_norm": 0.9841310128546708, + "learning_rate": 6.950098032086837e-06, + "loss": 0.1694, + "step": 580 + }, + { + "epoch": 0.44726224783861673, + "grad_norm": 1.0255232655982072, + "learning_rate": 6.949038901749723e-06, + "loss": 0.1736, + "step": 582 + }, + { + "epoch": 0.44879923150816525, + "grad_norm": 0.9032841104351299, + "learning_rate": 6.947968731930519e-06, + "loss": 0.163, + "step": 584 + }, + { + "epoch": 0.4503362151777137, + "grad_norm": 0.9005602498337584, + "learning_rate": 6.946887526054563e-06, + "loss": 0.1593, + "step": 586 + }, + { + "epoch": 0.45187319884726224, + "grad_norm": 0.9439866081688895, + "learning_rate": 6.945795287582514e-06, + "loss": 0.172, + "step": 588 + }, + { + "epoch": 0.45341018251681076, + "grad_norm": 1.015175377754709, + "learning_rate": 6.9446920200103465e-06, + "loss": 0.1658, + "step": 590 + }, + { + "epoch": 0.4549471661863593, + "grad_norm": 0.9730430866278983, + "learning_rate": 6.943577726869334e-06, + "loss": 0.1607, + "step": 592 + }, + { + "epoch": 0.4564841498559078, + "grad_norm": 0.9619897655090097, + "learning_rate": 6.942452411726042e-06, + "loss": 0.1799, + "step": 594 + }, + { + "epoch": 0.4580211335254563, + "grad_norm": 0.9934784984855407, + "learning_rate": 6.941316078182312e-06, + "loss": 0.1692, + "step": 596 + }, + { + "epoch": 0.4595581171950048, + "grad_norm": 0.9337603705095543, + "learning_rate": 6.940168729875255e-06, + "loss": 0.1787, + "step": 598 + }, + { + "epoch": 0.4610951008645533, + "grad_norm": 0.9057632316451069, + "learning_rate": 6.939010370477235e-06, + "loss": 0.1713, + "step": 600 + }, + { + "epoch": 0.4626320845341018, + "grad_norm": 0.989678630378062, + "learning_rate": 6.9378410036958635e-06, + "loss": 0.1761, + "step": 602 + }, + { + "epoch": 0.46416906820365034, + "grad_norm": 0.9385046490524968, + "learning_rate": 6.936660633273979e-06, + "loss": 0.1745, + "step": 604 + }, + { + "epoch": 0.46570605187319886, + "grad_norm": 0.955462629075025, + "learning_rate": 6.935469262989644e-06, + "loss": 0.1711, + "step": 606 + }, + { + "epoch": 0.4672430355427474, + "grad_norm": 0.8795030512447493, + "learning_rate": 6.9342668966561245e-06, + "loss": 0.169, + "step": 608 + }, + { + "epoch": 0.4687800192122959, + "grad_norm": 0.9495870025972418, + "learning_rate": 6.933053538121886e-06, + "loss": 0.1682, + "step": 610 + }, + { + "epoch": 0.47031700288184436, + "grad_norm": 0.9330347326137807, + "learning_rate": 6.931829191270576e-06, + "loss": 0.1662, + "step": 612 + }, + { + "epoch": 0.4718539865513929, + "grad_norm": 0.9017675551372555, + "learning_rate": 6.930593860021012e-06, + "loss": 0.1628, + "step": 614 + }, + { + "epoch": 0.4733909702209414, + "grad_norm": 0.9466561162964194, + "learning_rate": 6.929347548327168e-06, + "loss": 0.1807, + "step": 616 + }, + { + "epoch": 0.4749279538904899, + "grad_norm": 0.9881577337000146, + "learning_rate": 6.928090260178169e-06, + "loss": 0.1872, + "step": 618 + }, + { + "epoch": 0.47646493756003844, + "grad_norm": 0.9646614031070989, + "learning_rate": 6.926821999598266e-06, + "loss": 0.1709, + "step": 620 + }, + { + "epoch": 0.47800192122958696, + "grad_norm": 0.9609197980971407, + "learning_rate": 6.9255427706468375e-06, + "loss": 0.1774, + "step": 622 + }, + { + "epoch": 0.4795389048991354, + "grad_norm": 0.9541380042708364, + "learning_rate": 6.92425257741836e-06, + "loss": 0.1777, + "step": 624 + }, + { + "epoch": 0.48107588856868394, + "grad_norm": 0.898383778954144, + "learning_rate": 6.922951424042412e-06, + "loss": 0.1691, + "step": 626 + }, + { + "epoch": 0.48261287223823246, + "grad_norm": 0.960885030497367, + "learning_rate": 6.921639314683648e-06, + "loss": 0.1576, + "step": 628 + }, + { + "epoch": 0.484149855907781, + "grad_norm": 0.9454433168177289, + "learning_rate": 6.92031625354179e-06, + "loss": 0.1796, + "step": 630 + }, + { + "epoch": 0.4856868395773295, + "grad_norm": 0.9852240424660407, + "learning_rate": 6.918982244851616e-06, + "loss": 0.171, + "step": 632 + }, + { + "epoch": 0.487223823246878, + "grad_norm": 1.0414345737302746, + "learning_rate": 6.917637292882944e-06, + "loss": 0.1776, + "step": 634 + }, + { + "epoch": 0.48876080691642654, + "grad_norm": 0.9675949339406987, + "learning_rate": 6.916281401940615e-06, + "loss": 0.1806, + "step": 636 + }, + { + "epoch": 0.490297790585975, + "grad_norm": 0.982314844842542, + "learning_rate": 6.914914576364487e-06, + "loss": 0.1785, + "step": 638 + }, + { + "epoch": 0.4918347742555235, + "grad_norm": 0.9775652590949767, + "learning_rate": 6.913536820529416e-06, + "loss": 0.1751, + "step": 640 + }, + { + "epoch": 0.49337175792507204, + "grad_norm": 0.8855730733866225, + "learning_rate": 6.912148138845241e-06, + "loss": 0.1659, + "step": 642 + }, + { + "epoch": 0.49490874159462056, + "grad_norm": 1.0573307642737217, + "learning_rate": 6.910748535756774e-06, + "loss": 0.1781, + "step": 644 + }, + { + "epoch": 0.4964457252641691, + "grad_norm": 0.948011615943263, + "learning_rate": 6.909338015743782e-06, + "loss": 0.1521, + "step": 646 + }, + { + "epoch": 0.4979827089337176, + "grad_norm": 1.006209266616953, + "learning_rate": 6.907916583320976e-06, + "loss": 0.1671, + "step": 648 + }, + { + "epoch": 0.49951969260326606, + "grad_norm": 0.9744730979666755, + "learning_rate": 6.906484243037992e-06, + "loss": 0.1777, + "step": 650 + }, + { + "epoch": 0.5010566762728146, + "grad_norm": 1.0140793456518158, + "learning_rate": 6.9050409994793835e-06, + "loss": 0.1724, + "step": 652 + }, + { + "epoch": 0.5025936599423632, + "grad_norm": 0.9953513391992594, + "learning_rate": 6.903586857264598e-06, + "loss": 0.1612, + "step": 654 + }, + { + "epoch": 0.5041306436119116, + "grad_norm": 0.9120318862748673, + "learning_rate": 6.9021218210479715e-06, + "loss": 0.1565, + "step": 656 + }, + { + "epoch": 0.5056676272814601, + "grad_norm": 0.8703090747149029, + "learning_rate": 6.900645895518703e-06, + "loss": 0.1616, + "step": 658 + }, + { + "epoch": 0.5072046109510087, + "grad_norm": 0.9188361211761223, + "learning_rate": 6.899159085400851e-06, + "loss": 0.1702, + "step": 660 + }, + { + "epoch": 0.5087415946205571, + "grad_norm": 0.9052706219772494, + "learning_rate": 6.897661395453309e-06, + "loss": 0.1671, + "step": 662 + }, + { + "epoch": 0.5102785782901057, + "grad_norm": 0.9790524922779839, + "learning_rate": 6.896152830469797e-06, + "loss": 0.1707, + "step": 664 + }, + { + "epoch": 0.5118155619596542, + "grad_norm": 0.9737405664828704, + "learning_rate": 6.894633395278839e-06, + "loss": 0.1749, + "step": 666 + }, + { + "epoch": 0.5133525456292027, + "grad_norm": 0.8786179004815331, + "learning_rate": 6.893103094743758e-06, + "loss": 0.1662, + "step": 668 + }, + { + "epoch": 0.5148895292987512, + "grad_norm": 0.9648594871932485, + "learning_rate": 6.891561933762648e-06, + "loss": 0.1777, + "step": 670 + }, + { + "epoch": 0.5164265129682997, + "grad_norm": 0.9273934239395568, + "learning_rate": 6.8900099172683675e-06, + "loss": 0.1762, + "step": 672 + }, + { + "epoch": 0.5179634966378482, + "grad_norm": 0.9132188955471342, + "learning_rate": 6.8884470502285195e-06, + "loss": 0.1638, + "step": 674 + }, + { + "epoch": 0.5195004803073967, + "grad_norm": 1.0216457284197233, + "learning_rate": 6.886873337645439e-06, + "loss": 0.1663, + "step": 676 + }, + { + "epoch": 0.5210374639769453, + "grad_norm": 0.9342404527623865, + "learning_rate": 6.885288784556172e-06, + "loss": 0.1692, + "step": 678 + }, + { + "epoch": 0.5225744476464937, + "grad_norm": 0.8975547579280767, + "learning_rate": 6.883693396032463e-06, + "loss": 0.1669, + "step": 680 + }, + { + "epoch": 0.5241114313160423, + "grad_norm": 0.8921639188810697, + "learning_rate": 6.88208717718074e-06, + "loss": 0.1665, + "step": 682 + }, + { + "epoch": 0.5256484149855908, + "grad_norm": 0.9687694730815803, + "learning_rate": 6.880470133142094e-06, + "loss": 0.1674, + "step": 684 + }, + { + "epoch": 0.5271853986551392, + "grad_norm": 0.9456241155960425, + "learning_rate": 6.878842269092263e-06, + "loss": 0.1685, + "step": 686 + }, + { + "epoch": 0.5287223823246878, + "grad_norm": 1.0077089550967064, + "learning_rate": 6.877203590241621e-06, + "loss": 0.1755, + "step": 688 + }, + { + "epoch": 0.5302593659942363, + "grad_norm": 0.9499550767483984, + "learning_rate": 6.875554101835156e-06, + "loss": 0.1727, + "step": 690 + }, + { + "epoch": 0.5317963496637849, + "grad_norm": 0.9677648466898067, + "learning_rate": 6.873893809152453e-06, + "loss": 0.1768, + "step": 692 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.9430584650028841, + "learning_rate": 6.872222717507679e-06, + "loss": 0.1736, + "step": 694 + }, + { + "epoch": 0.5348703170028818, + "grad_norm": 0.8724957576172335, + "learning_rate": 6.870540832249567e-06, + "loss": 0.1586, + "step": 696 + }, + { + "epoch": 0.5364073006724304, + "grad_norm": 0.8628795594957057, + "learning_rate": 6.868848158761398e-06, + "loss": 0.1617, + "step": 698 + }, + { + "epoch": 0.5379442843419788, + "grad_norm": 0.9609714958221978, + "learning_rate": 6.867144702460982e-06, + "loss": 0.1831, + "step": 700 + }, + { + "epoch": 0.5394812680115274, + "grad_norm": 0.9942806006570933, + "learning_rate": 6.865430468800642e-06, + "loss": 0.181, + "step": 702 + }, + { + "epoch": 0.5410182516810759, + "grad_norm": 0.9153357695300892, + "learning_rate": 6.863705463267197e-06, + "loss": 0.1616, + "step": 704 + }, + { + "epoch": 0.5425552353506244, + "grad_norm": 0.903669480913219, + "learning_rate": 6.861969691381943e-06, + "loss": 0.176, + "step": 706 + }, + { + "epoch": 0.5440922190201729, + "grad_norm": 0.88720714763806, + "learning_rate": 6.860223158700639e-06, + "loss": 0.17, + "step": 708 + }, + { + "epoch": 0.5456292026897214, + "grad_norm": 0.9051200845778055, + "learning_rate": 6.8584658708134825e-06, + "loss": 0.1686, + "step": 710 + }, + { + "epoch": 0.54716618635927, + "grad_norm": 0.9513875655430767, + "learning_rate": 6.856697833345101e-06, + "loss": 0.1723, + "step": 712 + }, + { + "epoch": 0.5487031700288184, + "grad_norm": 0.9099315275605531, + "learning_rate": 6.8549190519545206e-06, + "loss": 0.1641, + "step": 714 + }, + { + "epoch": 0.550240153698367, + "grad_norm": 0.8702057226720046, + "learning_rate": 6.8531295323351655e-06, + "loss": 0.1583, + "step": 716 + }, + { + "epoch": 0.5517771373679154, + "grad_norm": 0.9086133226107327, + "learning_rate": 6.851329280214823e-06, + "loss": 0.1637, + "step": 718 + }, + { + "epoch": 0.553314121037464, + "grad_norm": 0.908182677978134, + "learning_rate": 6.8495183013556365e-06, + "loss": 0.1596, + "step": 720 + }, + { + "epoch": 0.5548511047070125, + "grad_norm": 0.9856738002008828, + "learning_rate": 6.847696601554079e-06, + "loss": 0.1701, + "step": 722 + }, + { + "epoch": 0.556388088376561, + "grad_norm": 0.9119819563123349, + "learning_rate": 6.845864186640944e-06, + "loss": 0.1563, + "step": 724 + }, + { + "epoch": 0.5579250720461095, + "grad_norm": 0.9080236111588762, + "learning_rate": 6.844021062481314e-06, + "loss": 0.1753, + "step": 726 + }, + { + "epoch": 0.559462055715658, + "grad_norm": 0.8990390737752408, + "learning_rate": 6.842167234974556e-06, + "loss": 0.1647, + "step": 728 + }, + { + "epoch": 0.5609990393852066, + "grad_norm": 0.9761008697066089, + "learning_rate": 6.840302710054292e-06, + "loss": 0.1708, + "step": 730 + }, + { + "epoch": 0.562536023054755, + "grad_norm": 0.9552176939231856, + "learning_rate": 6.838427493688384e-06, + "loss": 0.1675, + "step": 732 + }, + { + "epoch": 0.5640730067243036, + "grad_norm": 0.8757204503224848, + "learning_rate": 6.836541591878915e-06, + "loss": 0.1624, + "step": 734 + }, + { + "epoch": 0.5656099903938521, + "grad_norm": 0.8887299929134649, + "learning_rate": 6.834645010662169e-06, + "loss": 0.1579, + "step": 736 + }, + { + "epoch": 0.5671469740634005, + "grad_norm": 0.9268495074535574, + "learning_rate": 6.832737756108613e-06, + "loss": 0.1817, + "step": 738 + }, + { + "epoch": 0.5686839577329491, + "grad_norm": 0.8930331129406648, + "learning_rate": 6.830819834322875e-06, + "loss": 0.1722, + "step": 740 + }, + { + "epoch": 0.5702209414024976, + "grad_norm": 0.9181728216635983, + "learning_rate": 6.828891251443729e-06, + "loss": 0.1751, + "step": 742 + }, + { + "epoch": 0.5717579250720461, + "grad_norm": 0.9429482896947388, + "learning_rate": 6.826952013644067e-06, + "loss": 0.1715, + "step": 744 + }, + { + "epoch": 0.5732949087415946, + "grad_norm": 0.9118821908902819, + "learning_rate": 6.825002127130891e-06, + "loss": 0.1696, + "step": 746 + }, + { + "epoch": 0.5748318924111432, + "grad_norm": 0.9132457028236339, + "learning_rate": 6.823041598145282e-06, + "loss": 0.1699, + "step": 748 + }, + { + "epoch": 0.5763688760806917, + "grad_norm": 0.8756070623659923, + "learning_rate": 6.821070432962387e-06, + "loss": 0.1635, + "step": 750 + }, + { + "epoch": 0.5779058597502401, + "grad_norm": 0.8779903149650536, + "learning_rate": 6.819088637891397e-06, + "loss": 0.1571, + "step": 752 + }, + { + "epoch": 0.5794428434197887, + "grad_norm": 0.951553518645419, + "learning_rate": 6.817096219275525e-06, + "loss": 0.177, + "step": 754 + }, + { + "epoch": 0.5809798270893372, + "grad_norm": 0.8815920502748197, + "learning_rate": 6.815093183491988e-06, + "loss": 0.1723, + "step": 756 + }, + { + "epoch": 0.5825168107588857, + "grad_norm": 0.8943776455140009, + "learning_rate": 6.813079536951986e-06, + "loss": 0.1675, + "step": 758 + }, + { + "epoch": 0.5840537944284342, + "grad_norm": 0.9042515471791829, + "learning_rate": 6.811055286100681e-06, + "loss": 0.1668, + "step": 760 + }, + { + "epoch": 0.5855907780979827, + "grad_norm": 0.8985203223108896, + "learning_rate": 6.809020437417178e-06, + "loss": 0.1683, + "step": 762 + }, + { + "epoch": 0.5871277617675312, + "grad_norm": 0.8854626460275703, + "learning_rate": 6.8069749974145e-06, + "loss": 0.1664, + "step": 764 + }, + { + "epoch": 0.5886647454370797, + "grad_norm": 0.9849706761614369, + "learning_rate": 6.804918972639572e-06, + "loss": 0.1603, + "step": 766 + }, + { + "epoch": 0.5902017291066283, + "grad_norm": 0.9492268475374204, + "learning_rate": 6.802852369673199e-06, + "loss": 0.1712, + "step": 768 + }, + { + "epoch": 0.5917387127761767, + "grad_norm": 0.9656447681787407, + "learning_rate": 6.8007751951300425e-06, + "loss": 0.1571, + "step": 770 + }, + { + "epoch": 0.5932756964457253, + "grad_norm": 0.8950523002371793, + "learning_rate": 6.798687455658602e-06, + "loss": 0.166, + "step": 772 + }, + { + "epoch": 0.5948126801152738, + "grad_norm": 0.9273600651692718, + "learning_rate": 6.79658915794119e-06, + "loss": 0.1617, + "step": 774 + }, + { + "epoch": 0.5963496637848222, + "grad_norm": 0.9160428728026682, + "learning_rate": 6.794480308693916e-06, + "loss": 0.1718, + "step": 776 + }, + { + "epoch": 0.5978866474543708, + "grad_norm": 0.9675151521212334, + "learning_rate": 6.792360914666662e-06, + "loss": 0.1618, + "step": 778 + }, + { + "epoch": 0.5994236311239193, + "grad_norm": 0.9613967630774912, + "learning_rate": 6.79023098264306e-06, + "loss": 0.1707, + "step": 780 + }, + { + "epoch": 0.6009606147934679, + "grad_norm": 0.8548759973613992, + "learning_rate": 6.7880905194404735e-06, + "loss": 0.1514, + "step": 782 + }, + { + "epoch": 0.6024975984630163, + "grad_norm": 0.890219782689619, + "learning_rate": 6.78593953190997e-06, + "loss": 0.1715, + "step": 784 + }, + { + "epoch": 0.6040345821325649, + "grad_norm": 0.9086155715910085, + "learning_rate": 6.783778026936305e-06, + "loss": 0.1663, + "step": 786 + }, + { + "epoch": 0.6055715658021134, + "grad_norm": 0.8095718809326027, + "learning_rate": 6.781606011437898e-06, + "loss": 0.1506, + "step": 788 + }, + { + "epoch": 0.6071085494716618, + "grad_norm": 0.8993402244627612, + "learning_rate": 6.779423492366808e-06, + "loss": 0.1778, + "step": 790 + }, + { + "epoch": 0.6086455331412104, + "grad_norm": 0.9179176055571641, + "learning_rate": 6.777230476708715e-06, + "loss": 0.1683, + "step": 792 + }, + { + "epoch": 0.6101825168107589, + "grad_norm": 0.880916421665006, + "learning_rate": 6.775026971482896e-06, + "loss": 0.1679, + "step": 794 + }, + { + "epoch": 0.6117195004803074, + "grad_norm": 0.9118630511257958, + "learning_rate": 6.7728129837422016e-06, + "loss": 0.1647, + "step": 796 + }, + { + "epoch": 0.6132564841498559, + "grad_norm": 0.9503223570669749, + "learning_rate": 6.770588520573034e-06, + "loss": 0.1653, + "step": 798 + }, + { + "epoch": 0.6147934678194045, + "grad_norm": 0.914092375436958, + "learning_rate": 6.768353589095325e-06, + "loss": 0.1706, + "step": 800 + }, + { + "epoch": 0.6163304514889529, + "grad_norm": 0.8780710561728171, + "learning_rate": 6.766108196462512e-06, + "loss": 0.1581, + "step": 802 + }, + { + "epoch": 0.6178674351585014, + "grad_norm": 0.9908640319756162, + "learning_rate": 6.763852349861517e-06, + "loss": 0.1746, + "step": 804 + }, + { + "epoch": 0.61940441882805, + "grad_norm": 0.9382036222959125, + "learning_rate": 6.761586056512721e-06, + "loss": 0.1467, + "step": 806 + }, + { + "epoch": 0.6209414024975984, + "grad_norm": 0.8796176529284295, + "learning_rate": 6.759309323669945e-06, + "loss": 0.1638, + "step": 808 + }, + { + "epoch": 0.622478386167147, + "grad_norm": 0.9402322649840099, + "learning_rate": 6.757022158620422e-06, + "loss": 0.1644, + "step": 810 + }, + { + "epoch": 0.6240153698366955, + "grad_norm": 0.9036305217983724, + "learning_rate": 6.754724568684775e-06, + "loss": 0.1625, + "step": 812 + }, + { + "epoch": 0.6255523535062439, + "grad_norm": 0.8200742046612647, + "learning_rate": 6.752416561216997e-06, + "loss": 0.1617, + "step": 814 + }, + { + "epoch": 0.6270893371757925, + "grad_norm": 0.9107960160473824, + "learning_rate": 6.750098143604423e-06, + "loss": 0.1687, + "step": 816 + }, + { + "epoch": 0.628626320845341, + "grad_norm": 0.8972272932978446, + "learning_rate": 6.747769323267706e-06, + "loss": 0.1746, + "step": 818 + }, + { + "epoch": 0.6301633045148896, + "grad_norm": 0.9499481842269516, + "learning_rate": 6.7454301076608025e-06, + "loss": 0.1696, + "step": 820 + }, + { + "epoch": 0.631700288184438, + "grad_norm": 0.8409255790993163, + "learning_rate": 6.743080504270933e-06, + "loss": 0.1598, + "step": 822 + }, + { + "epoch": 0.6332372718539866, + "grad_norm": 0.9027842885644961, + "learning_rate": 6.740720520618574e-06, + "loss": 0.1585, + "step": 824 + }, + { + "epoch": 0.6347742555235351, + "grad_norm": 0.8719606536944999, + "learning_rate": 6.738350164257421e-06, + "loss": 0.1547, + "step": 826 + }, + { + "epoch": 0.6363112391930835, + "grad_norm": 0.9247004952412972, + "learning_rate": 6.735969442774372e-06, + "loss": 0.173, + "step": 828 + }, + { + "epoch": 0.6378482228626321, + "grad_norm": 0.8506134884934429, + "learning_rate": 6.733578363789503e-06, + "loss": 0.154, + "step": 830 + }, + { + "epoch": 0.6393852065321806, + "grad_norm": 0.9171064482180226, + "learning_rate": 6.731176934956039e-06, + "loss": 0.1652, + "step": 832 + }, + { + "epoch": 0.6409221902017291, + "grad_norm": 0.9248200687434835, + "learning_rate": 6.728765163960333e-06, + "loss": 0.1704, + "step": 834 + }, + { + "epoch": 0.6424591738712776, + "grad_norm": 0.8545348365058903, + "learning_rate": 6.726343058521839e-06, + "loss": 0.1569, + "step": 836 + }, + { + "epoch": 0.6439961575408262, + "grad_norm": 0.9043067784462817, + "learning_rate": 6.723910626393091e-06, + "loss": 0.1715, + "step": 838 + }, + { + "epoch": 0.6455331412103746, + "grad_norm": 1.0091403589112276, + "learning_rate": 6.721467875359678e-06, + "loss": 0.1743, + "step": 840 + }, + { + "epoch": 0.6470701248799231, + "grad_norm": 0.8452475270597983, + "learning_rate": 6.719014813240213e-06, + "loss": 0.1637, + "step": 842 + }, + { + "epoch": 0.6486071085494717, + "grad_norm": 0.9000907310330241, + "learning_rate": 6.716551447886314e-06, + "loss": 0.1664, + "step": 844 + }, + { + "epoch": 0.6501440922190201, + "grad_norm": 0.9367454591962908, + "learning_rate": 6.714077787182576e-06, + "loss": 0.1609, + "step": 846 + }, + { + "epoch": 0.6516810758885687, + "grad_norm": 0.8639855347461337, + "learning_rate": 6.71159383904655e-06, + "loss": 0.1619, + "step": 848 + }, + { + "epoch": 0.6532180595581172, + "grad_norm": 0.8444570709766536, + "learning_rate": 6.709099611428709e-06, + "loss": 0.1569, + "step": 850 + }, + { + "epoch": 0.6547550432276658, + "grad_norm": 0.8660206440875692, + "learning_rate": 6.706595112312432e-06, + "loss": 0.1638, + "step": 852 + }, + { + "epoch": 0.6562920268972142, + "grad_norm": 0.9309827842770139, + "learning_rate": 6.704080349713974e-06, + "loss": 0.1605, + "step": 854 + }, + { + "epoch": 0.6578290105667627, + "grad_norm": 0.8714646054346427, + "learning_rate": 6.70155533168244e-06, + "loss": 0.1595, + "step": 856 + }, + { + "epoch": 0.6593659942363113, + "grad_norm": 0.8712554159426241, + "learning_rate": 6.699020066299759e-06, + "loss": 0.1678, + "step": 858 + }, + { + "epoch": 0.6609029779058597, + "grad_norm": 0.9250738676547778, + "learning_rate": 6.696474561680663e-06, + "loss": 0.1789, + "step": 860 + }, + { + "epoch": 0.6624399615754083, + "grad_norm": 0.9111129955962746, + "learning_rate": 6.693918825972651e-06, + "loss": 0.168, + "step": 862 + }, + { + "epoch": 0.6639769452449568, + "grad_norm": 0.9141522182958596, + "learning_rate": 6.691352867355973e-06, + "loss": 0.16, + "step": 864 + }, + { + "epoch": 0.6655139289145053, + "grad_norm": 0.881012224572559, + "learning_rate": 6.688776694043602e-06, + "loss": 0.1653, + "step": 866 + }, + { + "epoch": 0.6670509125840538, + "grad_norm": 0.8632782839785611, + "learning_rate": 6.6861903142812e-06, + "loss": 0.1564, + "step": 868 + }, + { + "epoch": 0.6685878962536023, + "grad_norm": 0.8720327769692922, + "learning_rate": 6.683593736347102e-06, + "loss": 0.1534, + "step": 870 + }, + { + "epoch": 0.6701248799231508, + "grad_norm": 0.876964430748599, + "learning_rate": 6.680986968552282e-06, + "loss": 0.1657, + "step": 872 + }, + { + "epoch": 0.6716618635926993, + "grad_norm": 0.9631837226264072, + "learning_rate": 6.6783700192403296e-06, + "loss": 0.1683, + "step": 874 + }, + { + "epoch": 0.6731988472622479, + "grad_norm": 0.846575004663177, + "learning_rate": 6.675742896787425e-06, + "loss": 0.1476, + "step": 876 + }, + { + "epoch": 0.6747358309317963, + "grad_norm": 0.9308686661082001, + "learning_rate": 6.6731056096023065e-06, + "loss": 0.1595, + "step": 878 + }, + { + "epoch": 0.6762728146013448, + "grad_norm": 0.8915035477659844, + "learning_rate": 6.6704581661262486e-06, + "loss": 0.1575, + "step": 880 + }, + { + "epoch": 0.6778097982708934, + "grad_norm": 0.8362332131345471, + "learning_rate": 6.6678005748330346e-06, + "loss": 0.1601, + "step": 882 + }, + { + "epoch": 0.6793467819404418, + "grad_norm": 0.9249603517376113, + "learning_rate": 6.665132844228926e-06, + "loss": 0.1654, + "step": 884 + }, + { + "epoch": 0.6808837656099904, + "grad_norm": 0.8814575750008693, + "learning_rate": 6.662454982852641e-06, + "loss": 0.1749, + "step": 886 + }, + { + "epoch": 0.6824207492795389, + "grad_norm": 0.899221716183914, + "learning_rate": 6.65976699927532e-06, + "loss": 0.1682, + "step": 888 + }, + { + "epoch": 0.6839577329490875, + "grad_norm": 0.9139541897412309, + "learning_rate": 6.657068902100504e-06, + "loss": 0.1662, + "step": 890 + }, + { + "epoch": 0.6854947166186359, + "grad_norm": 0.9291186479929943, + "learning_rate": 6.6543606999641065e-06, + "loss": 0.1677, + "step": 892 + }, + { + "epoch": 0.6870317002881844, + "grad_norm": 0.8803357017385457, + "learning_rate": 6.6516424015343795e-06, + "loss": 0.1528, + "step": 894 + }, + { + "epoch": 0.688568683957733, + "grad_norm": 0.9230606940030847, + "learning_rate": 6.6489140155118964e-06, + "loss": 0.1676, + "step": 896 + }, + { + "epoch": 0.6901056676272814, + "grad_norm": 0.8703699992143266, + "learning_rate": 6.6461755506295145e-06, + "loss": 0.1554, + "step": 898 + }, + { + "epoch": 0.69164265129683, + "grad_norm": 0.9524256265392508, + "learning_rate": 6.643427015652351e-06, + "loss": 0.1704, + "step": 900 + }, + { + "epoch": 0.6931796349663785, + "grad_norm": 0.9415591239679129, + "learning_rate": 6.640668419377758e-06, + "loss": 0.1592, + "step": 902 + }, + { + "epoch": 0.694716618635927, + "grad_norm": 0.8738330937457875, + "learning_rate": 6.6378997706352885e-06, + "loss": 0.1692, + "step": 904 + }, + { + "epoch": 0.6962536023054755, + "grad_norm": 0.9005686509467422, + "learning_rate": 6.635121078286671e-06, + "loss": 0.1684, + "step": 906 + }, + { + "epoch": 0.697790585975024, + "grad_norm": 0.8296366500080525, + "learning_rate": 6.632332351225783e-06, + "loss": 0.1593, + "step": 908 + }, + { + "epoch": 0.6993275696445725, + "grad_norm": 0.8623498006286683, + "learning_rate": 6.629533598378617e-06, + "loss": 0.1683, + "step": 910 + }, + { + "epoch": 0.700864553314121, + "grad_norm": 0.8462969581032185, + "learning_rate": 6.626724828703259e-06, + "loss": 0.1534, + "step": 912 + }, + { + "epoch": 0.7024015369836696, + "grad_norm": 0.8628311131127484, + "learning_rate": 6.623906051189854e-06, + "loss": 0.1604, + "step": 914 + }, + { + "epoch": 0.703938520653218, + "grad_norm": 0.9145692129791568, + "learning_rate": 6.621077274860581e-06, + "loss": 0.1594, + "step": 916 + }, + { + "epoch": 0.7054755043227666, + "grad_norm": 0.9195013649008027, + "learning_rate": 6.618238508769621e-06, + "loss": 0.1716, + "step": 918 + }, + { + "epoch": 0.7070124879923151, + "grad_norm": 0.9005634587923742, + "learning_rate": 6.615389762003131e-06, + "loss": 0.169, + "step": 920 + }, + { + "epoch": 0.7085494716618636, + "grad_norm": 0.9087616214061071, + "learning_rate": 6.612531043679213e-06, + "loss": 0.1584, + "step": 922 + }, + { + "epoch": 0.7100864553314121, + "grad_norm": 0.8929943346561857, + "learning_rate": 6.609662362947886e-06, + "loss": 0.1543, + "step": 924 + }, + { + "epoch": 0.7116234390009606, + "grad_norm": 0.8923387750870015, + "learning_rate": 6.606783728991054e-06, + "loss": 0.1576, + "step": 926 + }, + { + "epoch": 0.7131604226705092, + "grad_norm": 0.8545883642264139, + "learning_rate": 6.603895151022483e-06, + "loss": 0.1593, + "step": 928 + }, + { + "epoch": 0.7146974063400576, + "grad_norm": 0.8581885831090912, + "learning_rate": 6.600996638287762e-06, + "loss": 0.1603, + "step": 930 + }, + { + "epoch": 0.7162343900096062, + "grad_norm": 0.9326356897470247, + "learning_rate": 6.598088200064284e-06, + "loss": 0.1731, + "step": 932 + }, + { + "epoch": 0.7177713736791547, + "grad_norm": 0.8874229434905837, + "learning_rate": 6.595169845661204e-06, + "loss": 0.1561, + "step": 934 + }, + { + "epoch": 0.7193083573487031, + "grad_norm": 0.880472017082445, + "learning_rate": 6.592241584419424e-06, + "loss": 0.1589, + "step": 936 + }, + { + "epoch": 0.7208453410182517, + "grad_norm": 0.8802659879981632, + "learning_rate": 6.589303425711548e-06, + "loss": 0.1699, + "step": 938 + }, + { + "epoch": 0.7223823246878002, + "grad_norm": 0.8605735232186383, + "learning_rate": 6.586355378941866e-06, + "loss": 0.1508, + "step": 940 + }, + { + "epoch": 0.7239193083573487, + "grad_norm": 0.9218559892018848, + "learning_rate": 6.58339745354631e-06, + "loss": 0.1536, + "step": 942 + }, + { + "epoch": 0.7254562920268972, + "grad_norm": 0.8848127640286009, + "learning_rate": 6.580429658992438e-06, + "loss": 0.1595, + "step": 944 + }, + { + "epoch": 0.7269932756964457, + "grad_norm": 0.9210041621285134, + "learning_rate": 6.577452004779393e-06, + "loss": 0.1493, + "step": 946 + }, + { + "epoch": 0.7285302593659942, + "grad_norm": 0.9094329668162617, + "learning_rate": 6.574464500437875e-06, + "loss": 0.1656, + "step": 948 + }, + { + "epoch": 0.7300672430355427, + "grad_norm": 0.8599101226794522, + "learning_rate": 6.571467155530114e-06, + "loss": 0.1621, + "step": 950 + }, + { + "epoch": 0.7316042267050913, + "grad_norm": 0.8800370037884307, + "learning_rate": 6.568459979649836e-06, + "loss": 0.1613, + "step": 952 + }, + { + "epoch": 0.7331412103746398, + "grad_norm": 0.8133202498369466, + "learning_rate": 6.565442982422233e-06, + "loss": 0.1498, + "step": 954 + }, + { + "epoch": 0.7346781940441883, + "grad_norm": 0.8854492713891652, + "learning_rate": 6.5624161735039365e-06, + "loss": 0.1576, + "step": 956 + }, + { + "epoch": 0.7362151777137368, + "grad_norm": 0.8860066937387789, + "learning_rate": 6.559379562582976e-06, + "loss": 0.1567, + "step": 958 + }, + { + "epoch": 0.7377521613832853, + "grad_norm": 0.801524450434012, + "learning_rate": 6.556333159378761e-06, + "loss": 0.1507, + "step": 960 + }, + { + "epoch": 0.7392891450528338, + "grad_norm": 0.826356057888294, + "learning_rate": 6.553276973642037e-06, + "loss": 0.1693, + "step": 962 + }, + { + "epoch": 0.7408261287223823, + "grad_norm": 0.8294938538531003, + "learning_rate": 6.550211015154869e-06, + "loss": 0.1558, + "step": 964 + }, + { + "epoch": 0.7423631123919309, + "grad_norm": 0.8307597065545927, + "learning_rate": 6.547135293730595e-06, + "loss": 0.1533, + "step": 966 + }, + { + "epoch": 0.7439000960614793, + "grad_norm": 0.8014991418925953, + "learning_rate": 6.544049819213806e-06, + "loss": 0.1634, + "step": 968 + }, + { + "epoch": 0.7454370797310279, + "grad_norm": 0.8998803118527309, + "learning_rate": 6.540954601480307e-06, + "loss": 0.1678, + "step": 970 + }, + { + "epoch": 0.7469740634005764, + "grad_norm": 0.8815150629442536, + "learning_rate": 6.537849650437091e-06, + "loss": 0.1549, + "step": 972 + }, + { + "epoch": 0.7485110470701248, + "grad_norm": 0.825376179394833, + "learning_rate": 6.534734976022302e-06, + "loss": 0.153, + "step": 974 + }, + { + "epoch": 0.7500480307396734, + "grad_norm": 0.8179612985435794, + "learning_rate": 6.53161058820521e-06, + "loss": 0.1444, + "step": 976 + }, + { + "epoch": 0.7515850144092219, + "grad_norm": 0.8727593413193312, + "learning_rate": 6.528476496986172e-06, + "loss": 0.1517, + "step": 978 + }, + { + "epoch": 0.7531219980787704, + "grad_norm": 0.8512575927431834, + "learning_rate": 6.525332712396604e-06, + "loss": 0.1573, + "step": 980 + }, + { + "epoch": 0.7546589817483189, + "grad_norm": 0.8582941888601493, + "learning_rate": 6.522179244498948e-06, + "loss": 0.1541, + "step": 982 + }, + { + "epoch": 0.7561959654178675, + "grad_norm": 0.8767811552173622, + "learning_rate": 6.519016103386639e-06, + "loss": 0.1546, + "step": 984 + }, + { + "epoch": 0.757732949087416, + "grad_norm": 0.8210784117243145, + "learning_rate": 6.5158432991840755e-06, + "loss": 0.145, + "step": 986 + }, + { + "epoch": 0.7592699327569644, + "grad_norm": 0.8326696940302717, + "learning_rate": 6.512660842046582e-06, + "loss": 0.1597, + "step": 988 + }, + { + "epoch": 0.760806916426513, + "grad_norm": 0.953434296685044, + "learning_rate": 6.509468742160382e-06, + "loss": 0.153, + "step": 990 + }, + { + "epoch": 0.7623439000960615, + "grad_norm": 0.8362202034275996, + "learning_rate": 6.506267009742564e-06, + "loss": 0.1422, + "step": 992 + }, + { + "epoch": 0.76388088376561, + "grad_norm": 0.857361945741671, + "learning_rate": 6.503055655041042e-06, + "loss": 0.1735, + "step": 994 + }, + { + "epoch": 0.7654178674351585, + "grad_norm": 0.891793041703015, + "learning_rate": 6.499834688334537e-06, + "loss": 0.1555, + "step": 996 + }, + { + "epoch": 0.766954851104707, + "grad_norm": 0.9034407482465361, + "learning_rate": 6.496604119932528e-06, + "loss": 0.1692, + "step": 998 + }, + { + "epoch": 0.7684918347742555, + "grad_norm": 0.9404179935204194, + "learning_rate": 6.493363960175231e-06, + "loss": 0.1549, + "step": 1000 + }, + { + "epoch": 0.7684918347742555, + "eval_loss": 0.1432075798511505, + "eval_runtime": 362.8754, + "eval_samples_per_second": 50.995, + "eval_steps_per_second": 6.377, + "step": 1000 + }, + { + "epoch": 0.770028818443804, + "grad_norm": 0.8281134972466698, + "learning_rate": 6.490114219433558e-06, + "loss": 0.1476, + "step": 1002 + }, + { + "epoch": 0.7715658021133526, + "grad_norm": 0.8720452925619442, + "learning_rate": 6.486854908109089e-06, + "loss": 0.1433, + "step": 1004 + }, + { + "epoch": 0.773102785782901, + "grad_norm": 0.8720479928054387, + "learning_rate": 6.483586036634041e-06, + "loss": 0.1553, + "step": 1006 + }, + { + "epoch": 0.7746397694524496, + "grad_norm": 0.8412137015802439, + "learning_rate": 6.480307615471223e-06, + "loss": 0.157, + "step": 1008 + }, + { + "epoch": 0.7761767531219981, + "grad_norm": 0.8527177593745339, + "learning_rate": 6.4770196551140155e-06, + "loss": 0.1614, + "step": 1010 + }, + { + "epoch": 0.7777137367915465, + "grad_norm": 0.8821801330028585, + "learning_rate": 6.473722166086329e-06, + "loss": 0.1525, + "step": 1012 + }, + { + "epoch": 0.7792507204610951, + "grad_norm": 0.9018994294254266, + "learning_rate": 6.470415158942574e-06, + "loss": 0.16, + "step": 1014 + }, + { + "epoch": 0.7807877041306436, + "grad_norm": 0.8703842227810923, + "learning_rate": 6.467098644267625e-06, + "loss": 0.1585, + "step": 1016 + }, + { + "epoch": 0.7823246878001922, + "grad_norm": 0.8051081010409243, + "learning_rate": 6.46377263267679e-06, + "loss": 0.1527, + "step": 1018 + }, + { + "epoch": 0.7838616714697406, + "grad_norm": 0.8888259620906063, + "learning_rate": 6.460437134815771e-06, + "loss": 0.167, + "step": 1020 + }, + { + "epoch": 0.7853986551392892, + "grad_norm": 0.9029947228741374, + "learning_rate": 6.457092161360633e-06, + "loss": 0.1668, + "step": 1022 + }, + { + "epoch": 0.7869356388088377, + "grad_norm": 0.8981361575693058, + "learning_rate": 6.453737723017775e-06, + "loss": 0.1612, + "step": 1024 + }, + { + "epoch": 0.7884726224783861, + "grad_norm": 0.9352139489971404, + "learning_rate": 6.450373830523886e-06, + "loss": 0.1553, + "step": 1026 + }, + { + "epoch": 0.7900096061479347, + "grad_norm": 0.8227972574038364, + "learning_rate": 6.447000494645916e-06, + "loss": 0.1578, + "step": 1028 + }, + { + "epoch": 0.7915465898174832, + "grad_norm": 0.8530604144128302, + "learning_rate": 6.4436177261810395e-06, + "loss": 0.1541, + "step": 1030 + }, + { + "epoch": 0.7930835734870317, + "grad_norm": 0.8897585617894813, + "learning_rate": 6.440225535956627e-06, + "loss": 0.1552, + "step": 1032 + }, + { + "epoch": 0.7946205571565802, + "grad_norm": 0.8139899114750058, + "learning_rate": 6.436823934830201e-06, + "loss": 0.1397, + "step": 1034 + }, + { + "epoch": 0.7961575408261288, + "grad_norm": 0.9155078950589843, + "learning_rate": 6.433412933689408e-06, + "loss": 0.1638, + "step": 1036 + }, + { + "epoch": 0.7976945244956772, + "grad_norm": 0.7986982196116793, + "learning_rate": 6.429992543451982e-06, + "loss": 0.1492, + "step": 1038 + }, + { + "epoch": 0.7992315081652257, + "grad_norm": 0.9067813768385491, + "learning_rate": 6.426562775065706e-06, + "loss": 0.166, + "step": 1040 + }, + { + "epoch": 0.8007684918347743, + "grad_norm": 0.8546237308090272, + "learning_rate": 6.4231236395083835e-06, + "loss": 0.1528, + "step": 1042 + }, + { + "epoch": 0.8023054755043227, + "grad_norm": 0.8301245146373359, + "learning_rate": 6.419675147787799e-06, + "loss": 0.169, + "step": 1044 + }, + { + "epoch": 0.8038424591738713, + "grad_norm": 0.9357811284934665, + "learning_rate": 6.416217310941682e-06, + "loss": 0.1567, + "step": 1046 + }, + { + "epoch": 0.8053794428434198, + "grad_norm": 0.8763494765723041, + "learning_rate": 6.412750140037675e-06, + "loss": 0.1578, + "step": 1048 + }, + { + "epoch": 0.8069164265129684, + "grad_norm": 0.8853921013266428, + "learning_rate": 6.409273646173296e-06, + "loss": 0.1753, + "step": 1050 + }, + { + "epoch": 0.8084534101825168, + "grad_norm": 0.8836427743865589, + "learning_rate": 6.405787840475904e-06, + "loss": 0.1599, + "step": 1052 + }, + { + "epoch": 0.8099903938520653, + "grad_norm": 0.8292826766651644, + "learning_rate": 6.402292734102661e-06, + "loss": 0.1603, + "step": 1054 + }, + { + "epoch": 0.8115273775216139, + "grad_norm": 0.7981150884405339, + "learning_rate": 6.3987883382405e-06, + "loss": 0.152, + "step": 1056 + }, + { + "epoch": 0.8130643611911623, + "grad_norm": 0.8941084452594006, + "learning_rate": 6.395274664106086e-06, + "loss": 0.1675, + "step": 1058 + }, + { + "epoch": 0.8146013448607109, + "grad_norm": 0.8843029348683886, + "learning_rate": 6.39175172294578e-06, + "loss": 0.1647, + "step": 1060 + }, + { + "epoch": 0.8161383285302594, + "grad_norm": 0.8607622983535753, + "learning_rate": 6.38821952603561e-06, + "loss": 0.1642, + "step": 1062 + }, + { + "epoch": 0.8176753121998078, + "grad_norm": 0.8712768415639528, + "learning_rate": 6.3846780846812234e-06, + "loss": 0.165, + "step": 1064 + }, + { + "epoch": 0.8192122958693564, + "grad_norm": 0.839138399301467, + "learning_rate": 6.381127410217858e-06, + "loss": 0.1679, + "step": 1066 + }, + { + "epoch": 0.8207492795389049, + "grad_norm": 0.8561911550355878, + "learning_rate": 6.377567514010304e-06, + "loss": 0.1582, + "step": 1068 + }, + { + "epoch": 0.8222862632084534, + "grad_norm": 0.8290678486131811, + "learning_rate": 6.373998407452873e-06, + "loss": 0.1545, + "step": 1070 + }, + { + "epoch": 0.8238232468780019, + "grad_norm": 0.847760798773364, + "learning_rate": 6.370420101969349e-06, + "loss": 0.1459, + "step": 1072 + }, + { + "epoch": 0.8253602305475505, + "grad_norm": 0.8321417732926941, + "learning_rate": 6.3668326090129645e-06, + "loss": 0.1524, + "step": 1074 + }, + { + "epoch": 0.8268972142170989, + "grad_norm": 0.8458379024110787, + "learning_rate": 6.363235940066358e-06, + "loss": 0.1485, + "step": 1076 + }, + { + "epoch": 0.8284341978866474, + "grad_norm": 0.9374727516067862, + "learning_rate": 6.359630106641535e-06, + "loss": 0.1641, + "step": 1078 + }, + { + "epoch": 0.829971181556196, + "grad_norm": 0.8470538370988849, + "learning_rate": 6.356015120279837e-06, + "loss": 0.1571, + "step": 1080 + }, + { + "epoch": 0.8315081652257444, + "grad_norm": 0.8292449900966895, + "learning_rate": 6.352390992551903e-06, + "loss": 0.1668, + "step": 1082 + }, + { + "epoch": 0.833045148895293, + "grad_norm": 0.8873775326690251, + "learning_rate": 6.348757735057628e-06, + "loss": 0.1573, + "step": 1084 + }, + { + "epoch": 0.8345821325648415, + "grad_norm": 0.8792119220842931, + "learning_rate": 6.345115359426129e-06, + "loss": 0.1578, + "step": 1086 + }, + { + "epoch": 0.8361191162343901, + "grad_norm": 0.9098125786437564, + "learning_rate": 6.341463877315711e-06, + "loss": 0.1631, + "step": 1088 + }, + { + "epoch": 0.8376560999039385, + "grad_norm": 0.801489722135179, + "learning_rate": 6.337803300413822e-06, + "loss": 0.1479, + "step": 1090 + }, + { + "epoch": 0.839193083573487, + "grad_norm": 0.9244844513428183, + "learning_rate": 6.334133640437025e-06, + "loss": 0.1632, + "step": 1092 + }, + { + "epoch": 0.8407300672430356, + "grad_norm": 0.881534980036438, + "learning_rate": 6.330454909130952e-06, + "loss": 0.1595, + "step": 1094 + }, + { + "epoch": 0.842267050912584, + "grad_norm": 0.8598183603447899, + "learning_rate": 6.326767118270271e-06, + "loss": 0.1616, + "step": 1096 + }, + { + "epoch": 0.8438040345821326, + "grad_norm": 0.889022724838577, + "learning_rate": 6.323070279658648e-06, + "loss": 0.1601, + "step": 1098 + }, + { + "epoch": 0.8453410182516811, + "grad_norm": 0.8912527206906096, + "learning_rate": 6.319364405128706e-06, + "loss": 0.1605, + "step": 1100 + }, + { + "epoch": 0.8468780019212296, + "grad_norm": 0.8035411272312927, + "learning_rate": 6.315649506541995e-06, + "loss": 0.1514, + "step": 1102 + }, + { + "epoch": 0.8484149855907781, + "grad_norm": 0.866003931969349, + "learning_rate": 6.311925595788942e-06, + "loss": 0.1535, + "step": 1104 + }, + { + "epoch": 0.8499519692603266, + "grad_norm": 0.8292860317473079, + "learning_rate": 6.308192684788825e-06, + "loss": 0.1517, + "step": 1106 + }, + { + "epoch": 0.8514889529298751, + "grad_norm": 0.8869266953820677, + "learning_rate": 6.3044507854897265e-06, + "loss": 0.1603, + "step": 1108 + }, + { + "epoch": 0.8530259365994236, + "grad_norm": 0.8577246546870172, + "learning_rate": 6.3006999098684985e-06, + "loss": 0.157, + "step": 1110 + }, + { + "epoch": 0.8545629202689722, + "grad_norm": 0.8403234008489573, + "learning_rate": 6.296940069930725e-06, + "loss": 0.1592, + "step": 1112 + }, + { + "epoch": 0.8560999039385206, + "grad_norm": 0.8702914831113494, + "learning_rate": 6.293171277710682e-06, + "loss": 0.1575, + "step": 1114 + }, + { + "epoch": 0.8576368876080691, + "grad_norm": 0.8870545228720214, + "learning_rate": 6.289393545271299e-06, + "loss": 0.163, + "step": 1116 + }, + { + "epoch": 0.8591738712776177, + "grad_norm": 0.8835871253233961, + "learning_rate": 6.285606884704122e-06, + "loss": 0.1505, + "step": 1118 + }, + { + "epoch": 0.8607108549471661, + "grad_norm": 0.9222189898206439, + "learning_rate": 6.281811308129271e-06, + "loss": 0.1565, + "step": 1120 + }, + { + "epoch": 0.8622478386167147, + "grad_norm": 0.865733104634757, + "learning_rate": 6.278006827695407e-06, + "loss": 0.1657, + "step": 1122 + }, + { + "epoch": 0.8637848222862632, + "grad_norm": 0.7873625454059449, + "learning_rate": 6.274193455579688e-06, + "loss": 0.152, + "step": 1124 + }, + { + "epoch": 0.8653218059558118, + "grad_norm": 0.9036534746354465, + "learning_rate": 6.270371203987733e-06, + "loss": 0.1524, + "step": 1126 + }, + { + "epoch": 0.8668587896253602, + "grad_norm": 0.8874043561419231, + "learning_rate": 6.266540085153581e-06, + "loss": 0.1623, + "step": 1128 + }, + { + "epoch": 0.8683957732949087, + "grad_norm": 0.8520395043888209, + "learning_rate": 6.262700111339654e-06, + "loss": 0.1511, + "step": 1130 + }, + { + "epoch": 0.8699327569644573, + "grad_norm": 0.814438124524513, + "learning_rate": 6.2588512948367144e-06, + "loss": 0.143, + "step": 1132 + }, + { + "epoch": 0.8714697406340057, + "grad_norm": 0.8620767282571578, + "learning_rate": 6.254993647963831e-06, + "loss": 0.1616, + "step": 1134 + }, + { + "epoch": 0.8730067243035543, + "grad_norm": 0.892703951825046, + "learning_rate": 6.251127183068331e-06, + "loss": 0.1579, + "step": 1136 + }, + { + "epoch": 0.8745437079731028, + "grad_norm": 0.8623333160707259, + "learning_rate": 6.247251912525773e-06, + "loss": 0.1576, + "step": 1138 + }, + { + "epoch": 0.8760806916426513, + "grad_norm": 0.8199322821651265, + "learning_rate": 6.243367848739894e-06, + "loss": 0.153, + "step": 1140 + }, + { + "epoch": 0.8776176753121998, + "grad_norm": 0.8392074331450308, + "learning_rate": 6.23947500414258e-06, + "loss": 0.1606, + "step": 1142 + }, + { + "epoch": 0.8791546589817483, + "grad_norm": 0.8281996951649421, + "learning_rate": 6.235573391193819e-06, + "loss": 0.1469, + "step": 1144 + }, + { + "epoch": 0.8806916426512968, + "grad_norm": 0.8544291264871913, + "learning_rate": 6.231663022381666e-06, + "loss": 0.17, + "step": 1146 + }, + { + "epoch": 0.8822286263208453, + "grad_norm": 0.8699715620381219, + "learning_rate": 6.227743910222202e-06, + "loss": 0.1543, + "step": 1148 + }, + { + "epoch": 0.8837656099903939, + "grad_norm": 0.8096799920711654, + "learning_rate": 6.22381606725949e-06, + "loss": 0.1562, + "step": 1150 + }, + { + "epoch": 0.8853025936599423, + "grad_norm": 0.8090700874638417, + "learning_rate": 6.219879506065542e-06, + "loss": 0.1499, + "step": 1152 + }, + { + "epoch": 0.8868395773294909, + "grad_norm": 0.7867446095253069, + "learning_rate": 6.215934239240272e-06, + "loss": 0.1558, + "step": 1154 + }, + { + "epoch": 0.8883765609990394, + "grad_norm": 0.788584493474745, + "learning_rate": 6.211980279411459e-06, + "loss": 0.1433, + "step": 1156 + }, + { + "epoch": 0.8899135446685879, + "grad_norm": 0.8362819878588067, + "learning_rate": 6.208017639234708e-06, + "loss": 0.1552, + "step": 1158 + }, + { + "epoch": 0.8914505283381364, + "grad_norm": 0.8817509955093219, + "learning_rate": 6.204046331393405e-06, + "loss": 0.1499, + "step": 1160 + }, + { + "epoch": 0.8929875120076849, + "grad_norm": 0.8313021295240178, + "learning_rate": 6.20006636859868e-06, + "loss": 0.1671, + "step": 1162 + }, + { + "epoch": 0.8945244956772335, + "grad_norm": 0.8485741130804667, + "learning_rate": 6.196077763589365e-06, + "loss": 0.1609, + "step": 1164 + }, + { + "epoch": 0.8960614793467819, + "grad_norm": 0.861196387787182, + "learning_rate": 6.192080529131955e-06, + "loss": 0.1609, + "step": 1166 + }, + { + "epoch": 0.8975984630163305, + "grad_norm": 0.8527772834985116, + "learning_rate": 6.188074678020563e-06, + "loss": 0.1571, + "step": 1168 + }, + { + "epoch": 0.899135446685879, + "grad_norm": 0.820498232150602, + "learning_rate": 6.184060223076884e-06, + "loss": 0.1599, + "step": 1170 + }, + { + "epoch": 0.9006724303554274, + "grad_norm": 0.8519749575925564, + "learning_rate": 6.180037177150149e-06, + "loss": 0.1548, + "step": 1172 + }, + { + "epoch": 0.902209414024976, + "grad_norm": 0.8442399564211222, + "learning_rate": 6.176005553117091e-06, + "loss": 0.151, + "step": 1174 + }, + { + "epoch": 0.9037463976945245, + "grad_norm": 0.8937597645248366, + "learning_rate": 6.171965363881894e-06, + "loss": 0.1682, + "step": 1176 + }, + { + "epoch": 0.905283381364073, + "grad_norm": 0.9004779698121571, + "learning_rate": 6.167916622376161e-06, + "loss": 0.1648, + "step": 1178 + }, + { + "epoch": 0.9068203650336215, + "grad_norm": 0.8739876161966016, + "learning_rate": 6.163859341558867e-06, + "loss": 0.1466, + "step": 1180 + }, + { + "epoch": 0.90835734870317, + "grad_norm": 0.8359328154212177, + "learning_rate": 6.159793534416318e-06, + "loss": 0.1609, + "step": 1182 + }, + { + "epoch": 0.9098943323727186, + "grad_norm": 0.7772963511251748, + "learning_rate": 6.155719213962113e-06, + "loss": 0.1396, + "step": 1184 + }, + { + "epoch": 0.911431316042267, + "grad_norm": 0.8267806690490346, + "learning_rate": 6.151636393237099e-06, + "loss": 0.1557, + "step": 1186 + }, + { + "epoch": 0.9129682997118156, + "grad_norm": 0.8299194507951076, + "learning_rate": 6.147545085309329e-06, + "loss": 0.1535, + "step": 1188 + }, + { + "epoch": 0.914505283381364, + "grad_norm": 0.8107731050092446, + "learning_rate": 6.143445303274022e-06, + "loss": 0.1533, + "step": 1190 + }, + { + "epoch": 0.9160422670509126, + "grad_norm": 0.8664903101940159, + "learning_rate": 6.139337060253521e-06, + "loss": 0.1509, + "step": 1192 + }, + { + "epoch": 0.9175792507204611, + "grad_norm": 0.9096007236323014, + "learning_rate": 6.135220369397252e-06, + "loss": 0.1576, + "step": 1194 + }, + { + "epoch": 0.9191162343900096, + "grad_norm": 0.7794208493232124, + "learning_rate": 6.131095243881675e-06, + "loss": 0.1353, + "step": 1196 + }, + { + "epoch": 0.9206532180595581, + "grad_norm": 0.8177662161991649, + "learning_rate": 6.1269616969102546e-06, + "loss": 0.1409, + "step": 1198 + }, + { + "epoch": 0.9221902017291066, + "grad_norm": 0.7668271695157493, + "learning_rate": 6.122819741713402e-06, + "loss": 0.1442, + "step": 1200 + }, + { + "epoch": 0.9237271853986552, + "grad_norm": 0.7607151071645437, + "learning_rate": 6.118669391548449e-06, + "loss": 0.1497, + "step": 1202 + }, + { + "epoch": 0.9252641690682036, + "grad_norm": 0.8748866630574149, + "learning_rate": 6.114510659699591e-06, + "loss": 0.1611, + "step": 1204 + }, + { + "epoch": 0.9268011527377522, + "grad_norm": 0.8598578438295764, + "learning_rate": 6.110343559477855e-06, + "loss": 0.1542, + "step": 1206 + }, + { + "epoch": 0.9283381364073007, + "grad_norm": 0.8347211442345176, + "learning_rate": 6.106168104221052e-06, + "loss": 0.1488, + "step": 1208 + }, + { + "epoch": 0.9298751200768491, + "grad_norm": 0.8798272771765616, + "learning_rate": 6.101984307293735e-06, + "loss": 0.1528, + "step": 1210 + }, + { + "epoch": 0.9314121037463977, + "grad_norm": 0.809406948238894, + "learning_rate": 6.097792182087156e-06, + "loss": 0.1605, + "step": 1212 + }, + { + "epoch": 0.9329490874159462, + "grad_norm": 0.8465513093584821, + "learning_rate": 6.093591742019225e-06, + "loss": 0.1515, + "step": 1214 + }, + { + "epoch": 0.9344860710854948, + "grad_norm": 0.8717872500128261, + "learning_rate": 6.089383000534465e-06, + "loss": 0.1537, + "step": 1216 + }, + { + "epoch": 0.9360230547550432, + "grad_norm": 0.7761882043003053, + "learning_rate": 6.085165971103969e-06, + "loss": 0.137, + "step": 1218 + }, + { + "epoch": 0.9375600384245918, + "grad_norm": 0.8779867026896587, + "learning_rate": 6.080940667225359e-06, + "loss": 0.1477, + "step": 1220 + }, + { + "epoch": 0.9390970220941403, + "grad_norm": 0.8339829528733452, + "learning_rate": 6.076707102422741e-06, + "loss": 0.1511, + "step": 1222 + }, + { + "epoch": 0.9406340057636887, + "grad_norm": 0.8459206215885985, + "learning_rate": 6.072465290246659e-06, + "loss": 0.1429, + "step": 1224 + }, + { + "epoch": 0.9421709894332373, + "grad_norm": 0.8156133417127609, + "learning_rate": 6.068215244274061e-06, + "loss": 0.1545, + "step": 1226 + }, + { + "epoch": 0.9437079731027858, + "grad_norm": 0.8160966643134648, + "learning_rate": 6.063956978108245e-06, + "loss": 0.1427, + "step": 1228 + }, + { + "epoch": 0.9452449567723343, + "grad_norm": 0.8239874532076136, + "learning_rate": 6.059690505378819e-06, + "loss": 0.1534, + "step": 1230 + }, + { + "epoch": 0.9467819404418828, + "grad_norm": 0.8475441460391161, + "learning_rate": 6.0554158397416596e-06, + "loss": 0.1507, + "step": 1232 + }, + { + "epoch": 0.9483189241114313, + "grad_norm": 0.839569089678522, + "learning_rate": 6.051132994878868e-06, + "loss": 0.1527, + "step": 1234 + }, + { + "epoch": 0.9498559077809798, + "grad_norm": 0.7802196203437861, + "learning_rate": 6.046841984498722e-06, + "loss": 0.1376, + "step": 1236 + }, + { + "epoch": 0.9513928914505283, + "grad_norm": 0.8574149706930279, + "learning_rate": 6.042542822335638e-06, + "loss": 0.1637, + "step": 1238 + }, + { + "epoch": 0.9529298751200769, + "grad_norm": 0.8161418800215421, + "learning_rate": 6.038235522150124e-06, + "loss": 0.1479, + "step": 1240 + }, + { + "epoch": 0.9544668587896253, + "grad_norm": 0.882673614104903, + "learning_rate": 6.0339200977287316e-06, + "loss": 0.1414, + "step": 1242 + }, + { + "epoch": 0.9560038424591739, + "grad_norm": 0.8368270452723762, + "learning_rate": 6.029596562884021e-06, + "loss": 0.158, + "step": 1244 + }, + { + "epoch": 0.9575408261287224, + "grad_norm": 0.8428259430927917, + "learning_rate": 6.025264931454509e-06, + "loss": 0.1474, + "step": 1246 + }, + { + "epoch": 0.9590778097982708, + "grad_norm": 0.8478092566057043, + "learning_rate": 6.020925217304629e-06, + "loss": 0.1526, + "step": 1248 + }, + { + "epoch": 0.9606147934678194, + "grad_norm": 0.8333839896657882, + "learning_rate": 6.016577434324684e-06, + "loss": 0.1419, + "step": 1250 + }, + { + "epoch": 0.9621517771373679, + "grad_norm": 0.8445286054177732, + "learning_rate": 6.012221596430804e-06, + "loss": 0.1596, + "step": 1252 + }, + { + "epoch": 0.9636887608069165, + "grad_norm": 0.851762036838009, + "learning_rate": 6.0078577175649e-06, + "loss": 0.1546, + "step": 1254 + }, + { + "epoch": 0.9652257444764649, + "grad_norm": 0.8437848350814667, + "learning_rate": 6.00348581169462e-06, + "loss": 0.1508, + "step": 1256 + }, + { + "epoch": 0.9667627281460135, + "grad_norm": 0.8945339085483407, + "learning_rate": 5.9991058928133054e-06, + "loss": 0.1576, + "step": 1258 + }, + { + "epoch": 0.968299711815562, + "grad_norm": 0.7944819259983578, + "learning_rate": 5.994717974939944e-06, + "loss": 0.1476, + "step": 1260 + }, + { + "epoch": 0.9698366954851104, + "grad_norm": 0.8729176506312757, + "learning_rate": 5.990322072119126e-06, + "loss": 0.1596, + "step": 1262 + }, + { + "epoch": 0.971373679154659, + "grad_norm": 0.8725237140356162, + "learning_rate": 5.985918198421002e-06, + "loss": 0.1571, + "step": 1264 + }, + { + "epoch": 0.9729106628242075, + "grad_norm": 0.7991775531475158, + "learning_rate": 5.981506367941233e-06, + "loss": 0.1434, + "step": 1266 + }, + { + "epoch": 0.974447646493756, + "grad_norm": 0.8382327345169207, + "learning_rate": 5.977086594800948e-06, + "loss": 0.1596, + "step": 1268 + }, + { + "epoch": 0.9759846301633045, + "grad_norm": 0.9529464464579128, + "learning_rate": 5.972658893146697e-06, + "loss": 0.1514, + "step": 1270 + }, + { + "epoch": 0.9775216138328531, + "grad_norm": 0.8484469791654419, + "learning_rate": 5.96822327715041e-06, + "loss": 0.152, + "step": 1272 + }, + { + "epoch": 0.9790585975024015, + "grad_norm": 0.81835031937336, + "learning_rate": 5.963779761009348e-06, + "loss": 0.1528, + "step": 1274 + }, + { + "epoch": 0.98059558117195, + "grad_norm": 0.8310249942970027, + "learning_rate": 5.959328358946056e-06, + "loss": 0.1473, + "step": 1276 + }, + { + "epoch": 0.9821325648414986, + "grad_norm": 0.8856986735480396, + "learning_rate": 5.954869085208323e-06, + "loss": 0.1609, + "step": 1278 + }, + { + "epoch": 0.983669548511047, + "grad_norm": 0.8166362545867224, + "learning_rate": 5.9504019540691305e-06, + "loss": 0.1465, + "step": 1280 + }, + { + "epoch": 0.9852065321805956, + "grad_norm": 0.8533786504203714, + "learning_rate": 5.945926979826612e-06, + "loss": 0.1555, + "step": 1282 + }, + { + "epoch": 0.9867435158501441, + "grad_norm": 0.8994585946450427, + "learning_rate": 5.941444176804002e-06, + "loss": 0.1495, + "step": 1284 + }, + { + "epoch": 0.9882804995196927, + "grad_norm": 0.8108578942364258, + "learning_rate": 5.936953559349596e-06, + "loss": 0.1505, + "step": 1286 + }, + { + "epoch": 0.9898174831892411, + "grad_norm": 0.8282612807656491, + "learning_rate": 5.932455141836697e-06, + "loss": 0.1561, + "step": 1288 + }, + { + "epoch": 0.9913544668587896, + "grad_norm": 0.8532043323116557, + "learning_rate": 5.927948938663581e-06, + "loss": 0.1559, + "step": 1290 + }, + { + "epoch": 0.9928914505283382, + "grad_norm": 0.8582728286463194, + "learning_rate": 5.923434964253437e-06, + "loss": 0.1628, + "step": 1292 + }, + { + "epoch": 0.9944284341978866, + "grad_norm": 0.8145713992914184, + "learning_rate": 5.91891323305433e-06, + "loss": 0.1503, + "step": 1294 + }, + { + "epoch": 0.9959654178674352, + "grad_norm": 0.8011507540671976, + "learning_rate": 5.914383759539153e-06, + "loss": 0.1439, + "step": 1296 + }, + { + "epoch": 0.9975024015369837, + "grad_norm": 0.9045409623476682, + "learning_rate": 5.909846558205582e-06, + "loss": 0.1533, + "step": 1298 + }, + { + "epoch": 0.9990393852065321, + "grad_norm": 0.8994142078460239, + "learning_rate": 5.905301643576025e-06, + "loss": 0.1635, + "step": 1300 + }, + { + "epoch": 1.0005763688760807, + "grad_norm": 1.9268010896112178, + "learning_rate": 5.900749030197578e-06, + "loss": 0.2109, + "step": 1302 + }, + { + "epoch": 1.0021133525456292, + "grad_norm": 0.7330614000908158, + "learning_rate": 5.8961887326419804e-06, + "loss": 0.0988, + "step": 1304 + }, + { + "epoch": 1.0036503362151776, + "grad_norm": 0.7392181978150959, + "learning_rate": 5.891620765505566e-06, + "loss": 0.1003, + "step": 1306 + }, + { + "epoch": 1.0051873198847263, + "grad_norm": 0.7663128310167143, + "learning_rate": 5.887045143409216e-06, + "loss": 0.1016, + "step": 1308 + }, + { + "epoch": 1.0067243035542748, + "grad_norm": 0.7607110644159815, + "learning_rate": 5.882461880998317e-06, + "loss": 0.0916, + "step": 1310 + }, + { + "epoch": 1.0082612872238232, + "grad_norm": 0.8238052235863006, + "learning_rate": 5.877870992942704e-06, + "loss": 0.0931, + "step": 1312 + }, + { + "epoch": 1.0097982708933717, + "grad_norm": 0.8892662986360846, + "learning_rate": 5.873272493936625e-06, + "loss": 0.1033, + "step": 1314 + }, + { + "epoch": 1.0113352545629202, + "grad_norm": 0.7704764998566338, + "learning_rate": 5.868666398698687e-06, + "loss": 0.1002, + "step": 1316 + }, + { + "epoch": 1.0128722382324689, + "grad_norm": 0.7709377348356039, + "learning_rate": 5.864052721971809e-06, + "loss": 0.0967, + "step": 1318 + }, + { + "epoch": 1.0144092219020173, + "grad_norm": 0.7969803446335071, + "learning_rate": 5.859431478523179e-06, + "loss": 0.1048, + "step": 1320 + }, + { + "epoch": 1.0159462055715658, + "grad_norm": 0.6999689392816087, + "learning_rate": 5.854802683144201e-06, + "loss": 0.088, + "step": 1322 + }, + { + "epoch": 1.0174831892411142, + "grad_norm": 0.7760114509424976, + "learning_rate": 5.850166350650456e-06, + "loss": 0.0991, + "step": 1324 + }, + { + "epoch": 1.019020172910663, + "grad_norm": 0.7880085543010041, + "learning_rate": 5.845522495881642e-06, + "loss": 0.0971, + "step": 1326 + }, + { + "epoch": 1.0205571565802114, + "grad_norm": 0.8289396285343062, + "learning_rate": 5.840871133701542e-06, + "loss": 0.102, + "step": 1328 + }, + { + "epoch": 1.0220941402497599, + "grad_norm": 0.7142434734819356, + "learning_rate": 5.836212278997961e-06, + "loss": 0.0883, + "step": 1330 + }, + { + "epoch": 1.0236311239193083, + "grad_norm": 0.7250033576245166, + "learning_rate": 5.8315459466826895e-06, + "loss": 0.0915, + "step": 1332 + }, + { + "epoch": 1.0251681075888568, + "grad_norm": 0.7570476788775075, + "learning_rate": 5.826872151691452e-06, + "loss": 0.0868, + "step": 1334 + }, + { + "epoch": 1.0267050912584055, + "grad_norm": 0.7965255668625996, + "learning_rate": 5.822190908983859e-06, + "loss": 0.0883, + "step": 1336 + }, + { + "epoch": 1.028242074927954, + "grad_norm": 0.8376697395117676, + "learning_rate": 5.817502233543355e-06, + "loss": 0.0991, + "step": 1338 + }, + { + "epoch": 1.0297790585975024, + "grad_norm": 0.7468897147308834, + "learning_rate": 5.8128061403771815e-06, + "loss": 0.0885, + "step": 1340 + }, + { + "epoch": 1.0313160422670509, + "grad_norm": 0.8645299006954267, + "learning_rate": 5.8081026445163184e-06, + "loss": 0.1036, + "step": 1342 + }, + { + "epoch": 1.0328530259365993, + "grad_norm": 0.8029697731003538, + "learning_rate": 5.80339176101544e-06, + "loss": 0.1027, + "step": 1344 + }, + { + "epoch": 1.034390009606148, + "grad_norm": 0.7986403468821157, + "learning_rate": 5.798673504952866e-06, + "loss": 0.0982, + "step": 1346 + }, + { + "epoch": 1.0359269932756965, + "grad_norm": 0.8210887006805161, + "learning_rate": 5.793947891430516e-06, + "loss": 0.0973, + "step": 1348 + }, + { + "epoch": 1.037463976945245, + "grad_norm": 0.7710919739110469, + "learning_rate": 5.789214935573857e-06, + "loss": 0.097, + "step": 1350 + }, + { + "epoch": 1.0390009606147934, + "grad_norm": 0.7754287784899241, + "learning_rate": 5.784474652531857e-06, + "loss": 0.0962, + "step": 1352 + }, + { + "epoch": 1.0405379442843419, + "grad_norm": 0.7811613718003362, + "learning_rate": 5.779727057476938e-06, + "loss": 0.101, + "step": 1354 + }, + { + "epoch": 1.0420749279538906, + "grad_norm": 0.7683352035150061, + "learning_rate": 5.774972165604923e-06, + "loss": 0.0926, + "step": 1356 + }, + { + "epoch": 1.043611911623439, + "grad_norm": 0.8121646663638018, + "learning_rate": 5.770209992134992e-06, + "loss": 0.1025, + "step": 1358 + }, + { + "epoch": 1.0451488952929875, + "grad_norm": 0.7946845821407413, + "learning_rate": 5.765440552309633e-06, + "loss": 0.098, + "step": 1360 + }, + { + "epoch": 1.046685878962536, + "grad_norm": 0.8107985568570227, + "learning_rate": 5.760663861394589e-06, + "loss": 0.0939, + "step": 1362 + }, + { + "epoch": 1.0482228626320846, + "grad_norm": 0.7738201688286012, + "learning_rate": 5.755879934678815e-06, + "loss": 0.0977, + "step": 1364 + }, + { + "epoch": 1.049759846301633, + "grad_norm": 0.8043266326845157, + "learning_rate": 5.751088787474421e-06, + "loss": 0.104, + "step": 1366 + }, + { + "epoch": 1.0512968299711816, + "grad_norm": 0.8212861671574773, + "learning_rate": 5.746290435116633e-06, + "loss": 0.1012, + "step": 1368 + }, + { + "epoch": 1.05283381364073, + "grad_norm": 0.8217655361068997, + "learning_rate": 5.7414848929637344e-06, + "loss": 0.0948, + "step": 1370 + }, + { + "epoch": 1.0543707973102785, + "grad_norm": 0.7398326798927687, + "learning_rate": 5.7366721763970276e-06, + "loss": 0.0926, + "step": 1372 + }, + { + "epoch": 1.0559077809798272, + "grad_norm": 0.8684068565935559, + "learning_rate": 5.73185230082077e-06, + "loss": 0.0977, + "step": 1374 + }, + { + "epoch": 1.0574447646493756, + "grad_norm": 0.812673696526823, + "learning_rate": 5.727025281662141e-06, + "loss": 0.0922, + "step": 1376 + }, + { + "epoch": 1.058981748318924, + "grad_norm": 0.8179613499881782, + "learning_rate": 5.722191134371179e-06, + "loss": 0.0944, + "step": 1378 + }, + { + "epoch": 1.0605187319884726, + "grad_norm": 0.810924645502839, + "learning_rate": 5.717349874420742e-06, + "loss": 0.0924, + "step": 1380 + }, + { + "epoch": 1.062055715658021, + "grad_norm": 0.763255489678059, + "learning_rate": 5.71250151730645e-06, + "loss": 0.0896, + "step": 1382 + }, + { + "epoch": 1.0635926993275697, + "grad_norm": 0.8239338886221861, + "learning_rate": 5.707646078546642e-06, + "loss": 0.1043, + "step": 1384 + }, + { + "epoch": 1.0651296829971182, + "grad_norm": 0.7862418625776587, + "learning_rate": 5.702783573682323e-06, + "loss": 0.0927, + "step": 1386 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.7694175148908957, + "learning_rate": 5.697914018277113e-06, + "loss": 0.0905, + "step": 1388 + }, + { + "epoch": 1.0682036503362151, + "grad_norm": 0.8056119084783939, + "learning_rate": 5.693037427917201e-06, + "loss": 0.0964, + "step": 1390 + }, + { + "epoch": 1.0697406340057638, + "grad_norm": 0.827006265409657, + "learning_rate": 5.688153818211293e-06, + "loss": 0.0905, + "step": 1392 + }, + { + "epoch": 1.0712776176753123, + "grad_norm": 0.8282583664130263, + "learning_rate": 5.683263204790561e-06, + "loss": 0.0915, + "step": 1394 + }, + { + "epoch": 1.0728146013448607, + "grad_norm": 0.7937251940987193, + "learning_rate": 5.678365603308593e-06, + "loss": 0.0969, + "step": 1396 + }, + { + "epoch": 1.0743515850144092, + "grad_norm": 0.7954663404413561, + "learning_rate": 5.673461029441347e-06, + "loss": 0.0996, + "step": 1398 + }, + { + "epoch": 1.0758885686839577, + "grad_norm": 0.8270927064110352, + "learning_rate": 5.668549498887098e-06, + "loss": 0.1034, + "step": 1400 + }, + { + "epoch": 1.0774255523535063, + "grad_norm": 0.7670514687913251, + "learning_rate": 5.663631027366382e-06, + "loss": 0.0913, + "step": 1402 + }, + { + "epoch": 1.0789625360230548, + "grad_norm": 0.8715536438358069, + "learning_rate": 5.658705630621959e-06, + "loss": 0.1004, + "step": 1404 + }, + { + "epoch": 1.0804995196926033, + "grad_norm": 0.8161604730434415, + "learning_rate": 5.653773324418748e-06, + "loss": 0.0943, + "step": 1406 + }, + { + "epoch": 1.0820365033621517, + "grad_norm": 0.8667760108772686, + "learning_rate": 5.648834124543787e-06, + "loss": 0.0954, + "step": 1408 + }, + { + "epoch": 1.0835734870317002, + "grad_norm": 0.8231577798724082, + "learning_rate": 5.643888046806179e-06, + "loss": 0.1, + "step": 1410 + }, + { + "epoch": 1.0851104707012489, + "grad_norm": 0.7413599439846686, + "learning_rate": 5.638935107037041e-06, + "loss": 0.0866, + "step": 1412 + }, + { + "epoch": 1.0866474543707973, + "grad_norm": 0.7754362451670733, + "learning_rate": 5.633975321089452e-06, + "loss": 0.0964, + "step": 1414 + }, + { + "epoch": 1.0881844380403458, + "grad_norm": 0.7854818133378504, + "learning_rate": 5.629008704838404e-06, + "loss": 0.0978, + "step": 1416 + }, + { + "epoch": 1.0897214217098943, + "grad_norm": 0.8230491957163081, + "learning_rate": 5.624035274180755e-06, + "loss": 0.0904, + "step": 1418 + }, + { + "epoch": 1.0912584053794427, + "grad_norm": 0.8415692370246974, + "learning_rate": 5.619055045035168e-06, + "loss": 0.0949, + "step": 1420 + }, + { + "epoch": 1.0927953890489914, + "grad_norm": 0.7752546061977003, + "learning_rate": 5.6140680333420714e-06, + "loss": 0.0977, + "step": 1422 + }, + { + "epoch": 1.09433237271854, + "grad_norm": 0.796060251145471, + "learning_rate": 5.609074255063598e-06, + "loss": 0.0986, + "step": 1424 + }, + { + "epoch": 1.0958693563880884, + "grad_norm": 0.8014273381014567, + "learning_rate": 5.604073726183545e-06, + "loss": 0.0996, + "step": 1426 + }, + { + "epoch": 1.0974063400576368, + "grad_norm": 0.8574802894140412, + "learning_rate": 5.599066462707311e-06, + "loss": 0.0915, + "step": 1428 + }, + { + "epoch": 1.0989433237271853, + "grad_norm": 0.7973180847732659, + "learning_rate": 5.594052480661852e-06, + "loss": 0.098, + "step": 1430 + }, + { + "epoch": 1.100480307396734, + "grad_norm": 0.7978978724248726, + "learning_rate": 5.589031796095629e-06, + "loss": 0.095, + "step": 1432 + }, + { + "epoch": 1.1020172910662824, + "grad_norm": 0.7735364885990075, + "learning_rate": 5.584004425078556e-06, + "loss": 0.0977, + "step": 1434 + }, + { + "epoch": 1.103554274735831, + "grad_norm": 0.824645677844322, + "learning_rate": 5.5789703837019465e-06, + "loss": 0.101, + "step": 1436 + }, + { + "epoch": 1.1050912584053794, + "grad_norm": 0.8045096798761333, + "learning_rate": 5.5739296880784685e-06, + "loss": 0.0995, + "step": 1438 + }, + { + "epoch": 1.106628242074928, + "grad_norm": 0.7785417170752582, + "learning_rate": 5.568882354342084e-06, + "loss": 0.0944, + "step": 1440 + }, + { + "epoch": 1.1081652257444765, + "grad_norm": 0.7865456166072924, + "learning_rate": 5.5638283986480055e-06, + "loss": 0.0993, + "step": 1442 + }, + { + "epoch": 1.109702209414025, + "grad_norm": 0.7482822898483027, + "learning_rate": 5.5587678371726365e-06, + "loss": 0.0923, + "step": 1444 + }, + { + "epoch": 1.1112391930835734, + "grad_norm": 0.847672498751733, + "learning_rate": 5.553700686113528e-06, + "loss": 0.0924, + "step": 1446 + }, + { + "epoch": 1.112776176753122, + "grad_norm": 0.8088910111550272, + "learning_rate": 5.54862696168932e-06, + "loss": 0.0969, + "step": 1448 + }, + { + "epoch": 1.1143131604226706, + "grad_norm": 0.7477679679317595, + "learning_rate": 5.543546680139695e-06, + "loss": 0.089, + "step": 1450 + }, + { + "epoch": 1.115850144092219, + "grad_norm": 0.8008986821135872, + "learning_rate": 5.5384598577253185e-06, + "loss": 0.0979, + "step": 1452 + }, + { + "epoch": 1.1173871277617675, + "grad_norm": 0.8322762593719337, + "learning_rate": 5.533366510727797e-06, + "loss": 0.0959, + "step": 1454 + }, + { + "epoch": 1.118924111431316, + "grad_norm": 0.7810694389767996, + "learning_rate": 5.528266655449615e-06, + "loss": 0.1004, + "step": 1456 + }, + { + "epoch": 1.1204610951008647, + "grad_norm": 0.8168615959182653, + "learning_rate": 5.523160308214093e-06, + "loss": 0.0934, + "step": 1458 + }, + { + "epoch": 1.1219980787704131, + "grad_norm": 0.8202613520761898, + "learning_rate": 5.518047485365328e-06, + "loss": 0.0923, + "step": 1460 + }, + { + "epoch": 1.1235350624399616, + "grad_norm": 0.7506685980798529, + "learning_rate": 5.512928203268145e-06, + "loss": 0.0983, + "step": 1462 + }, + { + "epoch": 1.12507204610951, + "grad_norm": 0.7964645064908876, + "learning_rate": 5.507802478308043e-06, + "loss": 0.1018, + "step": 1464 + }, + { + "epoch": 1.1266090297790585, + "grad_norm": 0.7856943241580147, + "learning_rate": 5.502670326891141e-06, + "loss": 0.0897, + "step": 1466 + }, + { + "epoch": 1.1281460134486072, + "grad_norm": 0.9369819821403769, + "learning_rate": 5.497531765444132e-06, + "loss": 0.0997, + "step": 1468 + }, + { + "epoch": 1.1296829971181557, + "grad_norm": 0.8411240897070034, + "learning_rate": 5.492386810414222e-06, + "loss": 0.099, + "step": 1470 + }, + { + "epoch": 1.1312199807877041, + "grad_norm": 0.7880140283795076, + "learning_rate": 5.48723547826908e-06, + "loss": 0.0962, + "step": 1472 + }, + { + "epoch": 1.1327569644572526, + "grad_norm": 0.7705964805253543, + "learning_rate": 5.482077785496794e-06, + "loss": 0.0931, + "step": 1474 + }, + { + "epoch": 1.134293948126801, + "grad_norm": 0.8165812210553237, + "learning_rate": 5.4769137486058e-06, + "loss": 0.1001, + "step": 1476 + }, + { + "epoch": 1.1358309317963498, + "grad_norm": 0.8057592991161191, + "learning_rate": 5.471743384124848e-06, + "loss": 0.0943, + "step": 1478 + }, + { + "epoch": 1.1373679154658982, + "grad_norm": 0.7946279160573669, + "learning_rate": 5.466566708602939e-06, + "loss": 0.0968, + "step": 1480 + }, + { + "epoch": 1.1389048991354467, + "grad_norm": 0.7589266561686325, + "learning_rate": 5.461383738609272e-06, + "loss": 0.0919, + "step": 1482 + }, + { + "epoch": 1.1404418828049951, + "grad_norm": 0.7750698355623663, + "learning_rate": 5.456194490733194e-06, + "loss": 0.0905, + "step": 1484 + }, + { + "epoch": 1.1419788664745436, + "grad_norm": 0.7612447169194039, + "learning_rate": 5.450998981584148e-06, + "loss": 0.0876, + "step": 1486 + }, + { + "epoch": 1.1435158501440923, + "grad_norm": 0.7659227044968493, + "learning_rate": 5.445797227791616e-06, + "loss": 0.0913, + "step": 1488 + }, + { + "epoch": 1.1450528338136408, + "grad_norm": 0.8296191241219513, + "learning_rate": 5.440589246005064e-06, + "loss": 0.1023, + "step": 1490 + }, + { + "epoch": 1.1465898174831892, + "grad_norm": 0.8332033205007243, + "learning_rate": 5.4353750528938995e-06, + "loss": 0.1018, + "step": 1492 + }, + { + "epoch": 1.1481268011527377, + "grad_norm": 0.755633743693491, + "learning_rate": 5.430154665147406e-06, + "loss": 0.0982, + "step": 1494 + }, + { + "epoch": 1.1496637848222861, + "grad_norm": 0.8294945642825525, + "learning_rate": 5.424928099474694e-06, + "loss": 0.101, + "step": 1496 + }, + { + "epoch": 1.1512007684918348, + "grad_norm": 0.8705045376216894, + "learning_rate": 5.419695372604652e-06, + "loss": 0.0966, + "step": 1498 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.8386991034461505, + "learning_rate": 5.414456501285883e-06, + "loss": 0.0948, + "step": 1500 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 0.14147287607192993, + "eval_runtime": 362.5666, + "eval_samples_per_second": 51.039, + "eval_steps_per_second": 6.382, + "step": 1500 + }, + { + "epoch": 1.1542747358309318, + "grad_norm": 0.808895295948306, + "learning_rate": 5.409211502286663e-06, + "loss": 0.1023, + "step": 1502 + }, + { + "epoch": 1.1558117195004802, + "grad_norm": 0.8129387791807478, + "learning_rate": 5.403960392394877e-06, + "loss": 0.0917, + "step": 1504 + }, + { + "epoch": 1.157348703170029, + "grad_norm": 0.8709858446003281, + "learning_rate": 5.398703188417971e-06, + "loss": 0.1076, + "step": 1506 + }, + { + "epoch": 1.1588856868395774, + "grad_norm": 0.7731592974500305, + "learning_rate": 5.393439907182895e-06, + "loss": 0.0932, + "step": 1508 + }, + { + "epoch": 1.1604226705091258, + "grad_norm": 0.8193191652163263, + "learning_rate": 5.388170565536052e-06, + "loss": 0.087, + "step": 1510 + }, + { + "epoch": 1.1619596541786743, + "grad_norm": 0.8821413504599133, + "learning_rate": 5.382895180343243e-06, + "loss": 0.1016, + "step": 1512 + }, + { + "epoch": 1.1634966378482228, + "grad_norm": 0.8330757982422947, + "learning_rate": 5.377613768489613e-06, + "loss": 0.0902, + "step": 1514 + }, + { + "epoch": 1.1650336215177715, + "grad_norm": 0.7755643958777008, + "learning_rate": 5.372326346879597e-06, + "loss": 0.0935, + "step": 1516 + }, + { + "epoch": 1.16657060518732, + "grad_norm": 0.7798742092931141, + "learning_rate": 5.367032932436863e-06, + "loss": 0.0907, + "step": 1518 + }, + { + "epoch": 1.1681075888568684, + "grad_norm": 0.7669670923992181, + "learning_rate": 5.3617335421042644e-06, + "loss": 0.1023, + "step": 1520 + }, + { + "epoch": 1.1696445725264168, + "grad_norm": 0.8341959799418742, + "learning_rate": 5.3564281928437785e-06, + "loss": 0.1016, + "step": 1522 + }, + { + "epoch": 1.1711815561959655, + "grad_norm": 0.7444403179993622, + "learning_rate": 5.351116901636459e-06, + "loss": 0.0895, + "step": 1524 + }, + { + "epoch": 1.172718539865514, + "grad_norm": 0.788295757514909, + "learning_rate": 5.3457996854823736e-06, + "loss": 0.0955, + "step": 1526 + }, + { + "epoch": 1.1742555235350625, + "grad_norm": 0.8581606629276641, + "learning_rate": 5.340476561400559e-06, + "loss": 0.0956, + "step": 1528 + }, + { + "epoch": 1.175792507204611, + "grad_norm": 0.7680372748823804, + "learning_rate": 5.33514754642896e-06, + "loss": 0.0892, + "step": 1530 + }, + { + "epoch": 1.1773294908741594, + "grad_norm": 0.8579631291966535, + "learning_rate": 5.329812657624374e-06, + "loss": 0.0947, + "step": 1532 + }, + { + "epoch": 1.178866474543708, + "grad_norm": 0.8028770597268665, + "learning_rate": 5.324471912062402e-06, + "loss": 0.0919, + "step": 1534 + }, + { + "epoch": 1.1804034582132565, + "grad_norm": 0.8274846559591892, + "learning_rate": 5.319125326837392e-06, + "loss": 0.0931, + "step": 1536 + }, + { + "epoch": 1.181940441882805, + "grad_norm": 0.8099150871185563, + "learning_rate": 5.3137729190623784e-06, + "loss": 0.1059, + "step": 1538 + }, + { + "epoch": 1.1834774255523535, + "grad_norm": 0.7888074507502317, + "learning_rate": 5.308414705869037e-06, + "loss": 0.0948, + "step": 1540 + }, + { + "epoch": 1.185014409221902, + "grad_norm": 0.736344371575378, + "learning_rate": 5.3030507044076244e-06, + "loss": 0.0927, + "step": 1542 + }, + { + "epoch": 1.1865513928914506, + "grad_norm": 0.8067822180532194, + "learning_rate": 5.29768093184692e-06, + "loss": 0.0916, + "step": 1544 + }, + { + "epoch": 1.188088376560999, + "grad_norm": 0.8588531827838762, + "learning_rate": 5.29230540537418e-06, + "loss": 0.0962, + "step": 1546 + }, + { + "epoch": 1.1896253602305475, + "grad_norm": 0.8183423934241679, + "learning_rate": 5.286924142195075e-06, + "loss": 0.0908, + "step": 1548 + }, + { + "epoch": 1.191162343900096, + "grad_norm": 0.7893384100991108, + "learning_rate": 5.2815371595336375e-06, + "loss": 0.0949, + "step": 1550 + }, + { + "epoch": 1.1926993275696445, + "grad_norm": 0.8369129068810596, + "learning_rate": 5.2761444746322054e-06, + "loss": 0.1004, + "step": 1552 + }, + { + "epoch": 1.1942363112391932, + "grad_norm": 0.852519706321052, + "learning_rate": 5.270746104751371e-06, + "loss": 0.1071, + "step": 1554 + }, + { + "epoch": 1.1957732949087416, + "grad_norm": 0.7067264280513246, + "learning_rate": 5.265342067169921e-06, + "loss": 0.0866, + "step": 1556 + }, + { + "epoch": 1.19731027857829, + "grad_norm": 0.7807816607071811, + "learning_rate": 5.259932379184782e-06, + "loss": 0.0947, + "step": 1558 + }, + { + "epoch": 1.1988472622478386, + "grad_norm": 0.7595283315156356, + "learning_rate": 5.254517058110968e-06, + "loss": 0.093, + "step": 1560 + }, + { + "epoch": 1.200384245917387, + "grad_norm": 0.7355421808089894, + "learning_rate": 5.249096121281521e-06, + "loss": 0.0929, + "step": 1562 + }, + { + "epoch": 1.2019212295869357, + "grad_norm": 0.8309945335729965, + "learning_rate": 5.243669586047459e-06, + "loss": 0.1038, + "step": 1564 + }, + { + "epoch": 1.2034582132564842, + "grad_norm": 0.7783243017854818, + "learning_rate": 5.238237469777719e-06, + "loss": 0.0919, + "step": 1566 + }, + { + "epoch": 1.2049951969260326, + "grad_norm": 0.7536478224038158, + "learning_rate": 5.232799789859102e-06, + "loss": 0.0832, + "step": 1568 + }, + { + "epoch": 1.206532180595581, + "grad_norm": 0.8375220966676139, + "learning_rate": 5.227356563696215e-06, + "loss": 0.0933, + "step": 1570 + }, + { + "epoch": 1.2080691642651298, + "grad_norm": 0.8168530200077441, + "learning_rate": 5.221907808711418e-06, + "loss": 0.0962, + "step": 1572 + }, + { + "epoch": 1.2096061479346782, + "grad_norm": 0.7669312918946458, + "learning_rate": 5.216453542344768e-06, + "loss": 0.0876, + "step": 1574 + }, + { + "epoch": 1.2111431316042267, + "grad_norm": 0.8257303860560026, + "learning_rate": 5.210993782053961e-06, + "loss": 0.0923, + "step": 1576 + }, + { + "epoch": 1.2126801152737752, + "grad_norm": 0.8359416165543359, + "learning_rate": 5.205528545314281e-06, + "loss": 0.09, + "step": 1578 + }, + { + "epoch": 1.2142170989433236, + "grad_norm": 0.8022199675695576, + "learning_rate": 5.200057849618535e-06, + "loss": 0.0928, + "step": 1580 + }, + { + "epoch": 1.2157540826128723, + "grad_norm": 0.809397949668016, + "learning_rate": 5.194581712477007e-06, + "loss": 0.1011, + "step": 1582 + }, + { + "epoch": 1.2172910662824208, + "grad_norm": 0.7620719694922747, + "learning_rate": 5.1891001514173994e-06, + "loss": 0.0947, + "step": 1584 + }, + { + "epoch": 1.2188280499519693, + "grad_norm": 0.8748786606947648, + "learning_rate": 5.183613183984768e-06, + "loss": 0.1022, + "step": 1586 + }, + { + "epoch": 1.2203650336215177, + "grad_norm": 0.8471054549064284, + "learning_rate": 5.178120827741481e-06, + "loss": 0.0994, + "step": 1588 + }, + { + "epoch": 1.2219020172910664, + "grad_norm": 0.8328172662255555, + "learning_rate": 5.172623100267148e-06, + "loss": 0.09, + "step": 1590 + }, + { + "epoch": 1.2234390009606149, + "grad_norm": 0.7751726026024895, + "learning_rate": 5.167120019158578e-06, + "loss": 0.0872, + "step": 1592 + }, + { + "epoch": 1.2249759846301633, + "grad_norm": 0.8122210518131329, + "learning_rate": 5.1616116020297065e-06, + "loss": 0.0901, + "step": 1594 + }, + { + "epoch": 1.2265129682997118, + "grad_norm": 0.7575239070267672, + "learning_rate": 5.1560978665115555e-06, + "loss": 0.0911, + "step": 1596 + }, + { + "epoch": 1.2280499519692603, + "grad_norm": 0.7802284165444789, + "learning_rate": 5.150578830252168e-06, + "loss": 0.0943, + "step": 1598 + }, + { + "epoch": 1.229586935638809, + "grad_norm": 0.8020724802994549, + "learning_rate": 5.145054510916552e-06, + "loss": 0.0861, + "step": 1600 + }, + { + "epoch": 1.2311239193083574, + "grad_norm": 0.9021178290335383, + "learning_rate": 5.139524926186624e-06, + "loss": 0.1049, + "step": 1602 + }, + { + "epoch": 1.2326609029779059, + "grad_norm": 0.7954364435694474, + "learning_rate": 5.133990093761158e-06, + "loss": 0.0975, + "step": 1604 + }, + { + "epoch": 1.2341978866474543, + "grad_norm": 0.862849089104852, + "learning_rate": 5.1284500313557214e-06, + "loss": 0.1001, + "step": 1606 + }, + { + "epoch": 1.2357348703170028, + "grad_norm": 0.8234510165839098, + "learning_rate": 5.122904756702622e-06, + "loss": 0.096, + "step": 1608 + }, + { + "epoch": 1.2372718539865515, + "grad_norm": 0.8180910858318627, + "learning_rate": 5.1173542875508495e-06, + "loss": 0.0929, + "step": 1610 + }, + { + "epoch": 1.2388088376561, + "grad_norm": 0.8254978234033816, + "learning_rate": 5.111798641666022e-06, + "loss": 0.0986, + "step": 1612 + }, + { + "epoch": 1.2403458213256484, + "grad_norm": 0.7970300036804161, + "learning_rate": 5.1062378368303286e-06, + "loss": 0.0968, + "step": 1614 + }, + { + "epoch": 1.2418828049951969, + "grad_norm": 0.7942368711186357, + "learning_rate": 5.100671890842464e-06, + "loss": 0.0946, + "step": 1616 + }, + { + "epoch": 1.2434197886647453, + "grad_norm": 0.781461470784258, + "learning_rate": 5.095100821517586e-06, + "loss": 0.0909, + "step": 1618 + }, + { + "epoch": 1.244956772334294, + "grad_norm": 0.7637849604119566, + "learning_rate": 5.089524646687245e-06, + "loss": 0.089, + "step": 1620 + }, + { + "epoch": 1.2464937560038425, + "grad_norm": 0.8322634659167238, + "learning_rate": 5.083943384199339e-06, + "loss": 0.0946, + "step": 1622 + }, + { + "epoch": 1.248030739673391, + "grad_norm": 0.8673983066924847, + "learning_rate": 5.078357051918042e-06, + "loss": 0.0997, + "step": 1624 + }, + { + "epoch": 1.2495677233429394, + "grad_norm": 0.8093921155750359, + "learning_rate": 5.072765667723763e-06, + "loss": 0.0924, + "step": 1626 + }, + { + "epoch": 1.2511047070124879, + "grad_norm": 0.7682990135468413, + "learning_rate": 5.067169249513078e-06, + "loss": 0.0908, + "step": 1628 + }, + { + "epoch": 1.2526416906820366, + "grad_norm": 0.9219480609910328, + "learning_rate": 5.061567815198674e-06, + "loss": 0.0992, + "step": 1630 + }, + { + "epoch": 1.254178674351585, + "grad_norm": 0.7864964134564114, + "learning_rate": 5.055961382709294e-06, + "loss": 0.0856, + "step": 1632 + }, + { + "epoch": 1.2557156580211335, + "grad_norm": 0.789369364367871, + "learning_rate": 5.05034996998968e-06, + "loss": 0.101, + "step": 1634 + }, + { + "epoch": 1.257252641690682, + "grad_norm": 0.785618855267339, + "learning_rate": 5.044733595000514e-06, + "loss": 0.0913, + "step": 1636 + }, + { + "epoch": 1.2587896253602304, + "grad_norm": 0.8540857446211212, + "learning_rate": 5.0391122757183605e-06, + "loss": 0.1008, + "step": 1638 + }, + { + "epoch": 1.260326609029779, + "grad_norm": 0.8815323610676825, + "learning_rate": 5.03348603013561e-06, + "loss": 0.0967, + "step": 1640 + }, + { + "epoch": 1.2618635926993276, + "grad_norm": 0.8589693358904777, + "learning_rate": 5.02785487626042e-06, + "loss": 0.0979, + "step": 1642 + }, + { + "epoch": 1.263400576368876, + "grad_norm": 0.8415206231916414, + "learning_rate": 5.022218832116659e-06, + "loss": 0.1038, + "step": 1644 + }, + { + "epoch": 1.2649375600384247, + "grad_norm": 0.8392847365814639, + "learning_rate": 5.016577915743848e-06, + "loss": 0.0999, + "step": 1646 + }, + { + "epoch": 1.266474543707973, + "grad_norm": 0.840680092317177, + "learning_rate": 5.010932145197101e-06, + "loss": 0.0959, + "step": 1648 + }, + { + "epoch": 1.2680115273775217, + "grad_norm": 0.8176605216769884, + "learning_rate": 5.005281538547071e-06, + "loss": 0.0968, + "step": 1650 + }, + { + "epoch": 1.2695485110470701, + "grad_norm": 0.7636164672938492, + "learning_rate": 4.999626113879891e-06, + "loss": 0.0855, + "step": 1652 + }, + { + "epoch": 1.2710854947166186, + "grad_norm": 0.8820138872988845, + "learning_rate": 4.9939658892971106e-06, + "loss": 0.0963, + "step": 1654 + }, + { + "epoch": 1.2726224783861673, + "grad_norm": 0.8230226501954376, + "learning_rate": 4.9883008829156475e-06, + "loss": 0.0908, + "step": 1656 + }, + { + "epoch": 1.2741594620557157, + "grad_norm": 0.7954482370699276, + "learning_rate": 4.982631112867724e-06, + "loss": 0.0949, + "step": 1658 + }, + { + "epoch": 1.2756964457252642, + "grad_norm": 0.7446617995311455, + "learning_rate": 4.976956597300806e-06, + "loss": 0.0922, + "step": 1660 + }, + { + "epoch": 1.2772334293948127, + "grad_norm": 0.8316466452619663, + "learning_rate": 4.971277354377554e-06, + "loss": 0.0961, + "step": 1662 + }, + { + "epoch": 1.2787704130643611, + "grad_norm": 0.8717333357230952, + "learning_rate": 4.965593402275754e-06, + "loss": 0.0989, + "step": 1664 + }, + { + "epoch": 1.2803073967339098, + "grad_norm": 0.8635438600294546, + "learning_rate": 4.959904759188271e-06, + "loss": 0.0965, + "step": 1666 + }, + { + "epoch": 1.2818443804034583, + "grad_norm": 0.8348320154619787, + "learning_rate": 4.954211443322978e-06, + "loss": 0.0982, + "step": 1668 + }, + { + "epoch": 1.2833813640730067, + "grad_norm": 0.7587760801820468, + "learning_rate": 4.948513472902709e-06, + "loss": 0.0894, + "step": 1670 + }, + { + "epoch": 1.2849183477425552, + "grad_norm": 0.823101148418266, + "learning_rate": 4.942810866165194e-06, + "loss": 0.1011, + "step": 1672 + }, + { + "epoch": 1.2864553314121037, + "grad_norm": 0.8615662400758028, + "learning_rate": 4.937103641363004e-06, + "loss": 0.0965, + "step": 1674 + }, + { + "epoch": 1.2879923150816524, + "grad_norm": 0.7508304626813055, + "learning_rate": 4.931391816763491e-06, + "loss": 0.0902, + "step": 1676 + }, + { + "epoch": 1.2895292987512008, + "grad_norm": 0.7605099804332683, + "learning_rate": 4.925675410648728e-06, + "loss": 0.0957, + "step": 1678 + }, + { + "epoch": 1.2910662824207493, + "grad_norm": 0.8276765994390878, + "learning_rate": 4.919954441315453e-06, + "loss": 0.0928, + "step": 1680 + }, + { + "epoch": 1.2926032660902977, + "grad_norm": 0.8394305545603477, + "learning_rate": 4.914228927075014e-06, + "loss": 0.0933, + "step": 1682 + }, + { + "epoch": 1.2941402497598462, + "grad_norm": 0.8228704133694662, + "learning_rate": 4.908498886253298e-06, + "loss": 0.0954, + "step": 1684 + }, + { + "epoch": 1.295677233429395, + "grad_norm": 0.8428554748260062, + "learning_rate": 4.902764337190685e-06, + "loss": 0.101, + "step": 1686 + }, + { + "epoch": 1.2972142170989434, + "grad_norm": 0.8148117938502604, + "learning_rate": 4.897025298241987e-06, + "loss": 0.0939, + "step": 1688 + }, + { + "epoch": 1.2987512007684918, + "grad_norm": 0.7520640222305418, + "learning_rate": 4.891281787776383e-06, + "loss": 0.0897, + "step": 1690 + }, + { + "epoch": 1.3002881844380403, + "grad_norm": 0.7757726426358977, + "learning_rate": 4.885533824177365e-06, + "loss": 0.0907, + "step": 1692 + }, + { + "epoch": 1.3018251681075887, + "grad_norm": 0.8281519655081531, + "learning_rate": 4.87978142584268e-06, + "loss": 0.102, + "step": 1694 + }, + { + "epoch": 1.3033621517771374, + "grad_norm": 0.845824623898633, + "learning_rate": 4.874024611184266e-06, + "loss": 0.0958, + "step": 1696 + }, + { + "epoch": 1.304899135446686, + "grad_norm": 0.8422886597995062, + "learning_rate": 4.868263398628203e-06, + "loss": 0.0944, + "step": 1698 + }, + { + "epoch": 1.3064361191162344, + "grad_norm": 0.8314043674269285, + "learning_rate": 4.86249780661464e-06, + "loss": 0.1048, + "step": 1700 + }, + { + "epoch": 1.3079731027857828, + "grad_norm": 0.8629695831852081, + "learning_rate": 4.8567278535977475e-06, + "loss": 0.1105, + "step": 1702 + }, + { + "epoch": 1.3095100864553313, + "grad_norm": 0.8379948759443979, + "learning_rate": 4.850953558045653e-06, + "loss": 0.0973, + "step": 1704 + }, + { + "epoch": 1.31104707012488, + "grad_norm": 0.7985221017745798, + "learning_rate": 4.845174938440386e-06, + "loss": 0.0994, + "step": 1706 + }, + { + "epoch": 1.3125840537944284, + "grad_norm": 0.8271445315963422, + "learning_rate": 4.8393920132778144e-06, + "loss": 0.089, + "step": 1708 + }, + { + "epoch": 1.314121037463977, + "grad_norm": 0.8199327418927324, + "learning_rate": 4.833604801067585e-06, + "loss": 0.0979, + "step": 1710 + }, + { + "epoch": 1.3156580211335256, + "grad_norm": 0.7802159907934838, + "learning_rate": 4.827813320333071e-06, + "loss": 0.0891, + "step": 1712 + }, + { + "epoch": 1.3171950048030738, + "grad_norm": 0.8828098223970235, + "learning_rate": 4.822017589611302e-06, + "loss": 0.0968, + "step": 1714 + }, + { + "epoch": 1.3187319884726225, + "grad_norm": 0.7743361926366948, + "learning_rate": 4.816217627452917e-06, + "loss": 0.0901, + "step": 1716 + }, + { + "epoch": 1.320268972142171, + "grad_norm": 0.8452670586980338, + "learning_rate": 4.810413452422094e-06, + "loss": 0.0923, + "step": 1718 + }, + { + "epoch": 1.3218059558117194, + "grad_norm": 0.8330276027385624, + "learning_rate": 4.804605083096499e-06, + "loss": 0.0952, + "step": 1720 + }, + { + "epoch": 1.3233429394812681, + "grad_norm": 0.805109828914744, + "learning_rate": 4.798792538067218e-06, + "loss": 0.0946, + "step": 1722 + }, + { + "epoch": 1.3248799231508166, + "grad_norm": 0.8508784027134717, + "learning_rate": 4.792975835938709e-06, + "loss": 0.0957, + "step": 1724 + }, + { + "epoch": 1.326416906820365, + "grad_norm": 0.7937254392396291, + "learning_rate": 4.787154995328729e-06, + "loss": 0.0948, + "step": 1726 + }, + { + "epoch": 1.3279538904899135, + "grad_norm": 0.7723472021275456, + "learning_rate": 4.781330034868287e-06, + "loss": 0.0954, + "step": 1728 + }, + { + "epoch": 1.329490874159462, + "grad_norm": 0.8147194013177234, + "learning_rate": 4.775500973201573e-06, + "loss": 0.0994, + "step": 1730 + }, + { + "epoch": 1.3310278578290107, + "grad_norm": 0.7865073470553021, + "learning_rate": 4.76966782898591e-06, + "loss": 0.0908, + "step": 1732 + }, + { + "epoch": 1.3325648414985591, + "grad_norm": 0.8909496268629883, + "learning_rate": 4.763830620891682e-06, + "loss": 0.0994, + "step": 1734 + }, + { + "epoch": 1.3341018251681076, + "grad_norm": 0.8439047859296126, + "learning_rate": 4.757989367602286e-06, + "loss": 0.0948, + "step": 1736 + }, + { + "epoch": 1.335638808837656, + "grad_norm": 0.8146110101810443, + "learning_rate": 4.752144087814062e-06, + "loss": 0.0932, + "step": 1738 + }, + { + "epoch": 1.3371757925072045, + "grad_norm": 0.8639874055710914, + "learning_rate": 4.746294800236241e-06, + "loss": 0.103, + "step": 1740 + }, + { + "epoch": 1.3387127761767532, + "grad_norm": 0.8343222963762533, + "learning_rate": 4.740441523590881e-06, + "loss": 0.0933, + "step": 1742 + }, + { + "epoch": 1.3402497598463017, + "grad_norm": 0.9191398577485145, + "learning_rate": 4.734584276612807e-06, + "loss": 0.0983, + "step": 1744 + }, + { + "epoch": 1.3417867435158501, + "grad_norm": 0.8075250938783436, + "learning_rate": 4.7287230780495525e-06, + "loss": 0.0958, + "step": 1746 + }, + { + "epoch": 1.3433237271853986, + "grad_norm": 0.8741221440894832, + "learning_rate": 4.722857946661299e-06, + "loss": 0.0997, + "step": 1748 + }, + { + "epoch": 1.344860710854947, + "grad_norm": 0.8738624905786072, + "learning_rate": 4.7169889012208174e-06, + "loss": 0.1046, + "step": 1750 + }, + { + "epoch": 1.3463976945244958, + "grad_norm": 0.8003724652951454, + "learning_rate": 4.711115960513405e-06, + "loss": 0.099, + "step": 1752 + }, + { + "epoch": 1.3479346781940442, + "grad_norm": 0.7908228630871454, + "learning_rate": 4.705239143336827e-06, + "loss": 0.0991, + "step": 1754 + }, + { + "epoch": 1.3494716618635927, + "grad_norm": 0.7736523132228244, + "learning_rate": 4.6993584685012554e-06, + "loss": 0.1028, + "step": 1756 + }, + { + "epoch": 1.3510086455331412, + "grad_norm": 0.8141238639955589, + "learning_rate": 4.693473954829211e-06, + "loss": 0.0954, + "step": 1758 + }, + { + "epoch": 1.3525456292026896, + "grad_norm": 0.7526341018348459, + "learning_rate": 4.687585621155502e-06, + "loss": 0.0905, + "step": 1760 + }, + { + "epoch": 1.3540826128722383, + "grad_norm": 0.776573467548187, + "learning_rate": 4.68169348632716e-06, + "loss": 0.098, + "step": 1762 + }, + { + "epoch": 1.3556195965417868, + "grad_norm": 0.8263354203916096, + "learning_rate": 4.675797569203389e-06, + "loss": 0.1029, + "step": 1764 + }, + { + "epoch": 1.3571565802113352, + "grad_norm": 0.7840394777513143, + "learning_rate": 4.669897888655494e-06, + "loss": 0.0938, + "step": 1766 + }, + { + "epoch": 1.3586935638808837, + "grad_norm": 0.8026477022187467, + "learning_rate": 4.663994463566828e-06, + "loss": 0.0972, + "step": 1768 + }, + { + "epoch": 1.3602305475504322, + "grad_norm": 0.7984909693964071, + "learning_rate": 4.658087312832729e-06, + "loss": 0.0934, + "step": 1770 + }, + { + "epoch": 1.3617675312199808, + "grad_norm": 0.8445059001868617, + "learning_rate": 4.652176455360459e-06, + "loss": 0.0969, + "step": 1772 + }, + { + "epoch": 1.3633045148895293, + "grad_norm": 0.8508994597739256, + "learning_rate": 4.646261910069147e-06, + "loss": 0.0895, + "step": 1774 + }, + { + "epoch": 1.3648414985590778, + "grad_norm": 0.7381165146693729, + "learning_rate": 4.640343695889721e-06, + "loss": 0.0893, + "step": 1776 + }, + { + "epoch": 1.3663784822286265, + "grad_norm": 0.7966384986609926, + "learning_rate": 4.634421831764857e-06, + "loss": 0.0992, + "step": 1778 + }, + { + "epoch": 1.3679154658981747, + "grad_norm": 0.7322872414451935, + "learning_rate": 4.628496336648911e-06, + "loss": 0.0868, + "step": 1780 + }, + { + "epoch": 1.3694524495677234, + "grad_norm": 0.7490827880894491, + "learning_rate": 4.6225672295078615e-06, + "loss": 0.0898, + "step": 1782 + }, + { + "epoch": 1.3709894332372718, + "grad_norm": 0.8010232538915442, + "learning_rate": 4.616634529319249e-06, + "loss": 0.0917, + "step": 1784 + }, + { + "epoch": 1.3725264169068203, + "grad_norm": 0.8095923768198692, + "learning_rate": 4.61069825507211e-06, + "loss": 0.092, + "step": 1786 + }, + { + "epoch": 1.374063400576369, + "grad_norm": 0.7861781833880861, + "learning_rate": 4.604758425766928e-06, + "loss": 0.0891, + "step": 1788 + }, + { + "epoch": 1.3756003842459175, + "grad_norm": 0.8541796594703853, + "learning_rate": 4.5988150604155585e-06, + "loss": 0.0962, + "step": 1790 + }, + { + "epoch": 1.377137367915466, + "grad_norm": 0.8065855822446965, + "learning_rate": 4.592868178041181e-06, + "loss": 0.0976, + "step": 1792 + }, + { + "epoch": 1.3786743515850144, + "grad_norm": 0.8147810473519597, + "learning_rate": 4.586917797678225e-06, + "loss": 0.0931, + "step": 1794 + }, + { + "epoch": 1.3802113352545629, + "grad_norm": 0.7931514880597085, + "learning_rate": 4.5809639383723245e-06, + "loss": 0.0952, + "step": 1796 + }, + { + "epoch": 1.3817483189241115, + "grad_norm": 0.8182579050054116, + "learning_rate": 4.57500661918024e-06, + "loss": 0.0971, + "step": 1798 + }, + { + "epoch": 1.38328530259366, + "grad_norm": 0.7690232401474136, + "learning_rate": 4.569045859169814e-06, + "loss": 0.083, + "step": 1800 + }, + { + "epoch": 1.3848222862632085, + "grad_norm": 0.7899387190866939, + "learning_rate": 4.563081677419897e-06, + "loss": 0.0908, + "step": 1802 + }, + { + "epoch": 1.386359269932757, + "grad_norm": 0.8326942252427081, + "learning_rate": 4.557114093020294e-06, + "loss": 0.1004, + "step": 1804 + }, + { + "epoch": 1.3878962536023054, + "grad_norm": 0.868404572818935, + "learning_rate": 4.551143125071698e-06, + "loss": 0.0989, + "step": 1806 + }, + { + "epoch": 1.389433237271854, + "grad_norm": 0.7829935600541224, + "learning_rate": 4.545168792685637e-06, + "loss": 0.0904, + "step": 1808 + }, + { + "epoch": 1.3909702209414025, + "grad_norm": 0.8030113132084329, + "learning_rate": 4.539191114984403e-06, + "loss": 0.0948, + "step": 1810 + }, + { + "epoch": 1.392507204610951, + "grad_norm": 0.8322013967452164, + "learning_rate": 4.533210111101e-06, + "loss": 0.0889, + "step": 1812 + }, + { + "epoch": 1.3940441882804995, + "grad_norm": 0.854566482621519, + "learning_rate": 4.527225800179074e-06, + "loss": 0.0926, + "step": 1814 + }, + { + "epoch": 1.395581171950048, + "grad_norm": 0.8546853000055383, + "learning_rate": 4.521238201372854e-06, + "loss": 0.0985, + "step": 1816 + }, + { + "epoch": 1.3971181556195966, + "grad_norm": 0.7861004953262382, + "learning_rate": 4.5152473338471025e-06, + "loss": 0.0842, + "step": 1818 + }, + { + "epoch": 1.398655139289145, + "grad_norm": 0.8279303392404915, + "learning_rate": 4.509253216777034e-06, + "loss": 0.093, + "step": 1820 + }, + { + "epoch": 1.4001921229586936, + "grad_norm": 0.7776536059593128, + "learning_rate": 4.50325586934827e-06, + "loss": 0.0892, + "step": 1822 + }, + { + "epoch": 1.401729106628242, + "grad_norm": 0.823356682996185, + "learning_rate": 4.497255310756767e-06, + "loss": 0.0873, + "step": 1824 + }, + { + "epoch": 1.4032660902977905, + "grad_norm": 0.781617011211452, + "learning_rate": 4.491251560208766e-06, + "loss": 0.0962, + "step": 1826 + }, + { + "epoch": 1.4048030739673392, + "grad_norm": 0.8182872551902259, + "learning_rate": 4.485244636920716e-06, + "loss": 0.09, + "step": 1828 + }, + { + "epoch": 1.4063400576368876, + "grad_norm": 0.7732731640678324, + "learning_rate": 4.479234560119231e-06, + "loss": 0.0909, + "step": 1830 + }, + { + "epoch": 1.407877041306436, + "grad_norm": 0.8358121488236012, + "learning_rate": 4.473221349041009e-06, + "loss": 0.0945, + "step": 1832 + }, + { + "epoch": 1.4094140249759846, + "grad_norm": 0.783898477086607, + "learning_rate": 4.467205022932788e-06, + "loss": 0.0868, + "step": 1834 + }, + { + "epoch": 1.410951008645533, + "grad_norm": 0.8347502314619144, + "learning_rate": 4.4611856010512696e-06, + "loss": 0.0859, + "step": 1836 + }, + { + "epoch": 1.4124879923150817, + "grad_norm": 0.8029436387846094, + "learning_rate": 4.455163102663071e-06, + "loss": 0.0986, + "step": 1838 + }, + { + "epoch": 1.4140249759846302, + "grad_norm": 0.780720619644275, + "learning_rate": 4.449137547044651e-06, + "loss": 0.092, + "step": 1840 + }, + { + "epoch": 1.4155619596541786, + "grad_norm": 0.7647611604463465, + "learning_rate": 4.443108953482255e-06, + "loss": 0.0843, + "step": 1842 + }, + { + "epoch": 1.4170989433237273, + "grad_norm": 0.7569931424177674, + "learning_rate": 4.437077341271854e-06, + "loss": 0.1, + "step": 1844 + }, + { + "epoch": 1.4186359269932756, + "grad_norm": 0.780426436469334, + "learning_rate": 4.431042729719081e-06, + "loss": 0.0922, + "step": 1846 + }, + { + "epoch": 1.4201729106628243, + "grad_norm": 0.8266309005493522, + "learning_rate": 4.425005138139165e-06, + "loss": 0.091, + "step": 1848 + }, + { + "epoch": 1.4217098943323727, + "grad_norm": 0.7919591525646804, + "learning_rate": 4.418964585856878e-06, + "loss": 0.0914, + "step": 1850 + }, + { + "epoch": 1.4232468780019212, + "grad_norm": 0.8676995223573973, + "learning_rate": 4.4129210922064655e-06, + "loss": 0.0933, + "step": 1852 + }, + { + "epoch": 1.4247838616714699, + "grad_norm": 0.8245651321649498, + "learning_rate": 4.406874676531591e-06, + "loss": 0.099, + "step": 1854 + }, + { + "epoch": 1.4263208453410183, + "grad_norm": 0.8084380870931988, + "learning_rate": 4.400825358185267e-06, + "loss": 0.094, + "step": 1856 + }, + { + "epoch": 1.4278578290105668, + "grad_norm": 0.8383642555183834, + "learning_rate": 4.394773156529796e-06, + "loss": 0.1035, + "step": 1858 + }, + { + "epoch": 1.4293948126801153, + "grad_norm": 0.8496971372491992, + "learning_rate": 4.388718090936714e-06, + "loss": 0.0961, + "step": 1860 + }, + { + "epoch": 1.4309317963496637, + "grad_norm": 0.778441978283022, + "learning_rate": 4.38266018078672e-06, + "loss": 0.0903, + "step": 1862 + }, + { + "epoch": 1.4324687800192124, + "grad_norm": 0.8399196603924103, + "learning_rate": 4.376599445469619e-06, + "loss": 0.0951, + "step": 1864 + }, + { + "epoch": 1.4340057636887609, + "grad_norm": 0.8277510344863601, + "learning_rate": 4.370535904384257e-06, + "loss": 0.0947, + "step": 1866 + }, + { + "epoch": 1.4355427473583093, + "grad_norm": 0.8546854576792136, + "learning_rate": 4.3644695769384634e-06, + "loss": 0.0978, + "step": 1868 + }, + { + "epoch": 1.4370797310278578, + "grad_norm": 0.8038219716990682, + "learning_rate": 4.358400482548984e-06, + "loss": 0.0901, + "step": 1870 + }, + { + "epoch": 1.4386167146974063, + "grad_norm": 0.8079184472772435, + "learning_rate": 4.352328640641422e-06, + "loss": 0.0885, + "step": 1872 + }, + { + "epoch": 1.440153698366955, + "grad_norm": 0.7459154763602515, + "learning_rate": 4.346254070650172e-06, + "loss": 0.0886, + "step": 1874 + }, + { + "epoch": 1.4416906820365034, + "grad_norm": 0.7909488135828993, + "learning_rate": 4.340176792018365e-06, + "loss": 0.0928, + "step": 1876 + }, + { + "epoch": 1.4432276657060519, + "grad_norm": 0.8321374668536021, + "learning_rate": 4.3340968241977975e-06, + "loss": 0.0932, + "step": 1878 + }, + { + "epoch": 1.4447646493756003, + "grad_norm": 0.7923523546601555, + "learning_rate": 4.328014186648875e-06, + "loss": 0.0867, + "step": 1880 + }, + { + "epoch": 1.4463016330451488, + "grad_norm": 0.8123704722222743, + "learning_rate": 4.321928898840549e-06, + "loss": 0.0883, + "step": 1882 + }, + { + "epoch": 1.4478386167146975, + "grad_norm": 0.8443684423046657, + "learning_rate": 4.315840980250253e-06, + "loss": 0.0941, + "step": 1884 + }, + { + "epoch": 1.449375600384246, + "grad_norm": 0.8218259476991273, + "learning_rate": 4.30975045036384e-06, + "loss": 0.0934, + "step": 1886 + }, + { + "epoch": 1.4509125840537944, + "grad_norm": 0.7796868880134117, + "learning_rate": 4.3036573286755225e-06, + "loss": 0.0904, + "step": 1888 + }, + { + "epoch": 1.4524495677233429, + "grad_norm": 0.8382801166615858, + "learning_rate": 4.297561634687809e-06, + "loss": 0.0963, + "step": 1890 + }, + { + "epoch": 1.4539865513928913, + "grad_norm": 0.7638917693205276, + "learning_rate": 4.291463387911439e-06, + "loss": 0.0846, + "step": 1892 + }, + { + "epoch": 1.45552353506244, + "grad_norm": 0.8232905374055427, + "learning_rate": 4.2853626078653255e-06, + "loss": 0.0907, + "step": 1894 + }, + { + "epoch": 1.4570605187319885, + "grad_norm": 0.8327882309952656, + "learning_rate": 4.279259314076488e-06, + "loss": 0.0924, + "step": 1896 + }, + { + "epoch": 1.458597502401537, + "grad_norm": 0.8362322094432861, + "learning_rate": 4.273153526079994e-06, + "loss": 0.0961, + "step": 1898 + }, + { + "epoch": 1.4601344860710854, + "grad_norm": 0.8371159641695654, + "learning_rate": 4.2670452634188895e-06, + "loss": 0.0857, + "step": 1900 + }, + { + "epoch": 1.4616714697406339, + "grad_norm": 0.7922677007185823, + "learning_rate": 4.260934545644148e-06, + "loss": 0.0919, + "step": 1902 + }, + { + "epoch": 1.4632084534101826, + "grad_norm": 0.7923726734910232, + "learning_rate": 4.254821392314595e-06, + "loss": 0.0903, + "step": 1904 + }, + { + "epoch": 1.464745437079731, + "grad_norm": 0.7722229174216336, + "learning_rate": 4.248705822996856e-06, + "loss": 0.0879, + "step": 1906 + }, + { + "epoch": 1.4662824207492795, + "grad_norm": 0.8218964345538021, + "learning_rate": 4.242587857265288e-06, + "loss": 0.0993, + "step": 1908 + }, + { + "epoch": 1.4678194044188282, + "grad_norm": 0.8010254286113982, + "learning_rate": 4.236467514701916e-06, + "loss": 0.0897, + "step": 1910 + }, + { + "epoch": 1.4693563880883764, + "grad_norm": 0.7744562817644429, + "learning_rate": 4.230344814896378e-06, + "loss": 0.0913, + "step": 1912 + }, + { + "epoch": 1.4708933717579251, + "grad_norm": 0.8343016463549663, + "learning_rate": 4.22421977744585e-06, + "loss": 0.0993, + "step": 1914 + }, + { + "epoch": 1.4724303554274736, + "grad_norm": 0.8688236869752541, + "learning_rate": 4.2180924219549964e-06, + "loss": 0.0997, + "step": 1916 + }, + { + "epoch": 1.473967339097022, + "grad_norm": 0.7734307337086462, + "learning_rate": 4.211962768035896e-06, + "loss": 0.0984, + "step": 1918 + }, + { + "epoch": 1.4755043227665707, + "grad_norm": 0.7679518244992084, + "learning_rate": 4.205830835307989e-06, + "loss": 0.0885, + "step": 1920 + }, + { + "epoch": 1.4770413064361192, + "grad_norm": 0.7926045518052727, + "learning_rate": 4.199696643398008e-06, + "loss": 0.0914, + "step": 1922 + }, + { + "epoch": 1.4785782901056677, + "grad_norm": 0.8283722584130274, + "learning_rate": 4.193560211939913e-06, + "loss": 0.094, + "step": 1924 + }, + { + "epoch": 1.4801152737752161, + "grad_norm": 0.7250571779590265, + "learning_rate": 4.1874215605748366e-06, + "loss": 0.079, + "step": 1926 + }, + { + "epoch": 1.4816522574447646, + "grad_norm": 0.8580770642845996, + "learning_rate": 4.181280708951015e-06, + "loss": 0.1018, + "step": 1928 + }, + { + "epoch": 1.4831892411143133, + "grad_norm": 0.8728686047506982, + "learning_rate": 4.175137676723726e-06, + "loss": 0.0964, + "step": 1930 + }, + { + "epoch": 1.4847262247838617, + "grad_norm": 0.7923177951847387, + "learning_rate": 4.168992483555231e-06, + "loss": 0.095, + "step": 1932 + }, + { + "epoch": 1.4862632084534102, + "grad_norm": 0.8120910936666785, + "learning_rate": 4.162845149114702e-06, + "loss": 0.0969, + "step": 1934 + }, + { + "epoch": 1.4878001921229587, + "grad_norm": 0.788706989115151, + "learning_rate": 4.15669569307817e-06, + "loss": 0.0885, + "step": 1936 + }, + { + "epoch": 1.4893371757925071, + "grad_norm": 0.830625147311433, + "learning_rate": 4.1505441351284526e-06, + "loss": 0.0972, + "step": 1938 + }, + { + "epoch": 1.4908741594620558, + "grad_norm": 0.8072145444813656, + "learning_rate": 4.144390494955098e-06, + "loss": 0.0887, + "step": 1940 + }, + { + "epoch": 1.4924111431316043, + "grad_norm": 0.8139524053166517, + "learning_rate": 4.138234792254317e-06, + "loss": 0.0939, + "step": 1942 + }, + { + "epoch": 1.4939481268011527, + "grad_norm": 0.7807279012822973, + "learning_rate": 4.132077046728924e-06, + "loss": 0.0804, + "step": 1944 + }, + { + "epoch": 1.4954851104707012, + "grad_norm": 0.7814387788672251, + "learning_rate": 4.125917278088269e-06, + "loss": 0.0924, + "step": 1946 + }, + { + "epoch": 1.4970220941402497, + "grad_norm": 0.884806793139695, + "learning_rate": 4.1197555060481836e-06, + "loss": 0.0977, + "step": 1948 + }, + { + "epoch": 1.4985590778097984, + "grad_norm": 0.8315758503149716, + "learning_rate": 4.1135917503309026e-06, + "loss": 0.0895, + "step": 1950 + }, + { + "epoch": 1.5000960614793468, + "grad_norm": 0.8674422252417301, + "learning_rate": 4.107426030665016e-06, + "loss": 0.0882, + "step": 1952 + }, + { + "epoch": 1.5016330451488953, + "grad_norm": 0.8449293445755316, + "learning_rate": 4.101258366785402e-06, + "loss": 0.0967, + "step": 1954 + }, + { + "epoch": 1.5031700288184437, + "grad_norm": 0.8359354740918613, + "learning_rate": 4.095088778433156e-06, + "loss": 0.0998, + "step": 1956 + }, + { + "epoch": 1.5047070124879922, + "grad_norm": 0.8262628504512934, + "learning_rate": 4.088917285355536e-06, + "loss": 0.0945, + "step": 1958 + }, + { + "epoch": 1.506243996157541, + "grad_norm": 0.7479486694048768, + "learning_rate": 4.082743907305897e-06, + "loss": 0.0943, + "step": 1960 + }, + { + "epoch": 1.5077809798270894, + "grad_norm": 0.8436782180522755, + "learning_rate": 4.076568664043625e-06, + "loss": 0.1004, + "step": 1962 + }, + { + "epoch": 1.5093179634966378, + "grad_norm": 0.7748345501977325, + "learning_rate": 4.0703915753340804e-06, + "loss": 0.0875, + "step": 1964 + }, + { + "epoch": 1.5108549471661865, + "grad_norm": 0.8382516980717638, + "learning_rate": 4.064212660948524e-06, + "loss": 0.094, + "step": 1966 + }, + { + "epoch": 1.5123919308357348, + "grad_norm": 0.7852255919020612, + "learning_rate": 4.058031940664067e-06, + "loss": 0.0912, + "step": 1968 + }, + { + "epoch": 1.5139289145052834, + "grad_norm": 0.8541412693322231, + "learning_rate": 4.051849434263595e-06, + "loss": 0.0968, + "step": 1970 + }, + { + "epoch": 1.515465898174832, + "grad_norm": 0.8000408705444431, + "learning_rate": 4.0456651615357155e-06, + "loss": 0.0874, + "step": 1972 + }, + { + "epoch": 1.5170028818443804, + "grad_norm": 0.7870384108772879, + "learning_rate": 4.039479142274686e-06, + "loss": 0.089, + "step": 1974 + }, + { + "epoch": 1.518539865513929, + "grad_norm": 0.820652722071049, + "learning_rate": 4.033291396280355e-06, + "loss": 0.0926, + "step": 1976 + }, + { + "epoch": 1.5200768491834773, + "grad_norm": 0.8154568505994646, + "learning_rate": 4.027101943358098e-06, + "loss": 0.0969, + "step": 1978 + }, + { + "epoch": 1.521613832853026, + "grad_norm": 0.8770625870074841, + "learning_rate": 4.020910803318756e-06, + "loss": 0.0993, + "step": 1980 + }, + { + "epoch": 1.5231508165225744, + "grad_norm": 0.6758733981469482, + "learning_rate": 4.014717995978565e-06, + "loss": 0.081, + "step": 1982 + }, + { + "epoch": 1.524687800192123, + "grad_norm": 0.8441728203990428, + "learning_rate": 4.008523541159104e-06, + "loss": 0.1009, + "step": 1984 + }, + { + "epoch": 1.5262247838616716, + "grad_norm": 0.8110301274459378, + "learning_rate": 4.002327458687218e-06, + "loss": 0.0872, + "step": 1986 + }, + { + "epoch": 1.5277617675312198, + "grad_norm": 0.8833546884887249, + "learning_rate": 3.996129768394969e-06, + "loss": 0.0982, + "step": 1988 + }, + { + "epoch": 1.5292987512007685, + "grad_norm": 0.8265167766685495, + "learning_rate": 3.989930490119561e-06, + "loss": 0.0967, + "step": 1990 + }, + { + "epoch": 1.530835734870317, + "grad_norm": 0.8707509834765963, + "learning_rate": 3.98372964370328e-06, + "loss": 0.0956, + "step": 1992 + }, + { + "epoch": 1.5323727185398655, + "grad_norm": 0.7763758628665975, + "learning_rate": 3.977527248993434e-06, + "loss": 0.0867, + "step": 1994 + }, + { + "epoch": 1.5339097022094141, + "grad_norm": 0.7789258378178834, + "learning_rate": 3.9713233258422855e-06, + "loss": 0.0878, + "step": 1996 + }, + { + "epoch": 1.5354466858789624, + "grad_norm": 0.7521669320980235, + "learning_rate": 3.965117894106988e-06, + "loss": 0.0888, + "step": 1998 + }, + { + "epoch": 1.536983669548511, + "grad_norm": 0.8412076854628533, + "learning_rate": 3.958910973649527e-06, + "loss": 0.0846, + "step": 2000 + }, + { + "epoch": 1.536983669548511, + "eval_loss": 0.1369212120771408, + "eval_runtime": 362.0502, + "eval_samples_per_second": 51.112, + "eval_steps_per_second": 6.391, + "step": 2000 + }, + { + "epoch": 1.5385206532180595, + "grad_norm": 0.7902219827328915, + "learning_rate": 3.952702584336648e-06, + "loss": 0.0984, + "step": 2002 + }, + { + "epoch": 1.540057636887608, + "grad_norm": 0.7801431687989913, + "learning_rate": 3.9464927460398e-06, + "loss": 0.0979, + "step": 2004 + }, + { + "epoch": 1.5415946205571567, + "grad_norm": 0.8810086083716847, + "learning_rate": 3.940281478635074e-06, + "loss": 0.0916, + "step": 2006 + }, + { + "epoch": 1.5431316042267051, + "grad_norm": 0.8496438323924863, + "learning_rate": 3.934068802003129e-06, + "loss": 0.0943, + "step": 2008 + }, + { + "epoch": 1.5446685878962536, + "grad_norm": 0.8624052138866306, + "learning_rate": 3.927854736029138e-06, + "loss": 0.0941, + "step": 2010 + }, + { + "epoch": 1.546205571565802, + "grad_norm": 0.803691628281103, + "learning_rate": 3.921639300602719e-06, + "loss": 0.0964, + "step": 2012 + }, + { + "epoch": 1.5477425552353505, + "grad_norm": 0.7838512381219084, + "learning_rate": 3.915422515617876e-06, + "loss": 0.0877, + "step": 2014 + }, + { + "epoch": 1.5492795389048992, + "grad_norm": 0.8154032044869861, + "learning_rate": 3.90920440097293e-06, + "loss": 0.0898, + "step": 2016 + }, + { + "epoch": 1.5508165225744477, + "grad_norm": 0.7774689436181877, + "learning_rate": 3.902984976570459e-06, + "loss": 0.0942, + "step": 2018 + }, + { + "epoch": 1.5523535062439962, + "grad_norm": 0.7714965765671886, + "learning_rate": 3.896764262317232e-06, + "loss": 0.0882, + "step": 2020 + }, + { + "epoch": 1.5538904899135446, + "grad_norm": 0.8482244169935028, + "learning_rate": 3.890542278124151e-06, + "loss": 0.092, + "step": 2022 + }, + { + "epoch": 1.555427473583093, + "grad_norm": 0.8089857501256293, + "learning_rate": 3.884319043906175e-06, + "loss": 0.0885, + "step": 2024 + }, + { + "epoch": 1.5569644572526418, + "grad_norm": 0.7732451008458415, + "learning_rate": 3.878094579582271e-06, + "loss": 0.0852, + "step": 2026 + }, + { + "epoch": 1.5585014409221902, + "grad_norm": 0.8485314140958313, + "learning_rate": 3.871868905075339e-06, + "loss": 0.097, + "step": 2028 + }, + { + "epoch": 1.5600384245917387, + "grad_norm": 0.7295949942508037, + "learning_rate": 3.865642040312155e-06, + "loss": 0.0765, + "step": 2030 + }, + { + "epoch": 1.5615754082612874, + "grad_norm": 0.8109480864302396, + "learning_rate": 3.859414005223303e-06, + "loss": 0.0879, + "step": 2032 + }, + { + "epoch": 1.5631123919308356, + "grad_norm": 0.8308781255627026, + "learning_rate": 3.8531848197431155e-06, + "loss": 0.0957, + "step": 2034 + }, + { + "epoch": 1.5646493756003843, + "grad_norm": 0.8436684145142693, + "learning_rate": 3.846954503809602e-06, + "loss": 0.0941, + "step": 2036 + }, + { + "epoch": 1.5661863592699328, + "grad_norm": 0.7993009937419601, + "learning_rate": 3.840723077364396e-06, + "loss": 0.0907, + "step": 2038 + }, + { + "epoch": 1.5677233429394812, + "grad_norm": 0.7917608535459787, + "learning_rate": 3.834490560352682e-06, + "loss": 0.0946, + "step": 2040 + }, + { + "epoch": 1.56926032660903, + "grad_norm": 0.775435052389592, + "learning_rate": 3.828256972723137e-06, + "loss": 0.0866, + "step": 2042 + }, + { + "epoch": 1.5707973102785782, + "grad_norm": 0.8417542619925643, + "learning_rate": 3.822022334427863e-06, + "loss": 0.0956, + "step": 2044 + }, + { + "epoch": 1.5723342939481268, + "grad_norm": 0.7534098817985344, + "learning_rate": 3.815786665422326e-06, + "loss": 0.0907, + "step": 2046 + }, + { + "epoch": 1.5738712776176753, + "grad_norm": 0.7454391500744776, + "learning_rate": 3.8095499856652907e-06, + "loss": 0.0942, + "step": 2048 + }, + { + "epoch": 1.5754082612872238, + "grad_norm": 0.8059844013874078, + "learning_rate": 3.803312315118758e-06, + "loss": 0.0833, + "step": 2050 + }, + { + "epoch": 1.5769452449567725, + "grad_norm": 0.831173234796314, + "learning_rate": 3.7970736737478976e-06, + "loss": 0.0937, + "step": 2052 + }, + { + "epoch": 1.5784822286263207, + "grad_norm": 0.8319311638041307, + "learning_rate": 3.790834081520988e-06, + "loss": 0.0941, + "step": 2054 + }, + { + "epoch": 1.5800192122958694, + "grad_norm": 0.8017768441792715, + "learning_rate": 3.7845935584093535e-06, + "loss": 0.088, + "step": 2056 + }, + { + "epoch": 1.5815561959654179, + "grad_norm": 0.8146738006411376, + "learning_rate": 3.7783521243872945e-06, + "loss": 0.0929, + "step": 2058 + }, + { + "epoch": 1.5830931796349663, + "grad_norm": 0.7862299852946094, + "learning_rate": 3.7721097994320274e-06, + "loss": 0.088, + "step": 2060 + }, + { + "epoch": 1.584630163304515, + "grad_norm": 0.7644295350251104, + "learning_rate": 3.765866603523621e-06, + "loss": 0.0971, + "step": 2062 + }, + { + "epoch": 1.5861671469740632, + "grad_norm": 0.8145450168514264, + "learning_rate": 3.7596225566449326e-06, + "loss": 0.0902, + "step": 2064 + }, + { + "epoch": 1.587704130643612, + "grad_norm": 0.8269635685870652, + "learning_rate": 3.753377678781543e-06, + "loss": 0.0923, + "step": 2066 + }, + { + "epoch": 1.5892411143131604, + "grad_norm": 0.7490940289331177, + "learning_rate": 3.7471319899216904e-06, + "loss": 0.0779, + "step": 2068 + }, + { + "epoch": 1.5907780979827089, + "grad_norm": 0.7723884968412873, + "learning_rate": 3.7408855100562114e-06, + "loss": 0.0901, + "step": 2070 + }, + { + "epoch": 1.5923150816522575, + "grad_norm": 0.8319441486290872, + "learning_rate": 3.7346382591784747e-06, + "loss": 0.0956, + "step": 2072 + }, + { + "epoch": 1.593852065321806, + "grad_norm": 0.8690168846750308, + "learning_rate": 3.728390257284314e-06, + "loss": 0.0953, + "step": 2074 + }, + { + "epoch": 1.5953890489913545, + "grad_norm": 0.7909069572145911, + "learning_rate": 3.722141524371969e-06, + "loss": 0.086, + "step": 2076 + }, + { + "epoch": 1.596926032660903, + "grad_norm": 0.8062074972797826, + "learning_rate": 3.7158920804420203e-06, + "loss": 0.0892, + "step": 2078 + }, + { + "epoch": 1.5984630163304514, + "grad_norm": 0.7611492040694079, + "learning_rate": 3.7096419454973193e-06, + "loss": 0.0969, + "step": 2080 + }, + { + "epoch": 1.6, + "grad_norm": 0.8268228564890713, + "learning_rate": 3.703391139542937e-06, + "loss": 0.0927, + "step": 2082 + }, + { + "epoch": 1.6015369836695486, + "grad_norm": 0.7570539959602375, + "learning_rate": 3.6971396825860836e-06, + "loss": 0.0833, + "step": 2084 + }, + { + "epoch": 1.603073967339097, + "grad_norm": 0.797065245492781, + "learning_rate": 3.6908875946360597e-06, + "loss": 0.0895, + "step": 2086 + }, + { + "epoch": 1.6046109510086455, + "grad_norm": 0.8082111005522078, + "learning_rate": 3.68463489570418e-06, + "loss": 0.0891, + "step": 2088 + }, + { + "epoch": 1.606147934678194, + "grad_norm": 0.8233331752704219, + "learning_rate": 3.6783816058037215e-06, + "loss": 0.0948, + "step": 2090 + }, + { + "epoch": 1.6076849183477426, + "grad_norm": 0.8349017726953393, + "learning_rate": 3.672127744949847e-06, + "loss": 0.0908, + "step": 2092 + }, + { + "epoch": 1.609221902017291, + "grad_norm": 0.7751955893816385, + "learning_rate": 3.6658733331595493e-06, + "loss": 0.0878, + "step": 2094 + }, + { + "epoch": 1.6107588856868396, + "grad_norm": 0.806749618964119, + "learning_rate": 3.6596183904515817e-06, + "loss": 0.0908, + "step": 2096 + }, + { + "epoch": 1.6122958693563882, + "grad_norm": 0.798020323683331, + "learning_rate": 3.6533629368464026e-06, + "loss": 0.085, + "step": 2098 + }, + { + "epoch": 1.6138328530259365, + "grad_norm": 0.8845913837287194, + "learning_rate": 3.6471069923661e-06, + "loss": 0.0923, + "step": 2100 + }, + { + "epoch": 1.6153698366954852, + "grad_norm": 0.7975937926027286, + "learning_rate": 3.6408505770343365e-06, + "loss": 0.0945, + "step": 2102 + }, + { + "epoch": 1.6169068203650336, + "grad_norm": 0.7977338905828745, + "learning_rate": 3.6345937108762803e-06, + "loss": 0.0886, + "step": 2104 + }, + { + "epoch": 1.618443804034582, + "grad_norm": 0.887493292191394, + "learning_rate": 3.628336413918541e-06, + "loss": 0.0909, + "step": 2106 + }, + { + "epoch": 1.6199807877041308, + "grad_norm": 0.7939745654491456, + "learning_rate": 3.622078706189112e-06, + "loss": 0.0929, + "step": 2108 + }, + { + "epoch": 1.621517771373679, + "grad_norm": 0.8183822731345303, + "learning_rate": 3.615820607717296e-06, + "loss": 0.0858, + "step": 2110 + }, + { + "epoch": 1.6230547550432277, + "grad_norm": 0.7807975084192331, + "learning_rate": 3.6095621385336505e-06, + "loss": 0.0957, + "step": 2112 + }, + { + "epoch": 1.6245917387127762, + "grad_norm": 0.8063009489440389, + "learning_rate": 3.6033033186699152e-06, + "loss": 0.0919, + "step": 2114 + }, + { + "epoch": 1.6261287223823246, + "grad_norm": 0.8112570440921344, + "learning_rate": 3.597044168158958e-06, + "loss": 0.0933, + "step": 2116 + }, + { + "epoch": 1.6276657060518733, + "grad_norm": 0.8331881631759025, + "learning_rate": 3.5907847070346997e-06, + "loss": 0.0927, + "step": 2118 + }, + { + "epoch": 1.6292026897214216, + "grad_norm": 0.7415764943928016, + "learning_rate": 3.5845249553320595e-06, + "loss": 0.0889, + "step": 2120 + }, + { + "epoch": 1.6307396733909703, + "grad_norm": 0.8003668361045585, + "learning_rate": 3.5782649330868817e-06, + "loss": 0.0971, + "step": 2122 + }, + { + "epoch": 1.6322766570605187, + "grad_norm": 0.7911658768866721, + "learning_rate": 3.5720046603358823e-06, + "loss": 0.0878, + "step": 2124 + }, + { + "epoch": 1.6338136407300672, + "grad_norm": 0.8045681908855453, + "learning_rate": 3.5657441571165754e-06, + "loss": 0.0874, + "step": 2126 + }, + { + "epoch": 1.6353506243996159, + "grad_norm": 0.893368127193528, + "learning_rate": 3.5594834434672142e-06, + "loss": 0.0904, + "step": 2128 + }, + { + "epoch": 1.6368876080691641, + "grad_norm": 0.7865118382233219, + "learning_rate": 3.553222539426724e-06, + "loss": 0.0841, + "step": 2130 + }, + { + "epoch": 1.6384245917387128, + "grad_norm": 0.7830147718405267, + "learning_rate": 3.546961465034641e-06, + "loss": 0.0936, + "step": 2132 + }, + { + "epoch": 1.6399615754082613, + "grad_norm": 0.8530754022949045, + "learning_rate": 3.5407002403310453e-06, + "loss": 0.094, + "step": 2134 + }, + { + "epoch": 1.6414985590778097, + "grad_norm": 0.7937086579061391, + "learning_rate": 3.5344388853565013e-06, + "loss": 0.0919, + "step": 2136 + }, + { + "epoch": 1.6430355427473584, + "grad_norm": 0.8490157093915195, + "learning_rate": 3.528177420151984e-06, + "loss": 0.0978, + "step": 2138 + }, + { + "epoch": 1.6445725264169067, + "grad_norm": 0.8109475329632754, + "learning_rate": 3.521915864758829e-06, + "loss": 0.0891, + "step": 2140 + }, + { + "epoch": 1.6461095100864553, + "grad_norm": 0.7936184951568639, + "learning_rate": 3.5156542392186554e-06, + "loss": 0.0916, + "step": 2142 + }, + { + "epoch": 1.6476464937560038, + "grad_norm": 0.916031078014482, + "learning_rate": 3.5093925635733084e-06, + "loss": 0.1031, + "step": 2144 + }, + { + "epoch": 1.6491834774255523, + "grad_norm": 0.7265572887246778, + "learning_rate": 3.503130857864792e-06, + "loss": 0.0872, + "step": 2146 + }, + { + "epoch": 1.650720461095101, + "grad_norm": 0.7277873762559617, + "learning_rate": 3.496869142135209e-06, + "loss": 0.086, + "step": 2148 + }, + { + "epoch": 1.6522574447646494, + "grad_norm": 0.7780905988859983, + "learning_rate": 3.4906074364266932e-06, + "loss": 0.0887, + "step": 2150 + }, + { + "epoch": 1.6537944284341979, + "grad_norm": 0.7713751406958493, + "learning_rate": 3.4843457607813445e-06, + "loss": 0.0871, + "step": 2152 + }, + { + "epoch": 1.6553314121037463, + "grad_norm": 0.8650115956133952, + "learning_rate": 3.478084135241171e-06, + "loss": 0.0948, + "step": 2154 + }, + { + "epoch": 1.6568683957732948, + "grad_norm": 0.8279315849380673, + "learning_rate": 3.4718225798480157e-06, + "loss": 0.0931, + "step": 2156 + }, + { + "epoch": 1.6584053794428435, + "grad_norm": 0.8057279660058387, + "learning_rate": 3.4655611146435003e-06, + "loss": 0.0906, + "step": 2158 + }, + { + "epoch": 1.659942363112392, + "grad_norm": 0.8002672116393091, + "learning_rate": 3.459299759668954e-06, + "loss": 0.0889, + "step": 2160 + }, + { + "epoch": 1.6614793467819404, + "grad_norm": 0.7562188109651065, + "learning_rate": 3.4530385349653597e-06, + "loss": 0.0846, + "step": 2162 + }, + { + "epoch": 1.663016330451489, + "grad_norm": 0.9071744451213971, + "learning_rate": 3.4467774605732763e-06, + "loss": 0.0975, + "step": 2164 + }, + { + "epoch": 1.6645533141210374, + "grad_norm": 0.7996156080383284, + "learning_rate": 3.440516556532787e-06, + "loss": 0.0924, + "step": 2166 + }, + { + "epoch": 1.666090297790586, + "grad_norm": 0.7950473833323731, + "learning_rate": 3.434255842883424e-06, + "loss": 0.0879, + "step": 2168 + }, + { + "epoch": 1.6676272814601345, + "grad_norm": 0.7899806697862958, + "learning_rate": 3.427995339664118e-06, + "loss": 0.0937, + "step": 2170 + }, + { + "epoch": 1.669164265129683, + "grad_norm": 0.7336053866465901, + "learning_rate": 3.421735066913118e-06, + "loss": 0.0835, + "step": 2172 + }, + { + "epoch": 1.6707012487992317, + "grad_norm": 0.7728149688439113, + "learning_rate": 3.415475044667942e-06, + "loss": 0.0927, + "step": 2174 + }, + { + "epoch": 1.67223823246878, + "grad_norm": 0.8305527642036483, + "learning_rate": 3.4092152929653e-06, + "loss": 0.0969, + "step": 2176 + }, + { + "epoch": 1.6737752161383286, + "grad_norm": 0.773715213471355, + "learning_rate": 3.4029558318410426e-06, + "loss": 0.0937, + "step": 2178 + }, + { + "epoch": 1.675312199807877, + "grad_norm": 0.8059092352832676, + "learning_rate": 3.396696681330085e-06, + "loss": 0.0863, + "step": 2180 + }, + { + "epoch": 1.6768491834774255, + "grad_norm": 0.803662636009022, + "learning_rate": 3.3904378614663507e-06, + "loss": 0.084, + "step": 2182 + }, + { + "epoch": 1.6783861671469742, + "grad_norm": 0.837705389217021, + "learning_rate": 3.3841793922827035e-06, + "loss": 0.0882, + "step": 2184 + }, + { + "epoch": 1.6799231508165224, + "grad_norm": 0.7922764494195353, + "learning_rate": 3.3779212938108883e-06, + "loss": 0.0827, + "step": 2186 + }, + { + "epoch": 1.6814601344860711, + "grad_norm": 0.7860318255345747, + "learning_rate": 3.3716635860814593e-06, + "loss": 0.0876, + "step": 2188 + }, + { + "epoch": 1.6829971181556196, + "grad_norm": 0.8066504756055669, + "learning_rate": 3.365406289123721e-06, + "loss": 0.0938, + "step": 2190 + }, + { + "epoch": 1.684534101825168, + "grad_norm": 0.7921416532988432, + "learning_rate": 3.3591494229656634e-06, + "loss": 0.0934, + "step": 2192 + }, + { + "epoch": 1.6860710854947167, + "grad_norm": 0.745529905487878, + "learning_rate": 3.3528930076339002e-06, + "loss": 0.0813, + "step": 2194 + }, + { + "epoch": 1.687608069164265, + "grad_norm": 0.832234842686226, + "learning_rate": 3.346637063153598e-06, + "loss": 0.09, + "step": 2196 + }, + { + "epoch": 1.6891450528338137, + "grad_norm": 0.787715663482219, + "learning_rate": 3.3403816095484177e-06, + "loss": 0.0883, + "step": 2198 + }, + { + "epoch": 1.6906820365033621, + "grad_norm": 0.8626434473168217, + "learning_rate": 3.3341266668404514e-06, + "loss": 0.1017, + "step": 2200 + }, + { + "epoch": 1.6922190201729106, + "grad_norm": 0.804548679573353, + "learning_rate": 3.3278722550501534e-06, + "loss": 0.0894, + "step": 2202 + }, + { + "epoch": 1.6937560038424593, + "grad_norm": 0.7757347097137762, + "learning_rate": 3.3216183941962793e-06, + "loss": 0.0841, + "step": 2204 + }, + { + "epoch": 1.6952929875120075, + "grad_norm": 0.7833610959261708, + "learning_rate": 3.3153651042958196e-06, + "loss": 0.096, + "step": 2206 + }, + { + "epoch": 1.6968299711815562, + "grad_norm": 0.7848923969699126, + "learning_rate": 3.309112405363941e-06, + "loss": 0.0952, + "step": 2208 + }, + { + "epoch": 1.6983669548511047, + "grad_norm": 0.8052464321733062, + "learning_rate": 3.302860317413917e-06, + "loss": 0.0915, + "step": 2210 + }, + { + "epoch": 1.6999039385206531, + "grad_norm": 0.8512154630818453, + "learning_rate": 3.2966088604570648e-06, + "loss": 0.0876, + "step": 2212 + }, + { + "epoch": 1.7014409221902018, + "grad_norm": 0.7790903398816384, + "learning_rate": 3.2903580545026797e-06, + "loss": 0.087, + "step": 2214 + }, + { + "epoch": 1.7029779058597503, + "grad_norm": 0.8376240131132313, + "learning_rate": 3.28410791955798e-06, + "loss": 0.0967, + "step": 2216 + }, + { + "epoch": 1.7045148895292987, + "grad_norm": 0.7967627035735608, + "learning_rate": 3.2778584756280307e-06, + "loss": 0.0915, + "step": 2218 + }, + { + "epoch": 1.7060518731988472, + "grad_norm": 0.7702278491400715, + "learning_rate": 3.271609742715687e-06, + "loss": 0.0862, + "step": 2220 + }, + { + "epoch": 1.7075888568683957, + "grad_norm": 0.7544840615892947, + "learning_rate": 3.265361740821525e-06, + "loss": 0.0833, + "step": 2222 + }, + { + "epoch": 1.7091258405379444, + "grad_norm": 0.7621768245734015, + "learning_rate": 3.2591144899437885e-06, + "loss": 0.0871, + "step": 2224 + }, + { + "epoch": 1.7106628242074928, + "grad_norm": 0.8831931871572729, + "learning_rate": 3.25286801007831e-06, + "loss": 0.0882, + "step": 2226 + }, + { + "epoch": 1.7121998078770413, + "grad_norm": 0.8197620745142645, + "learning_rate": 3.246622321218458e-06, + "loss": 0.0928, + "step": 2228 + }, + { + "epoch": 1.71373679154659, + "grad_norm": 0.77604756798234, + "learning_rate": 3.2403774433550673e-06, + "loss": 0.0832, + "step": 2230 + }, + { + "epoch": 1.7152737752161382, + "grad_norm": 0.8842522339003976, + "learning_rate": 3.2341333964763795e-06, + "loss": 0.0988, + "step": 2232 + }, + { + "epoch": 1.716810758885687, + "grad_norm": 0.7768312377818398, + "learning_rate": 3.2278902005679734e-06, + "loss": 0.0882, + "step": 2234 + }, + { + "epoch": 1.7183477425552354, + "grad_norm": 0.8327531068216764, + "learning_rate": 3.2216478756127067e-06, + "loss": 0.085, + "step": 2236 + }, + { + "epoch": 1.7198847262247838, + "grad_norm": 0.8186258517055159, + "learning_rate": 3.215406441590646e-06, + "loss": 0.0885, + "step": 2238 + }, + { + "epoch": 1.7214217098943325, + "grad_norm": 0.6874479376398842, + "learning_rate": 3.209165918479012e-06, + "loss": 0.0781, + "step": 2240 + }, + { + "epoch": 1.7229586935638808, + "grad_norm": 0.8115322816676634, + "learning_rate": 3.202926326252103e-06, + "loss": 0.0969, + "step": 2242 + }, + { + "epoch": 1.7244956772334294, + "grad_norm": 0.8146826671168621, + "learning_rate": 3.1966876848812434e-06, + "loss": 0.089, + "step": 2244 + }, + { + "epoch": 1.726032660902978, + "grad_norm": 0.8248049130832095, + "learning_rate": 3.1904500143347092e-06, + "loss": 0.0882, + "step": 2246 + }, + { + "epoch": 1.7275696445725264, + "grad_norm": 0.8015905899655976, + "learning_rate": 3.184213334577675e-06, + "loss": 0.0914, + "step": 2248 + }, + { + "epoch": 1.729106628242075, + "grad_norm": 0.8160931505603131, + "learning_rate": 3.1779776655721374e-06, + "loss": 0.0898, + "step": 2250 + }, + { + "epoch": 1.7306436119116233, + "grad_norm": 0.8170663856878996, + "learning_rate": 3.1717430272768637e-06, + "loss": 0.0925, + "step": 2252 + }, + { + "epoch": 1.732180595581172, + "grad_norm": 0.8051970712819015, + "learning_rate": 3.1655094396473175e-06, + "loss": 0.0929, + "step": 2254 + }, + { + "epoch": 1.7337175792507205, + "grad_norm": 0.8967076051389512, + "learning_rate": 3.1592769226356045e-06, + "loss": 0.0931, + "step": 2256 + }, + { + "epoch": 1.735254562920269, + "grad_norm": 0.8217397870462823, + "learning_rate": 3.153045496190398e-06, + "loss": 0.0885, + "step": 2258 + }, + { + "epoch": 1.7367915465898176, + "grad_norm": 0.7584555916208618, + "learning_rate": 3.1468151802568857e-06, + "loss": 0.0868, + "step": 2260 + }, + { + "epoch": 1.7383285302593658, + "grad_norm": 0.8377345355524379, + "learning_rate": 3.1405859947766965e-06, + "loss": 0.084, + "step": 2262 + }, + { + "epoch": 1.7398655139289145, + "grad_norm": 0.7974250660258231, + "learning_rate": 3.1343579596878455e-06, + "loss": 0.0896, + "step": 2264 + }, + { + "epoch": 1.741402497598463, + "grad_norm": 0.8423863324362034, + "learning_rate": 3.128131094924661e-06, + "loss": 0.0914, + "step": 2266 + }, + { + "epoch": 1.7429394812680115, + "grad_norm": 0.7998587118371564, + "learning_rate": 3.12190542041773e-06, + "loss": 0.0877, + "step": 2268 + }, + { + "epoch": 1.7444764649375601, + "grad_norm": 0.8420892166338229, + "learning_rate": 3.1156809560938246e-06, + "loss": 0.0892, + "step": 2270 + }, + { + "epoch": 1.7460134486071084, + "grad_norm": 0.7811169390028642, + "learning_rate": 3.1094577218758497e-06, + "loss": 0.0826, + "step": 2272 + }, + { + "epoch": 1.747550432276657, + "grad_norm": 0.856105762521616, + "learning_rate": 3.103235737682768e-06, + "loss": 0.0925, + "step": 2274 + }, + { + "epoch": 1.7490874159462055, + "grad_norm": 0.8434878009386567, + "learning_rate": 3.0970150234295416e-06, + "loss": 0.0895, + "step": 2276 + }, + { + "epoch": 1.750624399615754, + "grad_norm": 0.7912672991129802, + "learning_rate": 3.09079559902707e-06, + "loss": 0.0844, + "step": 2278 + }, + { + "epoch": 1.7521613832853027, + "grad_norm": 0.8093805440566595, + "learning_rate": 3.0845774843821242e-06, + "loss": 0.0857, + "step": 2280 + }, + { + "epoch": 1.7536983669548512, + "grad_norm": 0.7841327968506241, + "learning_rate": 3.0783606993972816e-06, + "loss": 0.0824, + "step": 2282 + }, + { + "epoch": 1.7552353506243996, + "grad_norm": 0.8784822733296287, + "learning_rate": 3.072145263970863e-06, + "loss": 0.0896, + "step": 2284 + }, + { + "epoch": 1.756772334293948, + "grad_norm": 0.785311512287812, + "learning_rate": 3.0659311979968707e-06, + "loss": 0.0787, + "step": 2286 + }, + { + "epoch": 1.7583093179634965, + "grad_norm": 0.8489973269201797, + "learning_rate": 3.059718521364926e-06, + "loss": 0.0965, + "step": 2288 + }, + { + "epoch": 1.7598463016330452, + "grad_norm": 0.8687972454705519, + "learning_rate": 3.0535072539602004e-06, + "loss": 0.0875, + "step": 2290 + }, + { + "epoch": 1.7613832853025937, + "grad_norm": 0.8170562404470127, + "learning_rate": 3.0472974156633535e-06, + "loss": 0.0947, + "step": 2292 + }, + { + "epoch": 1.7629202689721422, + "grad_norm": 0.7542361214197597, + "learning_rate": 3.0410890263504736e-06, + "loss": 0.0838, + "step": 2294 + }, + { + "epoch": 1.7644572526416908, + "grad_norm": 0.8228131888155036, + "learning_rate": 3.0348821058930117e-06, + "loss": 0.0956, + "step": 2296 + }, + { + "epoch": 1.765994236311239, + "grad_norm": 0.8281839285188529, + "learning_rate": 3.0286766741577156e-06, + "loss": 0.0897, + "step": 2298 + }, + { + "epoch": 1.7675312199807878, + "grad_norm": 0.7587083990715107, + "learning_rate": 3.022472751006566e-06, + "loss": 0.0805, + "step": 2300 + }, + { + "epoch": 1.7690682036503362, + "grad_norm": 0.8251420202843136, + "learning_rate": 3.0162703562967197e-06, + "loss": 0.0845, + "step": 2302 + }, + { + "epoch": 1.7706051873198847, + "grad_norm": 0.8065903172521137, + "learning_rate": 3.01006950988044e-06, + "loss": 0.0891, + "step": 2304 + }, + { + "epoch": 1.7721421709894334, + "grad_norm": 0.8124974824000051, + "learning_rate": 3.0038702316050317e-06, + "loss": 0.0896, + "step": 2306 + }, + { + "epoch": 1.7736791546589816, + "grad_norm": 0.7529764285672882, + "learning_rate": 2.997672541312782e-06, + "loss": 0.0776, + "step": 2308 + }, + { + "epoch": 1.7752161383285303, + "grad_norm": 0.8091489227633613, + "learning_rate": 2.9914764588408966e-06, + "loss": 0.0922, + "step": 2310 + }, + { + "epoch": 1.7767531219980788, + "grad_norm": 0.8938294981287168, + "learning_rate": 2.985282004021435e-06, + "loss": 0.0924, + "step": 2312 + }, + { + "epoch": 1.7782901056676272, + "grad_norm": 0.9013544600990553, + "learning_rate": 2.979089196681245e-06, + "loss": 0.091, + "step": 2314 + }, + { + "epoch": 1.779827089337176, + "grad_norm": 0.8597516613442064, + "learning_rate": 2.9728980566419016e-06, + "loss": 0.0847, + "step": 2316 + }, + { + "epoch": 1.7813640730067242, + "grad_norm": 0.8090979220931821, + "learning_rate": 2.966708603719645e-06, + "loss": 0.091, + "step": 2318 + }, + { + "epoch": 1.7829010566762729, + "grad_norm": 0.7916631388831536, + "learning_rate": 2.960520857725314e-06, + "loss": 0.0867, + "step": 2320 + }, + { + "epoch": 1.7844380403458213, + "grad_norm": 0.7616856425912224, + "learning_rate": 2.954334838464285e-06, + "loss": 0.0892, + "step": 2322 + }, + { + "epoch": 1.7859750240153698, + "grad_norm": 0.7512880043816689, + "learning_rate": 2.948150565736404e-06, + "loss": 0.0825, + "step": 2324 + }, + { + "epoch": 1.7875120076849185, + "grad_norm": 0.7903862314768638, + "learning_rate": 2.9419680593359335e-06, + "loss": 0.0815, + "step": 2326 + }, + { + "epoch": 1.7890489913544667, + "grad_norm": 0.826491887413533, + "learning_rate": 2.9357873390514757e-06, + "loss": 0.0868, + "step": 2328 + }, + { + "epoch": 1.7905859750240154, + "grad_norm": 0.8078642556595113, + "learning_rate": 2.929608424665921e-06, + "loss": 0.0945, + "step": 2330 + }, + { + "epoch": 1.7921229586935639, + "grad_norm": 0.8097155781369342, + "learning_rate": 2.9234313359563744e-06, + "loss": 0.0938, + "step": 2332 + }, + { + "epoch": 1.7936599423631123, + "grad_norm": 0.8150186954176805, + "learning_rate": 2.9172560926941037e-06, + "loss": 0.0828, + "step": 2334 + }, + { + "epoch": 1.795196926032661, + "grad_norm": 0.7813243625319553, + "learning_rate": 2.9110827146444643e-06, + "loss": 0.0858, + "step": 2336 + }, + { + "epoch": 1.7967339097022093, + "grad_norm": 0.7791024554992239, + "learning_rate": 2.904911221566845e-06, + "loss": 0.0844, + "step": 2338 + }, + { + "epoch": 1.798270893371758, + "grad_norm": 0.8124678993462557, + "learning_rate": 2.898741633214598e-06, + "loss": 0.0893, + "step": 2340 + }, + { + "epoch": 1.7998078770413064, + "grad_norm": 0.829294783575689, + "learning_rate": 2.8925739693349833e-06, + "loss": 0.0861, + "step": 2342 + }, + { + "epoch": 1.8013448607108549, + "grad_norm": 0.7957217580914165, + "learning_rate": 2.886408249669098e-06, + "loss": 0.0836, + "step": 2344 + }, + { + "epoch": 1.8028818443804036, + "grad_norm": 0.8631788141169373, + "learning_rate": 2.880244493951818e-06, + "loss": 0.1018, + "step": 2346 + }, + { + "epoch": 1.804418828049952, + "grad_norm": 0.6989805954411332, + "learning_rate": 2.8740827219117302e-06, + "loss": 0.0827, + "step": 2348 + }, + { + "epoch": 1.8059558117195005, + "grad_norm": 0.8472076650897804, + "learning_rate": 2.867922953271077e-06, + "loss": 0.0912, + "step": 2350 + }, + { + "epoch": 1.807492795389049, + "grad_norm": 0.8663805858998409, + "learning_rate": 2.861765207745683e-06, + "loss": 0.094, + "step": 2352 + }, + { + "epoch": 1.8090297790585974, + "grad_norm": 0.7756793455854972, + "learning_rate": 2.8556095050449032e-06, + "loss": 0.086, + "step": 2354 + }, + { + "epoch": 1.810566762728146, + "grad_norm": 0.8038188721900218, + "learning_rate": 2.8494558648715473e-06, + "loss": 0.0895, + "step": 2356 + }, + { + "epoch": 1.8121037463976946, + "grad_norm": 0.7788550764950732, + "learning_rate": 2.8433043069218307e-06, + "loss": 0.0912, + "step": 2358 + }, + { + "epoch": 1.813640730067243, + "grad_norm": 0.8331357355337144, + "learning_rate": 2.8371548508852977e-06, + "loss": 0.0982, + "step": 2360 + }, + { + "epoch": 1.8151777137367917, + "grad_norm": 0.8118854152234461, + "learning_rate": 2.8310075164447696e-06, + "loss": 0.0925, + "step": 2362 + }, + { + "epoch": 1.81671469740634, + "grad_norm": 0.8017256660340538, + "learning_rate": 2.824862323276273e-06, + "loss": 0.0855, + "step": 2364 + }, + { + "epoch": 1.8182516810758886, + "grad_norm": 0.7800752551508296, + "learning_rate": 2.8187192910489856e-06, + "loss": 0.0834, + "step": 2366 + }, + { + "epoch": 1.819788664745437, + "grad_norm": 0.8380624718020749, + "learning_rate": 2.812578439425164e-06, + "loss": 0.0945, + "step": 2368 + }, + { + "epoch": 1.8213256484149856, + "grad_norm": 0.8276945914691816, + "learning_rate": 2.806439788060088e-06, + "loss": 0.0889, + "step": 2370 + }, + { + "epoch": 1.8228626320845343, + "grad_norm": 0.7985257731425042, + "learning_rate": 2.8003033566019922e-06, + "loss": 0.0841, + "step": 2372 + }, + { + "epoch": 1.8243996157540825, + "grad_norm": 0.7637963888810898, + "learning_rate": 2.7941691646920105e-06, + "loss": 0.0751, + "step": 2374 + }, + { + "epoch": 1.8259365994236312, + "grad_norm": 0.8413312901210165, + "learning_rate": 2.7880372319641042e-06, + "loss": 0.089, + "step": 2376 + }, + { + "epoch": 1.8274735830931796, + "grad_norm": 0.8586650363460536, + "learning_rate": 2.781907578045005e-06, + "loss": 0.0887, + "step": 2378 + }, + { + "epoch": 1.829010566762728, + "grad_norm": 0.8360563017968606, + "learning_rate": 2.77578022255415e-06, + "loss": 0.0939, + "step": 2380 + }, + { + "epoch": 1.8305475504322768, + "grad_norm": 0.877204652341484, + "learning_rate": 2.769655185103623e-06, + "loss": 0.0893, + "step": 2382 + }, + { + "epoch": 1.832084534101825, + "grad_norm": 0.8297837842829701, + "learning_rate": 2.7635324852980843e-06, + "loss": 0.0907, + "step": 2384 + }, + { + "epoch": 1.8336215177713737, + "grad_norm": 0.8431709281215559, + "learning_rate": 2.7574121427347133e-06, + "loss": 0.0982, + "step": 2386 + }, + { + "epoch": 1.8351585014409222, + "grad_norm": 0.7604258311920689, + "learning_rate": 2.751294177003143e-06, + "loss": 0.0789, + "step": 2388 + }, + { + "epoch": 1.8366954851104706, + "grad_norm": 0.7739652641897983, + "learning_rate": 2.745178607685405e-06, + "loss": 0.0928, + "step": 2390 + }, + { + "epoch": 1.8382324687800193, + "grad_norm": 0.8167833653726546, + "learning_rate": 2.7390654543558534e-06, + "loss": 0.0842, + "step": 2392 + }, + { + "epoch": 1.8397694524495676, + "grad_norm": 0.7908210162723227, + "learning_rate": 2.7329547365811104e-06, + "loss": 0.0843, + "step": 2394 + }, + { + "epoch": 1.8413064361191163, + "grad_norm": 0.8201467727248781, + "learning_rate": 2.726846473920006e-06, + "loss": 0.0919, + "step": 2396 + }, + { + "epoch": 1.8428434197886647, + "grad_norm": 0.8484929622858332, + "learning_rate": 2.7207406859235117e-06, + "loss": 0.0817, + "step": 2398 + }, + { + "epoch": 1.8443804034582132, + "grad_norm": 0.7641741013611666, + "learning_rate": 2.714637392134675e-06, + "loss": 0.0875, + "step": 2400 + }, + { + "epoch": 1.8459173871277619, + "grad_norm": 0.8726969374519596, + "learning_rate": 2.708536612088561e-06, + "loss": 0.0883, + "step": 2402 + }, + { + "epoch": 1.8474543707973101, + "grad_norm": 0.7568052808653818, + "learning_rate": 2.702438365312191e-06, + "loss": 0.0847, + "step": 2404 + }, + { + "epoch": 1.8489913544668588, + "grad_norm": 0.774661999809938, + "learning_rate": 2.696342671324478e-06, + "loss": 0.0912, + "step": 2406 + }, + { + "epoch": 1.8505283381364073, + "grad_norm": 0.8750231519779115, + "learning_rate": 2.6902495496361613e-06, + "loss": 0.094, + "step": 2408 + }, + { + "epoch": 1.8520653218059557, + "grad_norm": 0.7904137960913173, + "learning_rate": 2.6841590197497476e-06, + "loss": 0.0858, + "step": 2410 + }, + { + "epoch": 1.8536023054755044, + "grad_norm": 0.7711313526212195, + "learning_rate": 2.678071101159451e-06, + "loss": 0.0825, + "step": 2412 + }, + { + "epoch": 1.8551392891450529, + "grad_norm": 0.8221962336381063, + "learning_rate": 2.6719858133511257e-06, + "loss": 0.0906, + "step": 2414 + }, + { + "epoch": 1.8566762728146013, + "grad_norm": 0.8337437421475409, + "learning_rate": 2.665903175802204e-06, + "loss": 0.0868, + "step": 2416 + }, + { + "epoch": 1.8582132564841498, + "grad_norm": 0.8378451337853939, + "learning_rate": 2.6598232079816353e-06, + "loss": 0.0933, + "step": 2418 + }, + { + "epoch": 1.8597502401536983, + "grad_norm": 0.7639730128714044, + "learning_rate": 2.6537459293498277e-06, + "loss": 0.0846, + "step": 2420 + }, + { + "epoch": 1.861287223823247, + "grad_norm": 0.7830022284803336, + "learning_rate": 2.6476713593585783e-06, + "loss": 0.0863, + "step": 2422 + }, + { + "epoch": 1.8628242074927954, + "grad_norm": 0.816812769548626, + "learning_rate": 2.641599517451016e-06, + "loss": 0.0937, + "step": 2424 + }, + { + "epoch": 1.864361191162344, + "grad_norm": 0.8742317390360426, + "learning_rate": 2.6355304230615356e-06, + "loss": 0.0975, + "step": 2426 + }, + { + "epoch": 1.8658981748318926, + "grad_norm": 0.8026788777916881, + "learning_rate": 2.629464095615743e-06, + "loss": 0.0848, + "step": 2428 + }, + { + "epoch": 1.8674351585014408, + "grad_norm": 0.8437565172328019, + "learning_rate": 2.623400554530382e-06, + "loss": 0.0947, + "step": 2430 + }, + { + "epoch": 1.8689721421709895, + "grad_norm": 0.801208290733419, + "learning_rate": 2.6173398192132812e-06, + "loss": 0.0899, + "step": 2432 + }, + { + "epoch": 1.870509125840538, + "grad_norm": 0.7932315598508702, + "learning_rate": 2.6112819090632854e-06, + "loss": 0.0895, + "step": 2434 + }, + { + "epoch": 1.8720461095100864, + "grad_norm": 0.9042900524043032, + "learning_rate": 2.6052268434702042e-06, + "loss": 0.0946, + "step": 2436 + }, + { + "epoch": 1.8735830931796351, + "grad_norm": 0.7894581411758911, + "learning_rate": 2.599174641814734e-06, + "loss": 0.0867, + "step": 2438 + }, + { + "epoch": 1.8751200768491834, + "grad_norm": 0.7854020543576818, + "learning_rate": 2.59312532346841e-06, + "loss": 0.0911, + "step": 2440 + }, + { + "epoch": 1.876657060518732, + "grad_norm": 0.815693549418664, + "learning_rate": 2.5870789077935335e-06, + "loss": 0.0895, + "step": 2442 + }, + { + "epoch": 1.8781940441882805, + "grad_norm": 0.7887109470073577, + "learning_rate": 2.5810354141431226e-06, + "loss": 0.0903, + "step": 2444 + }, + { + "epoch": 1.879731027857829, + "grad_norm": 0.7896777569740607, + "learning_rate": 2.5749948618608356e-06, + "loss": 0.0809, + "step": 2446 + }, + { + "epoch": 1.8812680115273777, + "grad_norm": 0.7615729562820233, + "learning_rate": 2.5689572702809203e-06, + "loss": 0.0838, + "step": 2448 + }, + { + "epoch": 1.882804995196926, + "grad_norm": 0.8582942386637096, + "learning_rate": 2.562922658728145e-06, + "loss": 0.09, + "step": 2450 + }, + { + "epoch": 1.8843419788664746, + "grad_norm": 0.7611707316427125, + "learning_rate": 2.556891046517745e-06, + "loss": 0.0775, + "step": 2452 + }, + { + "epoch": 1.885878962536023, + "grad_norm": 0.7455166273658402, + "learning_rate": 2.5508624529553496e-06, + "loss": 0.0866, + "step": 2454 + }, + { + "epoch": 1.8874159462055715, + "grad_norm": 0.8572197727943554, + "learning_rate": 2.5448368973369295e-06, + "loss": 0.0867, + "step": 2456 + }, + { + "epoch": 1.8889529298751202, + "grad_norm": 0.8079812248707947, + "learning_rate": 2.5388143989487295e-06, + "loss": 0.09, + "step": 2458 + }, + { + "epoch": 1.8904899135446684, + "grad_norm": 0.8079920947645508, + "learning_rate": 2.5327949770672125e-06, + "loss": 0.0872, + "step": 2460 + }, + { + "epoch": 1.8920268972142171, + "grad_norm": 0.8329864584858913, + "learning_rate": 2.5267786509589907e-06, + "loss": 0.0892, + "step": 2462 + }, + { + "epoch": 1.8935638808837656, + "grad_norm": 0.8044295715601575, + "learning_rate": 2.52076543988077e-06, + "loss": 0.0874, + "step": 2464 + }, + { + "epoch": 1.895100864553314, + "grad_norm": 0.7566977212719446, + "learning_rate": 2.5147553630792827e-06, + "loss": 0.079, + "step": 2466 + }, + { + "epoch": 1.8966378482228627, + "grad_norm": 0.7703458660050836, + "learning_rate": 2.5087484397912354e-06, + "loss": 0.0803, + "step": 2468 + }, + { + "epoch": 1.898174831892411, + "grad_norm": 0.7472342418402756, + "learning_rate": 2.5027446892432335e-06, + "loss": 0.0831, + "step": 2470 + }, + { + "epoch": 1.8997118155619597, + "grad_norm": 0.7860659793588969, + "learning_rate": 2.496744130651731e-06, + "loss": 0.0861, + "step": 2472 + }, + { + "epoch": 1.9012487992315081, + "grad_norm": 0.7782192620932303, + "learning_rate": 2.4907467832229655e-06, + "loss": 0.0904, + "step": 2474 + }, + { + "epoch": 1.9027857829010566, + "grad_norm": 0.7976874165660952, + "learning_rate": 2.4847526661528974e-06, + "loss": 0.0868, + "step": 2476 + }, + { + "epoch": 1.9043227665706053, + "grad_norm": 0.8130144869116065, + "learning_rate": 2.4787617986271457e-06, + "loss": 0.0887, + "step": 2478 + }, + { + "epoch": 1.9058597502401537, + "grad_norm": 0.8306432379209792, + "learning_rate": 2.4727741998209278e-06, + "loss": 0.0817, + "step": 2480 + }, + { + "epoch": 1.9073967339097022, + "grad_norm": 0.7830733463300852, + "learning_rate": 2.4667898888989997e-06, + "loss": 0.0894, + "step": 2482 + }, + { + "epoch": 1.9089337175792507, + "grad_norm": 0.7962828362447372, + "learning_rate": 2.460808885015596e-06, + "loss": 0.0833, + "step": 2484 + }, + { + "epoch": 1.9104707012487991, + "grad_norm": 0.84538791014193, + "learning_rate": 2.454831207314364e-06, + "loss": 0.0898, + "step": 2486 + }, + { + "epoch": 1.9120076849183478, + "grad_norm": 0.7980052835225272, + "learning_rate": 2.4488568749283024e-06, + "loss": 0.0867, + "step": 2488 + }, + { + "epoch": 1.9135446685878963, + "grad_norm": 0.7823748772923179, + "learning_rate": 2.4428859069797065e-06, + "loss": 0.086, + "step": 2490 + }, + { + "epoch": 1.9150816522574448, + "grad_norm": 0.8497213682534336, + "learning_rate": 2.4369183225801037e-06, + "loss": 0.0876, + "step": 2492 + }, + { + "epoch": 1.9166186359269934, + "grad_norm": 0.729672453173719, + "learning_rate": 2.430954140830187e-06, + "loss": 0.08, + "step": 2494 + }, + { + "epoch": 1.9181556195965417, + "grad_norm": 0.8535703260306475, + "learning_rate": 2.424993380819759e-06, + "loss": 0.0809, + "step": 2496 + }, + { + "epoch": 1.9196926032660904, + "grad_norm": 0.8459889716819726, + "learning_rate": 2.419036061627676e-06, + "loss": 0.0898, + "step": 2498 + }, + { + "epoch": 1.9212295869356388, + "grad_norm": 0.8483259441228239, + "learning_rate": 2.4130822023217745e-06, + "loss": 0.0841, + "step": 2500 + }, + { + "epoch": 1.9212295869356388, + "eval_loss": 0.13174882531166077, + "eval_runtime": 362.8867, + "eval_samples_per_second": 50.994, + "eval_steps_per_second": 6.377, + "step": 2500 + }, + { + "epoch": 1.9227665706051873, + "grad_norm": 0.7537082873290889, + "learning_rate": 2.40713182195882e-06, + "loss": 0.0835, + "step": 2502 + }, + { + "epoch": 1.924303554274736, + "grad_norm": 0.8025926008593399, + "learning_rate": 2.401184939584441e-06, + "loss": 0.091, + "step": 2504 + }, + { + "epoch": 1.9258405379442842, + "grad_norm": 0.8587481263834186, + "learning_rate": 2.3952415742330715e-06, + "loss": 0.0817, + "step": 2506 + }, + { + "epoch": 1.927377521613833, + "grad_norm": 0.8496889861192923, + "learning_rate": 2.38930174492789e-06, + "loss": 0.0871, + "step": 2508 + }, + { + "epoch": 1.9289145052833814, + "grad_norm": 0.7449830563434601, + "learning_rate": 2.383365470680753e-06, + "loss": 0.0889, + "step": 2510 + }, + { + "epoch": 1.9304514889529298, + "grad_norm": 0.7842488010652338, + "learning_rate": 2.377432770492138e-06, + "loss": 0.0896, + "step": 2512 + }, + { + "epoch": 1.9319884726224785, + "grad_norm": 0.7604948684079603, + "learning_rate": 2.3715036633510887e-06, + "loss": 0.0768, + "step": 2514 + }, + { + "epoch": 1.9335254562920268, + "grad_norm": 0.8622740265461863, + "learning_rate": 2.365578168235143e-06, + "loss": 0.0884, + "step": 2516 + }, + { + "epoch": 1.9350624399615755, + "grad_norm": 0.8075457215159016, + "learning_rate": 2.3596563041102794e-06, + "loss": 0.0859, + "step": 2518 + }, + { + "epoch": 1.936599423631124, + "grad_norm": 0.7735312713920628, + "learning_rate": 2.3537380899308532e-06, + "loss": 0.0888, + "step": 2520 + }, + { + "epoch": 1.9381364073006724, + "grad_norm": 0.7250205794783319, + "learning_rate": 2.347823544639541e-06, + "loss": 0.0812, + "step": 2522 + }, + { + "epoch": 1.939673390970221, + "grad_norm": 0.8168723067376802, + "learning_rate": 2.3419126871672716e-06, + "loss": 0.085, + "step": 2524 + }, + { + "epoch": 1.9412103746397693, + "grad_norm": 0.7558226843999474, + "learning_rate": 2.3360055364331726e-06, + "loss": 0.0842, + "step": 2526 + }, + { + "epoch": 1.942747358309318, + "grad_norm": 0.8367673308249525, + "learning_rate": 2.3301021113445057e-06, + "loss": 0.0859, + "step": 2528 + }, + { + "epoch": 1.9442843419788665, + "grad_norm": 0.771933781721656, + "learning_rate": 2.3242024307966115e-06, + "loss": 0.0853, + "step": 2530 + }, + { + "epoch": 1.945821325648415, + "grad_norm": 0.8022281457204921, + "learning_rate": 2.3183065136728395e-06, + "loss": 0.0829, + "step": 2532 + }, + { + "epoch": 1.9473583093179636, + "grad_norm": 0.8770111872133347, + "learning_rate": 2.3124143788444994e-06, + "loss": 0.0871, + "step": 2534 + }, + { + "epoch": 1.9488952929875119, + "grad_norm": 0.7289175750768725, + "learning_rate": 2.3065260451707887e-06, + "loss": 0.0765, + "step": 2536 + }, + { + "epoch": 1.9504322766570605, + "grad_norm": 0.8573432100144109, + "learning_rate": 2.3006415314987453e-06, + "loss": 0.0926, + "step": 2538 + }, + { + "epoch": 1.951969260326609, + "grad_norm": 0.8049758874747357, + "learning_rate": 2.2947608566631738e-06, + "loss": 0.09, + "step": 2540 + }, + { + "epoch": 1.9535062439961575, + "grad_norm": 0.8070974588287757, + "learning_rate": 2.288884039486595e-06, + "loss": 0.0805, + "step": 2542 + }, + { + "epoch": 1.9550432276657062, + "grad_norm": 0.7988980263528441, + "learning_rate": 2.2830110987791816e-06, + "loss": 0.0868, + "step": 2544 + }, + { + "epoch": 1.9565802113352546, + "grad_norm": 0.8018076390389216, + "learning_rate": 2.277142053338701e-06, + "loss": 0.0833, + "step": 2546 + }, + { + "epoch": 1.958117195004803, + "grad_norm": 0.8761877037291468, + "learning_rate": 2.271276921950448e-06, + "loss": 0.0889, + "step": 2548 + }, + { + "epoch": 1.9596541786743515, + "grad_norm": 0.8222253983712655, + "learning_rate": 2.265415723387194e-06, + "loss": 0.0884, + "step": 2550 + }, + { + "epoch": 1.9611911623439, + "grad_norm": 0.8067488111444437, + "learning_rate": 2.259558476409119e-06, + "loss": 0.0878, + "step": 2552 + }, + { + "epoch": 1.9627281460134487, + "grad_norm": 0.7772228994300671, + "learning_rate": 2.253705199763759e-06, + "loss": 0.0827, + "step": 2554 + }, + { + "epoch": 1.9642651296829972, + "grad_norm": 0.8171211068617626, + "learning_rate": 2.247855912185938e-06, + "loss": 0.0853, + "step": 2556 + }, + { + "epoch": 1.9658021133525456, + "grad_norm": 0.7146445332349611, + "learning_rate": 2.242010632397715e-06, + "loss": 0.0782, + "step": 2558 + }, + { + "epoch": 1.967339097022094, + "grad_norm": 0.7891447556881424, + "learning_rate": 2.2361693791083176e-06, + "loss": 0.0887, + "step": 2560 + }, + { + "epoch": 1.9688760806916425, + "grad_norm": 0.7846780714035666, + "learning_rate": 2.230332171014091e-06, + "loss": 0.0851, + "step": 2562 + }, + { + "epoch": 1.9704130643611912, + "grad_norm": 0.8098130826581111, + "learning_rate": 2.2244990267984265e-06, + "loss": 0.0915, + "step": 2564 + }, + { + "epoch": 1.9719500480307397, + "grad_norm": 0.777976273941637, + "learning_rate": 2.2186699651317143e-06, + "loss": 0.0802, + "step": 2566 + }, + { + "epoch": 1.9734870317002882, + "grad_norm": 0.8355719530974074, + "learning_rate": 2.2128450046712702e-06, + "loss": 0.0904, + "step": 2568 + }, + { + "epoch": 1.9750240153698368, + "grad_norm": 0.7798614757356649, + "learning_rate": 2.2070241640612915e-06, + "loss": 0.0838, + "step": 2570 + }, + { + "epoch": 1.976560999039385, + "grad_norm": 0.8328604297726137, + "learning_rate": 2.2012074619327824e-06, + "loss": 0.0865, + "step": 2572 + }, + { + "epoch": 1.9780979827089338, + "grad_norm": 0.793696845619225, + "learning_rate": 2.195394916903502e-06, + "loss": 0.0782, + "step": 2574 + }, + { + "epoch": 1.9796349663784822, + "grad_norm": 0.7815626290849819, + "learning_rate": 2.1895865475779054e-06, + "loss": 0.0788, + "step": 2576 + }, + { + "epoch": 1.9811719500480307, + "grad_norm": 0.8439158338940291, + "learning_rate": 2.1837823725470835e-06, + "loss": 0.0889, + "step": 2578 + }, + { + "epoch": 1.9827089337175794, + "grad_norm": 0.8288860856763497, + "learning_rate": 2.177982410388699e-06, + "loss": 0.0884, + "step": 2580 + }, + { + "epoch": 1.9842459173871276, + "grad_norm": 0.8187063512218921, + "learning_rate": 2.1721866796669302e-06, + "loss": 0.088, + "step": 2582 + }, + { + "epoch": 1.9857829010566763, + "grad_norm": 0.7887200768254294, + "learning_rate": 2.166395198932414e-06, + "loss": 0.0836, + "step": 2584 + }, + { + "epoch": 1.9873198847262248, + "grad_norm": 0.8058422750958129, + "learning_rate": 2.160607986722186e-06, + "loss": 0.0872, + "step": 2586 + }, + { + "epoch": 1.9888568683957732, + "grad_norm": 0.8186786966538023, + "learning_rate": 2.154825061559614e-06, + "loss": 0.0921, + "step": 2588 + }, + { + "epoch": 1.990393852065322, + "grad_norm": 0.8648916595771557, + "learning_rate": 2.149046441954347e-06, + "loss": 0.0859, + "step": 2590 + }, + { + "epoch": 1.9919308357348702, + "grad_norm": 0.8612159243706975, + "learning_rate": 2.1432721464022532e-06, + "loss": 0.0997, + "step": 2592 + }, + { + "epoch": 1.9934678194044189, + "grad_norm": 0.8214954020731858, + "learning_rate": 2.137502193385361e-06, + "loss": 0.0875, + "step": 2594 + }, + { + "epoch": 1.9950048030739673, + "grad_norm": 0.7979371383049858, + "learning_rate": 2.1317366013717983e-06, + "loss": 0.0843, + "step": 2596 + }, + { + "epoch": 1.9965417867435158, + "grad_norm": 0.7620709623218802, + "learning_rate": 2.125975388815733e-06, + "loss": 0.0793, + "step": 2598 + }, + { + "epoch": 1.9980787704130645, + "grad_norm": 0.7554666873391905, + "learning_rate": 2.1202185741573206e-06, + "loss": 0.0871, + "step": 2600 + }, + { + "epoch": 1.9996157540826127, + "grad_norm": 0.7689027561104417, + "learning_rate": 2.1144661758226355e-06, + "loss": 0.0787, + "step": 2602 + }, + { + "epoch": 2.0011527377521614, + "grad_norm": 0.6010513132525606, + "learning_rate": 2.108718212223618e-06, + "loss": 0.0835, + "step": 2604 + }, + { + "epoch": 2.00268972142171, + "grad_norm": 0.5735836702570751, + "learning_rate": 2.1029747017580132e-06, + "loss": 0.0485, + "step": 2606 + }, + { + "epoch": 2.0042267050912583, + "grad_norm": 0.5702538041925809, + "learning_rate": 2.0972356628093154e-06, + "loss": 0.0452, + "step": 2608 + }, + { + "epoch": 2.005763688760807, + "grad_norm": 0.5895447128781794, + "learning_rate": 2.091501113746703e-06, + "loss": 0.0446, + "step": 2610 + }, + { + "epoch": 2.0073006724303553, + "grad_norm": 0.6069626452449407, + "learning_rate": 2.085771072924988e-06, + "loss": 0.0439, + "step": 2612 + }, + { + "epoch": 2.008837656099904, + "grad_norm": 0.6625893476183761, + "learning_rate": 2.080045558684546e-06, + "loss": 0.0426, + "step": 2614 + }, + { + "epoch": 2.0103746397694526, + "grad_norm": 0.7034714400173309, + "learning_rate": 2.0743245893512725e-06, + "loss": 0.0433, + "step": 2616 + }, + { + "epoch": 2.011911623439001, + "grad_norm": 0.7707738025199298, + "learning_rate": 2.0686081832365095e-06, + "loss": 0.045, + "step": 2618 + }, + { + "epoch": 2.0134486071085496, + "grad_norm": 0.793812018682344, + "learning_rate": 2.0628963586369966e-06, + "loss": 0.043, + "step": 2620 + }, + { + "epoch": 2.014985590778098, + "grad_norm": 0.7298066829996939, + "learning_rate": 2.057189133834806e-06, + "loss": 0.0413, + "step": 2622 + }, + { + "epoch": 2.0165225744476465, + "grad_norm": 0.7317715663050453, + "learning_rate": 2.051486527097292e-06, + "loss": 0.0413, + "step": 2624 + }, + { + "epoch": 2.018059558117195, + "grad_norm": 0.7006778779832138, + "learning_rate": 2.045788556677023e-06, + "loss": 0.0425, + "step": 2626 + }, + { + "epoch": 2.0195965417867434, + "grad_norm": 0.747566077495069, + "learning_rate": 2.04009524081173e-06, + "loss": 0.0416, + "step": 2628 + }, + { + "epoch": 2.021133525456292, + "grad_norm": 0.6905039170767364, + "learning_rate": 2.034406597724246e-06, + "loss": 0.0366, + "step": 2630 + }, + { + "epoch": 2.0226705091258403, + "grad_norm": 0.6513726296876595, + "learning_rate": 2.0287226456224464e-06, + "loss": 0.0417, + "step": 2632 + }, + { + "epoch": 2.024207492795389, + "grad_norm": 0.6710660115404629, + "learning_rate": 2.0230434026991936e-06, + "loss": 0.0439, + "step": 2634 + }, + { + "epoch": 2.0257444764649377, + "grad_norm": 0.6687092945349834, + "learning_rate": 2.0173688871322763e-06, + "loss": 0.0396, + "step": 2636 + }, + { + "epoch": 2.027281460134486, + "grad_norm": 0.7735624904374961, + "learning_rate": 2.011699117084352e-06, + "loss": 0.0431, + "step": 2638 + }, + { + "epoch": 2.0288184438040346, + "grad_norm": 0.7166923451943271, + "learning_rate": 2.0060341107028893e-06, + "loss": 0.0439, + "step": 2640 + }, + { + "epoch": 2.030355427473583, + "grad_norm": 0.7077961035403634, + "learning_rate": 2.0003738861201104e-06, + "loss": 0.0499, + "step": 2642 + }, + { + "epoch": 2.0318924111431316, + "grad_norm": 0.6883244536280981, + "learning_rate": 1.994718461452929e-06, + "loss": 0.0443, + "step": 2644 + }, + { + "epoch": 2.0334293948126803, + "grad_norm": 0.6214200682417671, + "learning_rate": 1.9890678548028994e-06, + "loss": 0.0425, + "step": 2646 + }, + { + "epoch": 2.0349663784822285, + "grad_norm": 0.6365455258734565, + "learning_rate": 1.9834220842561525e-06, + "loss": 0.0438, + "step": 2648 + }, + { + "epoch": 2.036503362151777, + "grad_norm": 0.6181452869288234, + "learning_rate": 1.9777811678833405e-06, + "loss": 0.0363, + "step": 2650 + }, + { + "epoch": 2.038040345821326, + "grad_norm": 0.6491919777612177, + "learning_rate": 1.972145123739581e-06, + "loss": 0.0397, + "step": 2652 + }, + { + "epoch": 2.039577329490874, + "grad_norm": 0.6585608787162763, + "learning_rate": 1.9665139698643894e-06, + "loss": 0.0368, + "step": 2654 + }, + { + "epoch": 2.041114313160423, + "grad_norm": 0.7727352776818418, + "learning_rate": 1.960887724281639e-06, + "loss": 0.0454, + "step": 2656 + }, + { + "epoch": 2.042651296829971, + "grad_norm": 0.6357364480821968, + "learning_rate": 1.955266404999487e-06, + "loss": 0.0415, + "step": 2658 + }, + { + "epoch": 2.0441882804995197, + "grad_norm": 0.6281847150403672, + "learning_rate": 1.9496500300103206e-06, + "loss": 0.0371, + "step": 2660 + }, + { + "epoch": 2.0457252641690684, + "grad_norm": 0.6912027377771374, + "learning_rate": 1.944038617290707e-06, + "loss": 0.04, + "step": 2662 + }, + { + "epoch": 2.0472622478386167, + "grad_norm": 0.7127284309719673, + "learning_rate": 1.938432184801327e-06, + "loss": 0.0444, + "step": 2664 + }, + { + "epoch": 2.0487992315081653, + "grad_norm": 0.6753527235959302, + "learning_rate": 1.9328307504869223e-06, + "loss": 0.0406, + "step": 2666 + }, + { + "epoch": 2.0503362151777136, + "grad_norm": 0.6882383050291725, + "learning_rate": 1.9272343322762377e-06, + "loss": 0.0393, + "step": 2668 + }, + { + "epoch": 2.0518731988472623, + "grad_norm": 0.7047552202339743, + "learning_rate": 1.9216429480819575e-06, + "loss": 0.0438, + "step": 2670 + }, + { + "epoch": 2.053410182516811, + "grad_norm": 0.7596140785601401, + "learning_rate": 1.9160566158006613e-06, + "loss": 0.0414, + "step": 2672 + }, + { + "epoch": 2.054947166186359, + "grad_norm": 0.6985050704664985, + "learning_rate": 1.9104753533127555e-06, + "loss": 0.0397, + "step": 2674 + }, + { + "epoch": 2.056484149855908, + "grad_norm": 0.6562480639793371, + "learning_rate": 1.9048991784824146e-06, + "loss": 0.0401, + "step": 2676 + }, + { + "epoch": 2.058021133525456, + "grad_norm": 0.69494658065462, + "learning_rate": 1.8993281091575362e-06, + "loss": 0.0404, + "step": 2678 + }, + { + "epoch": 2.059558117195005, + "grad_norm": 0.7358984218115102, + "learning_rate": 1.8937621631696722e-06, + "loss": 0.0422, + "step": 2680 + }, + { + "epoch": 2.0610951008645535, + "grad_norm": 0.6513459605049308, + "learning_rate": 1.8882013583339773e-06, + "loss": 0.0401, + "step": 2682 + }, + { + "epoch": 2.0626320845341017, + "grad_norm": 0.6939024794365927, + "learning_rate": 1.8826457124491504e-06, + "loss": 0.0445, + "step": 2684 + }, + { + "epoch": 2.0641690682036504, + "grad_norm": 0.6668447837064994, + "learning_rate": 1.8770952432973784e-06, + "loss": 0.0392, + "step": 2686 + }, + { + "epoch": 2.0657060518731987, + "grad_norm": 0.6666480416435122, + "learning_rate": 1.87154996864428e-06, + "loss": 0.0409, + "step": 2688 + }, + { + "epoch": 2.0672430355427474, + "grad_norm": 0.6709721661234623, + "learning_rate": 1.8660099062388431e-06, + "loss": 0.0405, + "step": 2690 + }, + { + "epoch": 2.068780019212296, + "grad_norm": 0.685700314026161, + "learning_rate": 1.8604750738133756e-06, + "loss": 0.0397, + "step": 2692 + }, + { + "epoch": 2.0703170028818443, + "grad_norm": 0.6515559887232024, + "learning_rate": 1.8549454890834497e-06, + "loss": 0.0384, + "step": 2694 + }, + { + "epoch": 2.071853986551393, + "grad_norm": 0.709788654208775, + "learning_rate": 1.8494211697478323e-06, + "loss": 0.0415, + "step": 2696 + }, + { + "epoch": 2.073390970220941, + "grad_norm": 0.6775450512874873, + "learning_rate": 1.8439021334884444e-06, + "loss": 0.0386, + "step": 2698 + }, + { + "epoch": 2.07492795389049, + "grad_norm": 0.7750933505013274, + "learning_rate": 1.8383883979702934e-06, + "loss": 0.0426, + "step": 2700 + }, + { + "epoch": 2.0764649375600386, + "grad_norm": 0.6584818113536601, + "learning_rate": 1.8328799808414227e-06, + "loss": 0.038, + "step": 2702 + }, + { + "epoch": 2.078001921229587, + "grad_norm": 0.7829082201798115, + "learning_rate": 1.8273768997328525e-06, + "loss": 0.04, + "step": 2704 + }, + { + "epoch": 2.0795389048991355, + "grad_norm": 0.734651522047483, + "learning_rate": 1.8218791722585205e-06, + "loss": 0.0415, + "step": 2706 + }, + { + "epoch": 2.0810758885686838, + "grad_norm": 0.6479085477298625, + "learning_rate": 1.8163868160152308e-06, + "loss": 0.0372, + "step": 2708 + }, + { + "epoch": 2.0826128722382324, + "grad_norm": 0.7161779560377441, + "learning_rate": 1.8108998485826017e-06, + "loss": 0.0439, + "step": 2710 + }, + { + "epoch": 2.084149855907781, + "grad_norm": 0.6464214115435033, + "learning_rate": 1.8054182875229925e-06, + "loss": 0.0401, + "step": 2712 + }, + { + "epoch": 2.0856868395773294, + "grad_norm": 0.694251707574077, + "learning_rate": 1.799942150381465e-06, + "loss": 0.0387, + "step": 2714 + }, + { + "epoch": 2.087223823246878, + "grad_norm": 0.7296255965243119, + "learning_rate": 1.7944714546857195e-06, + "loss": 0.0428, + "step": 2716 + }, + { + "epoch": 2.0887608069164267, + "grad_norm": 0.693023285089645, + "learning_rate": 1.7890062179460383e-06, + "loss": 0.0425, + "step": 2718 + }, + { + "epoch": 2.090297790585975, + "grad_norm": 0.8091932276327041, + "learning_rate": 1.7835464576552334e-06, + "loss": 0.0442, + "step": 2720 + }, + { + "epoch": 2.0918347742555237, + "grad_norm": 0.7647763579617219, + "learning_rate": 1.7780921912885828e-06, + "loss": 0.0423, + "step": 2722 + }, + { + "epoch": 2.093371757925072, + "grad_norm": 0.7252365506572918, + "learning_rate": 1.7726434363037843e-06, + "loss": 0.0401, + "step": 2724 + }, + { + "epoch": 2.0949087415946206, + "grad_norm": 0.7457710289722953, + "learning_rate": 1.7672002101408983e-06, + "loss": 0.0441, + "step": 2726 + }, + { + "epoch": 2.0964457252641693, + "grad_norm": 0.8268423722666511, + "learning_rate": 1.761762530222281e-06, + "loss": 0.0428, + "step": 2728 + }, + { + "epoch": 2.0979827089337175, + "grad_norm": 0.7059199502684411, + "learning_rate": 1.756330413952541e-06, + "loss": 0.0415, + "step": 2730 + }, + { + "epoch": 2.099519692603266, + "grad_norm": 0.6799281357494354, + "learning_rate": 1.7509038787184795e-06, + "loss": 0.0433, + "step": 2732 + }, + { + "epoch": 2.1010566762728144, + "grad_norm": 0.6560319432868118, + "learning_rate": 1.7454829418890321e-06, + "loss": 0.0404, + "step": 2734 + }, + { + "epoch": 2.102593659942363, + "grad_norm": 0.6785113040948152, + "learning_rate": 1.7400676208152185e-06, + "loss": 0.0387, + "step": 2736 + }, + { + "epoch": 2.104130643611912, + "grad_norm": 0.6843492876225117, + "learning_rate": 1.7346579328300795e-06, + "loss": 0.0437, + "step": 2738 + }, + { + "epoch": 2.10566762728146, + "grad_norm": 0.6887111820251511, + "learning_rate": 1.7292538952486288e-06, + "loss": 0.0384, + "step": 2740 + }, + { + "epoch": 2.1072046109510087, + "grad_norm": 0.7567201231202126, + "learning_rate": 1.7238555253677945e-06, + "loss": 0.0438, + "step": 2742 + }, + { + "epoch": 2.108741594620557, + "grad_norm": 0.6548000735118099, + "learning_rate": 1.7184628404663628e-06, + "loss": 0.0392, + "step": 2744 + }, + { + "epoch": 2.1102785782901057, + "grad_norm": 0.6916814777807151, + "learning_rate": 1.713075857804926e-06, + "loss": 0.039, + "step": 2746 + }, + { + "epoch": 2.1118155619596544, + "grad_norm": 0.675062961353931, + "learning_rate": 1.7076945946258195e-06, + "loss": 0.0409, + "step": 2748 + }, + { + "epoch": 2.1133525456292026, + "grad_norm": 0.7099794758199118, + "learning_rate": 1.702319068153079e-06, + "loss": 0.0422, + "step": 2750 + }, + { + "epoch": 2.1148895292987513, + "grad_norm": 0.8255111866724547, + "learning_rate": 1.6969492955923765e-06, + "loss": 0.0467, + "step": 2752 + }, + { + "epoch": 2.1164265129682995, + "grad_norm": 0.7089306864322071, + "learning_rate": 1.6915852941309628e-06, + "loss": 0.0372, + "step": 2754 + }, + { + "epoch": 2.117963496637848, + "grad_norm": 0.7195267991677364, + "learning_rate": 1.6862270809376217e-06, + "loss": 0.0441, + "step": 2756 + }, + { + "epoch": 2.119500480307397, + "grad_norm": 0.7600552648629298, + "learning_rate": 1.6808746731626085e-06, + "loss": 0.0498, + "step": 2758 + }, + { + "epoch": 2.121037463976945, + "grad_norm": 0.6719618311917168, + "learning_rate": 1.6755280879375975e-06, + "loss": 0.0365, + "step": 2760 + }, + { + "epoch": 2.122574447646494, + "grad_norm": 0.6725979801550628, + "learning_rate": 1.6701873423756275e-06, + "loss": 0.038, + "step": 2762 + }, + { + "epoch": 2.124111431316042, + "grad_norm": 0.7133205530228818, + "learning_rate": 1.6648524535710401e-06, + "loss": 0.041, + "step": 2764 + }, + { + "epoch": 2.1256484149855908, + "grad_norm": 0.6458222389869777, + "learning_rate": 1.6595234385994398e-06, + "loss": 0.0433, + "step": 2766 + }, + { + "epoch": 2.1271853986551394, + "grad_norm": 0.7422996801701388, + "learning_rate": 1.6542003145176265e-06, + "loss": 0.0425, + "step": 2768 + }, + { + "epoch": 2.1287223823246877, + "grad_norm": 0.7076408311621847, + "learning_rate": 1.648883098363542e-06, + "loss": 0.0416, + "step": 2770 + }, + { + "epoch": 2.1302593659942364, + "grad_norm": 0.7798719878560143, + "learning_rate": 1.6435718071562212e-06, + "loss": 0.0495, + "step": 2772 + }, + { + "epoch": 2.1317963496637846, + "grad_norm": 0.7240342459534611, + "learning_rate": 1.6382664578957359e-06, + "loss": 0.0392, + "step": 2774 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.7052969420907925, + "learning_rate": 1.6329670675631369e-06, + "loss": 0.0419, + "step": 2776 + }, + { + "epoch": 2.134870317002882, + "grad_norm": 0.7301031702593861, + "learning_rate": 1.6276736531204044e-06, + "loss": 0.0386, + "step": 2778 + }, + { + "epoch": 2.1364073006724302, + "grad_norm": 0.6875032643695771, + "learning_rate": 1.6223862315103865e-06, + "loss": 0.0393, + "step": 2780 + }, + { + "epoch": 2.137944284341979, + "grad_norm": 0.6775172137731008, + "learning_rate": 1.617104819656758e-06, + "loss": 0.0399, + "step": 2782 + }, + { + "epoch": 2.1394812680115276, + "grad_norm": 0.7408298536972419, + "learning_rate": 1.6118294344639496e-06, + "loss": 0.0431, + "step": 2784 + }, + { + "epoch": 2.141018251681076, + "grad_norm": 0.7531648570461281, + "learning_rate": 1.6065600928171054e-06, + "loss": 0.0401, + "step": 2786 + }, + { + "epoch": 2.1425552353506245, + "grad_norm": 0.6778919207003176, + "learning_rate": 1.6012968115820306e-06, + "loss": 0.0399, + "step": 2788 + }, + { + "epoch": 2.1440922190201728, + "grad_norm": 0.7039227096859256, + "learning_rate": 1.5960396076051233e-06, + "loss": 0.0402, + "step": 2790 + }, + { + "epoch": 2.1456292026897215, + "grad_norm": 0.6789632610059323, + "learning_rate": 1.5907884977133366e-06, + "loss": 0.0373, + "step": 2792 + }, + { + "epoch": 2.1471661863592697, + "grad_norm": 0.6884435472545695, + "learning_rate": 1.5855434987141163e-06, + "loss": 0.0447, + "step": 2794 + }, + { + "epoch": 2.1487031700288184, + "grad_norm": 0.6920660658427317, + "learning_rate": 1.580304627395348e-06, + "loss": 0.0454, + "step": 2796 + }, + { + "epoch": 2.150240153698367, + "grad_norm": 0.7745737611033345, + "learning_rate": 1.5750719005253062e-06, + "loss": 0.0413, + "step": 2798 + }, + { + "epoch": 2.1517771373679153, + "grad_norm": 0.6907792630115484, + "learning_rate": 1.569845334852595e-06, + "loss": 0.041, + "step": 2800 + }, + { + "epoch": 2.153314121037464, + "grad_norm": 0.6604941051087728, + "learning_rate": 1.5646249471060995e-06, + "loss": 0.0431, + "step": 2802 + }, + { + "epoch": 2.1548511047070127, + "grad_norm": 0.7080559060460522, + "learning_rate": 1.5594107539949365e-06, + "loss": 0.0421, + "step": 2804 + }, + { + "epoch": 2.156388088376561, + "grad_norm": 0.7149705649090393, + "learning_rate": 1.5542027722083853e-06, + "loss": 0.0388, + "step": 2806 + }, + { + "epoch": 2.1579250720461096, + "grad_norm": 0.7146661785693218, + "learning_rate": 1.5490010184158523e-06, + "loss": 0.0398, + "step": 2808 + }, + { + "epoch": 2.159462055715658, + "grad_norm": 0.7240758501470305, + "learning_rate": 1.543805509266806e-06, + "loss": 0.045, + "step": 2810 + }, + { + "epoch": 2.1609990393852065, + "grad_norm": 0.7001839005699572, + "learning_rate": 1.5386162613907287e-06, + "loss": 0.0377, + "step": 2812 + }, + { + "epoch": 2.1625360230547552, + "grad_norm": 0.7094421429704845, + "learning_rate": 1.5334332913970623e-06, + "loss": 0.0425, + "step": 2814 + }, + { + "epoch": 2.1640730067243035, + "grad_norm": 0.6847460522981713, + "learning_rate": 1.5282566158751524e-06, + "loss": 0.0402, + "step": 2816 + }, + { + "epoch": 2.165609990393852, + "grad_norm": 0.7120209696945222, + "learning_rate": 1.5230862513941995e-06, + "loss": 0.0399, + "step": 2818 + }, + { + "epoch": 2.1671469740634004, + "grad_norm": 0.6907346621733982, + "learning_rate": 1.5179222145032073e-06, + "loss": 0.0447, + "step": 2820 + }, + { + "epoch": 2.168683957732949, + "grad_norm": 0.7773144554098375, + "learning_rate": 1.5127645217309192e-06, + "loss": 0.0434, + "step": 2822 + }, + { + "epoch": 2.1702209414024978, + "grad_norm": 0.7117848076597437, + "learning_rate": 1.5076131895857784e-06, + "loss": 0.0423, + "step": 2824 + }, + { + "epoch": 2.171757925072046, + "grad_norm": 0.7027544209261267, + "learning_rate": 1.5024682345558675e-06, + "loss": 0.0401, + "step": 2826 + }, + { + "epoch": 2.1732949087415947, + "grad_norm": 0.7033554856080803, + "learning_rate": 1.4973296731088581e-06, + "loss": 0.0388, + "step": 2828 + }, + { + "epoch": 2.174831892411143, + "grad_norm": 0.7715815363157628, + "learning_rate": 1.4921975216919582e-06, + "loss": 0.04, + "step": 2830 + }, + { + "epoch": 2.1763688760806916, + "grad_norm": 0.7862111914567127, + "learning_rate": 1.4870717967318554e-06, + "loss": 0.0392, + "step": 2832 + }, + { + "epoch": 2.1779058597502403, + "grad_norm": 0.7221877404149011, + "learning_rate": 1.4819525146346723e-06, + "loss": 0.0461, + "step": 2834 + }, + { + "epoch": 2.1794428434197886, + "grad_norm": 0.6911027142163597, + "learning_rate": 1.4768396917859073e-06, + "loss": 0.0382, + "step": 2836 + }, + { + "epoch": 2.1809798270893372, + "grad_norm": 0.7119349195163982, + "learning_rate": 1.4717333445503851e-06, + "loss": 0.0384, + "step": 2838 + }, + { + "epoch": 2.1825168107588855, + "grad_norm": 0.7025660651802623, + "learning_rate": 1.4666334892722047e-06, + "loss": 0.0393, + "step": 2840 + }, + { + "epoch": 2.184053794428434, + "grad_norm": 0.7083054615445171, + "learning_rate": 1.4615401422746807e-06, + "loss": 0.0411, + "step": 2842 + }, + { + "epoch": 2.185590778097983, + "grad_norm": 0.7283963127887443, + "learning_rate": 1.4564533198603048e-06, + "loss": 0.0417, + "step": 2844 + }, + { + "epoch": 2.187127761767531, + "grad_norm": 0.6929199237523709, + "learning_rate": 1.45137303831068e-06, + "loss": 0.0378, + "step": 2846 + }, + { + "epoch": 2.18866474543708, + "grad_norm": 0.7003853343345394, + "learning_rate": 1.4462993138864725e-06, + "loss": 0.0389, + "step": 2848 + }, + { + "epoch": 2.1902017291066285, + "grad_norm": 0.6983653676006323, + "learning_rate": 1.441232162827364e-06, + "loss": 0.039, + "step": 2850 + }, + { + "epoch": 2.1917387127761767, + "grad_norm": 0.7159975682093909, + "learning_rate": 1.4361716013519952e-06, + "loss": 0.0409, + "step": 2852 + }, + { + "epoch": 2.1932756964457254, + "grad_norm": 0.7754416019144194, + "learning_rate": 1.4311176456579158e-06, + "loss": 0.0427, + "step": 2854 + }, + { + "epoch": 2.1948126801152736, + "grad_norm": 0.7345157001852429, + "learning_rate": 1.4260703119215324e-06, + "loss": 0.0425, + "step": 2856 + }, + { + "epoch": 2.1963496637848223, + "grad_norm": 0.6537911636031369, + "learning_rate": 1.4210296162980526e-06, + "loss": 0.04, + "step": 2858 + }, + { + "epoch": 2.1978866474543706, + "grad_norm": 0.6733339321649235, + "learning_rate": 1.415995574921444e-06, + "loss": 0.0358, + "step": 2860 + }, + { + "epoch": 2.1994236311239193, + "grad_norm": 0.6520397122582371, + "learning_rate": 1.4109682039043717e-06, + "loss": 0.0367, + "step": 2862 + }, + { + "epoch": 2.200960614793468, + "grad_norm": 0.7851918798134458, + "learning_rate": 1.4059475193381485e-06, + "loss": 0.044, + "step": 2864 + }, + { + "epoch": 2.202497598463016, + "grad_norm": 0.6944754890985299, + "learning_rate": 1.4009335372926895e-06, + "loss": 0.0388, + "step": 2866 + }, + { + "epoch": 2.204034582132565, + "grad_norm": 0.7185388294778169, + "learning_rate": 1.3959262738164548e-06, + "loss": 0.0434, + "step": 2868 + }, + { + "epoch": 2.2055715658021136, + "grad_norm": 0.7581514840792837, + "learning_rate": 1.3909257449364012e-06, + "loss": 0.0419, + "step": 2870 + }, + { + "epoch": 2.207108549471662, + "grad_norm": 0.6783363501218831, + "learning_rate": 1.38593196665793e-06, + "loss": 0.0374, + "step": 2872 + }, + { + "epoch": 2.2086455331412105, + "grad_norm": 0.6911512020404058, + "learning_rate": 1.3809449549648313e-06, + "loss": 0.037, + "step": 2874 + }, + { + "epoch": 2.2101825168107587, + "grad_norm": 0.6909020138454802, + "learning_rate": 1.3759647258192446e-06, + "loss": 0.0388, + "step": 2876 + }, + { + "epoch": 2.2117195004803074, + "grad_norm": 0.6798676329168656, + "learning_rate": 1.370991295161596e-06, + "loss": 0.0371, + "step": 2878 + }, + { + "epoch": 2.213256484149856, + "grad_norm": 0.7757828920522275, + "learning_rate": 1.3660246789105472e-06, + "loss": 0.0383, + "step": 2880 + }, + { + "epoch": 2.2147934678194043, + "grad_norm": 0.7289028293318881, + "learning_rate": 1.3610648929629593e-06, + "loss": 0.0408, + "step": 2882 + }, + { + "epoch": 2.216330451488953, + "grad_norm": 0.7420967431557955, + "learning_rate": 1.3561119531938205e-06, + "loss": 0.043, + "step": 2884 + }, + { + "epoch": 2.2178674351585013, + "grad_norm": 0.7163966573368784, + "learning_rate": 1.3511658754562126e-06, + "loss": 0.0396, + "step": 2886 + }, + { + "epoch": 2.21940441882805, + "grad_norm": 0.7419349249047851, + "learning_rate": 1.3462266755812522e-06, + "loss": 0.0417, + "step": 2888 + }, + { + "epoch": 2.2209414024975986, + "grad_norm": 0.7800199446876597, + "learning_rate": 1.3412943693780412e-06, + "loss": 0.0422, + "step": 2890 + }, + { + "epoch": 2.222478386167147, + "grad_norm": 0.6961688291543298, + "learning_rate": 1.3363689726336185e-06, + "loss": 0.0407, + "step": 2892 + }, + { + "epoch": 2.2240153698366956, + "grad_norm": 0.7098183143734048, + "learning_rate": 1.3314505011129031e-06, + "loss": 0.0441, + "step": 2894 + }, + { + "epoch": 2.225552353506244, + "grad_norm": 0.6599491965120293, + "learning_rate": 1.3265389705586513e-06, + "loss": 0.0391, + "step": 2896 + }, + { + "epoch": 2.2270893371757925, + "grad_norm": 0.6447579173261855, + "learning_rate": 1.3216343966914074e-06, + "loss": 0.0346, + "step": 2898 + }, + { + "epoch": 2.228626320845341, + "grad_norm": 0.7146194112938924, + "learning_rate": 1.31673679520944e-06, + "loss": 0.0414, + "step": 2900 + }, + { + "epoch": 2.2301633045148894, + "grad_norm": 0.7599787345630702, + "learning_rate": 1.3118461817887071e-06, + "loss": 0.0426, + "step": 2902 + }, + { + "epoch": 2.231700288184438, + "grad_norm": 0.7045861225059481, + "learning_rate": 1.3069625720827984e-06, + "loss": 0.0369, + "step": 2904 + }, + { + "epoch": 2.2332372718539863, + "grad_norm": 0.6678597621037076, + "learning_rate": 1.302085981722887e-06, + "loss": 0.0378, + "step": 2906 + }, + { + "epoch": 2.234774255523535, + "grad_norm": 0.7318552001241747, + "learning_rate": 1.2972164263176784e-06, + "loss": 0.043, + "step": 2908 + }, + { + "epoch": 2.2363112391930837, + "grad_norm": 0.7347881359720747, + "learning_rate": 1.2923539214533586e-06, + "loss": 0.0435, + "step": 2910 + }, + { + "epoch": 2.237848222862632, + "grad_norm": 0.7833005087501644, + "learning_rate": 1.2874984826935493e-06, + "loss": 0.0411, + "step": 2912 + }, + { + "epoch": 2.2393852065321806, + "grad_norm": 0.7340931719392261, + "learning_rate": 1.2826501255792588e-06, + "loss": 0.0419, + "step": 2914 + }, + { + "epoch": 2.2409221902017293, + "grad_norm": 0.7244781359138952, + "learning_rate": 1.277808865628821e-06, + "loss": 0.0403, + "step": 2916 + }, + { + "epoch": 2.2424591738712776, + "grad_norm": 0.7012819390876723, + "learning_rate": 1.2729747183378591e-06, + "loss": 0.041, + "step": 2918 + }, + { + "epoch": 2.2439961575408263, + "grad_norm": 0.7456233730244353, + "learning_rate": 1.2681476991792295e-06, + "loss": 0.0421, + "step": 2920 + }, + { + "epoch": 2.2455331412103745, + "grad_norm": 0.6802488303834758, + "learning_rate": 1.2633278236029728e-06, + "loss": 0.0362, + "step": 2922 + }, + { + "epoch": 2.247070124879923, + "grad_norm": 0.7518464030635749, + "learning_rate": 1.2585151070362655e-06, + "loss": 0.0404, + "step": 2924 + }, + { + "epoch": 2.2486071085494714, + "grad_norm": 0.6909362534084197, + "learning_rate": 1.253709564883368e-06, + "loss": 0.0404, + "step": 2926 + }, + { + "epoch": 2.25014409221902, + "grad_norm": 0.752505311933665, + "learning_rate": 1.2489112125255795e-06, + "loss": 0.0387, + "step": 2928 + }, + { + "epoch": 2.251681075888569, + "grad_norm": 0.74911032053493, + "learning_rate": 1.2441200653211853e-06, + "loss": 0.0383, + "step": 2930 + }, + { + "epoch": 2.253218059558117, + "grad_norm": 0.7048961647841477, + "learning_rate": 1.2393361386054102e-06, + "loss": 0.0359, + "step": 2932 + }, + { + "epoch": 2.2547550432276657, + "grad_norm": 0.7455505928042365, + "learning_rate": 1.2345594476903678e-06, + "loss": 0.0388, + "step": 2934 + }, + { + "epoch": 2.2562920268972144, + "grad_norm": 0.7618852898444295, + "learning_rate": 1.2297900078650073e-06, + "loss": 0.0406, + "step": 2936 + }, + { + "epoch": 2.2578290105667627, + "grad_norm": 0.7877656713372072, + "learning_rate": 1.225027834395077e-06, + "loss": 0.0418, + "step": 2938 + }, + { + "epoch": 2.2593659942363113, + "grad_norm": 0.7487876696738023, + "learning_rate": 1.2202729425230628e-06, + "loss": 0.0435, + "step": 2940 + }, + { + "epoch": 2.2609029779058596, + "grad_norm": 0.7549339920384163, + "learning_rate": 1.215525347468143e-06, + "loss": 0.039, + "step": 2942 + }, + { + "epoch": 2.2624399615754083, + "grad_norm": 0.7381797557455133, + "learning_rate": 1.210785064426143e-06, + "loss": 0.0401, + "step": 2944 + }, + { + "epoch": 2.263976945244957, + "grad_norm": 0.7303121808696214, + "learning_rate": 1.2060521085694836e-06, + "loss": 0.0393, + "step": 2946 + }, + { + "epoch": 2.265513928914505, + "grad_norm": 0.7091244479813347, + "learning_rate": 1.2013264950471333e-06, + "loss": 0.0426, + "step": 2948 + }, + { + "epoch": 2.267050912584054, + "grad_norm": 0.8368399757213874, + "learning_rate": 1.196608238984561e-06, + "loss": 0.0488, + "step": 2950 + }, + { + "epoch": 2.268587896253602, + "grad_norm": 0.7827298872789893, + "learning_rate": 1.191897355483681e-06, + "loss": 0.0439, + "step": 2952 + }, + { + "epoch": 2.270124879923151, + "grad_norm": 0.7171969138664943, + "learning_rate": 1.1871938596228177e-06, + "loss": 0.0388, + "step": 2954 + }, + { + "epoch": 2.2716618635926995, + "grad_norm": 0.7079483043665579, + "learning_rate": 1.1824977664566453e-06, + "loss": 0.0418, + "step": 2956 + }, + { + "epoch": 2.2731988472622477, + "grad_norm": 0.7574665616251981, + "learning_rate": 1.1778090910161426e-06, + "loss": 0.0416, + "step": 2958 + }, + { + "epoch": 2.2747358309317964, + "grad_norm": 0.7111652785046836, + "learning_rate": 1.1731278483085481e-06, + "loss": 0.0427, + "step": 2960 + }, + { + "epoch": 2.2762728146013447, + "grad_norm": 0.7535827297817603, + "learning_rate": 1.1684540533173104e-06, + "loss": 0.041, + "step": 2962 + }, + { + "epoch": 2.2778097982708934, + "grad_norm": 0.7519434123378339, + "learning_rate": 1.1637877210020395e-06, + "loss": 0.0408, + "step": 2964 + }, + { + "epoch": 2.279346781940442, + "grad_norm": 0.7008462349602121, + "learning_rate": 1.1591288662984594e-06, + "loss": 0.0397, + "step": 2966 + }, + { + "epoch": 2.2808837656099903, + "grad_norm": 0.7152887905478794, + "learning_rate": 1.154477504118357e-06, + "loss": 0.038, + "step": 2968 + }, + { + "epoch": 2.282420749279539, + "grad_norm": 0.7322203327399467, + "learning_rate": 1.1498336493495444e-06, + "loss": 0.0424, + "step": 2970 + }, + { + "epoch": 2.283957732949087, + "grad_norm": 0.705019954856083, + "learning_rate": 1.1451973168557992e-06, + "loss": 0.0393, + "step": 2972 + }, + { + "epoch": 2.285494716618636, + "grad_norm": 0.7314831773427871, + "learning_rate": 1.1405685214768224e-06, + "loss": 0.042, + "step": 2974 + }, + { + "epoch": 2.2870317002881846, + "grad_norm": 0.7024944801405867, + "learning_rate": 1.1359472780281916e-06, + "loss": 0.04, + "step": 2976 + }, + { + "epoch": 2.288568683957733, + "grad_norm": 0.7216103123781318, + "learning_rate": 1.1313336013013139e-06, + "loss": 0.04, + "step": 2978 + }, + { + "epoch": 2.2901056676272815, + "grad_norm": 0.6819781243729732, + "learning_rate": 1.126727506063375e-06, + "loss": 0.0383, + "step": 2980 + }, + { + "epoch": 2.29164265129683, + "grad_norm": 0.6997754986778041, + "learning_rate": 1.122129007057296e-06, + "loss": 0.0402, + "step": 2982 + }, + { + "epoch": 2.2931796349663784, + "grad_norm": 0.6626184454931626, + "learning_rate": 1.1175381190016833e-06, + "loss": 0.038, + "step": 2984 + }, + { + "epoch": 2.294716618635927, + "grad_norm": 0.6890948335939371, + "learning_rate": 1.1129548565907842e-06, + "loss": 0.041, + "step": 2986 + }, + { + "epoch": 2.2962536023054754, + "grad_norm": 0.6433359932240779, + "learning_rate": 1.108379234494435e-06, + "loss": 0.0351, + "step": 2988 + }, + { + "epoch": 2.297790585975024, + "grad_norm": 0.7392619695394935, + "learning_rate": 1.103811267358019e-06, + "loss": 0.0414, + "step": 2990 + }, + { + "epoch": 2.2993275696445723, + "grad_norm": 0.7246127735420398, + "learning_rate": 1.0992509698024226e-06, + "loss": 0.0389, + "step": 2992 + }, + { + "epoch": 2.300864553314121, + "grad_norm": 0.7447654773215131, + "learning_rate": 1.0946983564239754e-06, + "loss": 0.0459, + "step": 2994 + }, + { + "epoch": 2.3024015369836697, + "grad_norm": 0.7324507127817105, + "learning_rate": 1.0901534417944175e-06, + "loss": 0.0377, + "step": 2996 + }, + { + "epoch": 2.303938520653218, + "grad_norm": 0.6915535971116652, + "learning_rate": 1.0856162404608466e-06, + "loss": 0.0361, + "step": 2998 + }, + { + "epoch": 2.3054755043227666, + "grad_norm": 0.7490224319565619, + "learning_rate": 1.0810867669456705e-06, + "loss": 0.0375, + "step": 3000 + }, + { + "epoch": 2.3054755043227666, + "eval_loss": 0.1596132069826126, + "eval_runtime": 361.7237, + "eval_samples_per_second": 51.158, + "eval_steps_per_second": 6.397, + "step": 3000 + }, + { + "epoch": 2.3070124879923153, + "grad_norm": 0.7500197718141116, + "learning_rate": 1.0765650357465648e-06, + "loss": 0.0439, + "step": 3002 + }, + { + "epoch": 2.3085494716618635, + "grad_norm": 0.7509126020866208, + "learning_rate": 1.07205106133642e-06, + "loss": 0.0399, + "step": 3004 + }, + { + "epoch": 2.310086455331412, + "grad_norm": 0.6985233959996506, + "learning_rate": 1.0675448581633016e-06, + "loss": 0.0368, + "step": 3006 + }, + { + "epoch": 2.3116234390009605, + "grad_norm": 0.7109894469419616, + "learning_rate": 1.063046440650405e-06, + "loss": 0.0394, + "step": 3008 + }, + { + "epoch": 2.313160422670509, + "grad_norm": 0.6685986841427571, + "learning_rate": 1.0585558231959986e-06, + "loss": 0.0354, + "step": 3010 + }, + { + "epoch": 2.314697406340058, + "grad_norm": 0.6537755179648417, + "learning_rate": 1.0540730201733887e-06, + "loss": 0.0341, + "step": 3012 + }, + { + "epoch": 2.316234390009606, + "grad_norm": 0.7680457966259363, + "learning_rate": 1.0495980459308696e-06, + "loss": 0.0401, + "step": 3014 + }, + { + "epoch": 2.3177713736791548, + "grad_norm": 0.7160760279013273, + "learning_rate": 1.0451309147916773e-06, + "loss": 0.0385, + "step": 3016 + }, + { + "epoch": 2.319308357348703, + "grad_norm": 0.6930648625031354, + "learning_rate": 1.040671641053945e-06, + "loss": 0.0385, + "step": 3018 + }, + { + "epoch": 2.3208453410182517, + "grad_norm": 0.7212560558538098, + "learning_rate": 1.036220238990653e-06, + "loss": 0.038, + "step": 3020 + }, + { + "epoch": 2.3223823246878004, + "grad_norm": 0.7246474758799715, + "learning_rate": 1.0317767228495906e-06, + "loss": 0.0371, + "step": 3022 + }, + { + "epoch": 2.3239193083573486, + "grad_norm": 0.7534642781877979, + "learning_rate": 1.0273411068533037e-06, + "loss": 0.0387, + "step": 3024 + }, + { + "epoch": 2.3254562920268973, + "grad_norm": 0.7525530298880164, + "learning_rate": 1.0229134051990528e-06, + "loss": 0.0406, + "step": 3026 + }, + { + "epoch": 2.3269932756964455, + "grad_norm": 0.7939486095316215, + "learning_rate": 1.018493632058767e-06, + "loss": 0.0445, + "step": 3028 + }, + { + "epoch": 2.3285302593659942, + "grad_norm": 0.7218972892190336, + "learning_rate": 1.0140818015789975e-06, + "loss": 0.0405, + "step": 3030 + }, + { + "epoch": 2.330067243035543, + "grad_norm": 0.7983391471167234, + "learning_rate": 1.009677927880873e-06, + "loss": 0.04, + "step": 3032 + }, + { + "epoch": 2.331604226705091, + "grad_norm": 0.7867565276794657, + "learning_rate": 1.0052820250600568e-06, + "loss": 0.0433, + "step": 3034 + }, + { + "epoch": 2.33314121037464, + "grad_norm": 0.7385399550662357, + "learning_rate": 1.000894107186695e-06, + "loss": 0.0358, + "step": 3036 + }, + { + "epoch": 2.334678194044188, + "grad_norm": 0.7023204396957986, + "learning_rate": 9.965141883053803e-07, + "loss": 0.0409, + "step": 3038 + }, + { + "epoch": 2.3362151777137368, + "grad_norm": 0.7239223187447263, + "learning_rate": 9.921422824351001e-07, + "loss": 0.0401, + "step": 3040 + }, + { + "epoch": 2.3377521613832855, + "grad_norm": 0.7201483530659722, + "learning_rate": 9.87778403569196e-07, + "loss": 0.0374, + "step": 3042 + }, + { + "epoch": 2.3392891450528337, + "grad_norm": 0.7067782590935108, + "learning_rate": 9.834225656753167e-07, + "loss": 0.0364, + "step": 3044 + }, + { + "epoch": 2.3408261287223824, + "grad_norm": 0.767002733961986, + "learning_rate": 9.790747826953707e-07, + "loss": 0.0382, + "step": 3046 + }, + { + "epoch": 2.342363112391931, + "grad_norm": 0.7748316117552638, + "learning_rate": 9.747350685454906e-07, + "loss": 0.0416, + "step": 3048 + }, + { + "epoch": 2.3439000960614793, + "grad_norm": 0.6930253730222432, + "learning_rate": 9.704034371159801e-07, + "loss": 0.0409, + "step": 3050 + }, + { + "epoch": 2.345437079731028, + "grad_norm": 0.7552114221352043, + "learning_rate": 9.660799022712694e-07, + "loss": 0.0412, + "step": 3052 + }, + { + "epoch": 2.3469740634005762, + "grad_norm": 0.7612525976410068, + "learning_rate": 9.617644778498773e-07, + "loss": 0.0383, + "step": 3054 + }, + { + "epoch": 2.348511047070125, + "grad_norm": 0.7280586427447336, + "learning_rate": 9.574571776643617e-07, + "loss": 0.0433, + "step": 3056 + }, + { + "epoch": 2.350048030739673, + "grad_norm": 0.6693940571370718, + "learning_rate": 9.531580155012778e-07, + "loss": 0.04, + "step": 3058 + }, + { + "epoch": 2.351585014409222, + "grad_norm": 0.7048821090945985, + "learning_rate": 9.488670051211332e-07, + "loss": 0.0394, + "step": 3060 + }, + { + "epoch": 2.3531219980787705, + "grad_norm": 0.7505448748331673, + "learning_rate": 9.4458416025834e-07, + "loss": 0.0392, + "step": 3062 + }, + { + "epoch": 2.354658981748319, + "grad_norm": 0.6771190590157495, + "learning_rate": 9.403094946211808e-07, + "loss": 0.0413, + "step": 3064 + }, + { + "epoch": 2.3561959654178675, + "grad_norm": 0.7260675282763582, + "learning_rate": 9.360430218917558e-07, + "loss": 0.0398, + "step": 3066 + }, + { + "epoch": 2.357732949087416, + "grad_norm": 0.687104316635614, + "learning_rate": 9.317847557259387e-07, + "loss": 0.0388, + "step": 3068 + }, + { + "epoch": 2.3592699327569644, + "grad_norm": 0.6959356324751542, + "learning_rate": 9.275347097533404e-07, + "loss": 0.0409, + "step": 3070 + }, + { + "epoch": 2.360806916426513, + "grad_norm": 0.7566129208877186, + "learning_rate": 9.232928975772597e-07, + "loss": 0.0386, + "step": 3072 + }, + { + "epoch": 2.3623439000960613, + "grad_norm": 0.7158570072440973, + "learning_rate": 9.190593327746406e-07, + "loss": 0.0389, + "step": 3074 + }, + { + "epoch": 2.36388088376561, + "grad_norm": 0.7207667687469005, + "learning_rate": 9.148340288960307e-07, + "loss": 0.0409, + "step": 3076 + }, + { + "epoch": 2.3654178674351587, + "grad_norm": 0.7445605342380751, + "learning_rate": 9.106169994655347e-07, + "loss": 0.0407, + "step": 3078 + }, + { + "epoch": 2.366954851104707, + "grad_norm": 0.7736558101051751, + "learning_rate": 9.064082579807752e-07, + "loss": 0.0387, + "step": 3080 + }, + { + "epoch": 2.3684918347742556, + "grad_norm": 0.7435778083456558, + "learning_rate": 9.022078179128439e-07, + "loss": 0.0422, + "step": 3082 + }, + { + "epoch": 2.370028818443804, + "grad_norm": 0.680093837395743, + "learning_rate": 8.980156927062642e-07, + "loss": 0.0341, + "step": 3084 + }, + { + "epoch": 2.3715658021133526, + "grad_norm": 0.801244212730542, + "learning_rate": 8.93831895778948e-07, + "loss": 0.0372, + "step": 3086 + }, + { + "epoch": 2.3731027857829012, + "grad_norm": 0.7357307681580836, + "learning_rate": 8.896564405221447e-07, + "loss": 0.0377, + "step": 3088 + }, + { + "epoch": 2.3746397694524495, + "grad_norm": 0.7209769877560075, + "learning_rate": 8.85489340300409e-07, + "loss": 0.0423, + "step": 3090 + }, + { + "epoch": 2.376176753121998, + "grad_norm": 0.7330652449279741, + "learning_rate": 8.813306084515513e-07, + "loss": 0.0416, + "step": 3092 + }, + { + "epoch": 2.3777137367915464, + "grad_norm": 0.7504076654960494, + "learning_rate": 8.771802582865972e-07, + "loss": 0.0422, + "step": 3094 + }, + { + "epoch": 2.379250720461095, + "grad_norm": 0.6852298932331159, + "learning_rate": 8.730383030897463e-07, + "loss": 0.038, + "step": 3096 + }, + { + "epoch": 2.380787704130644, + "grad_norm": 0.6881568732248152, + "learning_rate": 8.689047561183245e-07, + "loss": 0.0382, + "step": 3098 + }, + { + "epoch": 2.382324687800192, + "grad_norm": 0.6634157742716904, + "learning_rate": 8.647796306027476e-07, + "loss": 0.0379, + "step": 3100 + }, + { + "epoch": 2.3838616714697407, + "grad_norm": 0.7328461753734546, + "learning_rate": 8.606629397464787e-07, + "loss": 0.0408, + "step": 3102 + }, + { + "epoch": 2.385398655139289, + "grad_norm": 0.7159969869187992, + "learning_rate": 8.565546967259782e-07, + "loss": 0.0351, + "step": 3104 + }, + { + "epoch": 2.3869356388088376, + "grad_norm": 0.7267286204071172, + "learning_rate": 8.524549146906714e-07, + "loss": 0.0399, + "step": 3106 + }, + { + "epoch": 2.3884726224783863, + "grad_norm": 0.6908926932230512, + "learning_rate": 8.483636067629014e-07, + "loss": 0.0434, + "step": 3108 + }, + { + "epoch": 2.3900096061479346, + "grad_norm": 0.697297173531623, + "learning_rate": 8.442807860378868e-07, + "loss": 0.0398, + "step": 3110 + }, + { + "epoch": 2.3915465898174832, + "grad_norm": 0.704515687419953, + "learning_rate": 8.40206465583683e-07, + "loss": 0.041, + "step": 3112 + }, + { + "epoch": 2.393083573487032, + "grad_norm": 0.717830937968017, + "learning_rate": 8.361406584411343e-07, + "loss": 0.0382, + "step": 3114 + }, + { + "epoch": 2.39462055715658, + "grad_norm": 0.6666435251973266, + "learning_rate": 8.320833776238384e-07, + "loss": 0.0355, + "step": 3116 + }, + { + "epoch": 2.396157540826129, + "grad_norm": 0.7678187211880796, + "learning_rate": 8.280346361181063e-07, + "loss": 0.0445, + "step": 3118 + }, + { + "epoch": 2.397694524495677, + "grad_norm": 0.6358919496522946, + "learning_rate": 8.239944468829094e-07, + "loss": 0.038, + "step": 3120 + }, + { + "epoch": 2.399231508165226, + "grad_norm": 0.6575677433822967, + "learning_rate": 8.199628228498507e-07, + "loss": 0.0382, + "step": 3122 + }, + { + "epoch": 2.400768491834774, + "grad_norm": 0.7346980326551955, + "learning_rate": 8.159397769231166e-07, + "loss": 0.0406, + "step": 3124 + }, + { + "epoch": 2.4023054755043227, + "grad_norm": 0.6780951881965817, + "learning_rate": 8.119253219794369e-07, + "loss": 0.0376, + "step": 3126 + }, + { + "epoch": 2.4038424591738714, + "grad_norm": 0.7518449654269455, + "learning_rate": 8.079194708680458e-07, + "loss": 0.0432, + "step": 3128 + }, + { + "epoch": 2.4053794428434196, + "grad_norm": 0.6735026739540719, + "learning_rate": 8.039222364106351e-07, + "loss": 0.0353, + "step": 3130 + }, + { + "epoch": 2.4069164265129683, + "grad_norm": 0.6342721214297197, + "learning_rate": 7.999336314013204e-07, + "loss": 0.0402, + "step": 3132 + }, + { + "epoch": 2.408453410182517, + "grad_norm": 0.7230053307776508, + "learning_rate": 7.959536686065951e-07, + "loss": 0.0391, + "step": 3134 + }, + { + "epoch": 2.4099903938520653, + "grad_norm": 0.7653416656760699, + "learning_rate": 7.919823607652921e-07, + "loss": 0.0399, + "step": 3136 + }, + { + "epoch": 2.411527377521614, + "grad_norm": 0.8030771738565683, + "learning_rate": 7.880197205885418e-07, + "loss": 0.0429, + "step": 3138 + }, + { + "epoch": 2.413064361191162, + "grad_norm": 0.7931388452091263, + "learning_rate": 7.840657607597282e-07, + "loss": 0.0428, + "step": 3140 + }, + { + "epoch": 2.414601344860711, + "grad_norm": 0.7430780202524496, + "learning_rate": 7.80120493934458e-07, + "loss": 0.0431, + "step": 3142 + }, + { + "epoch": 2.4161383285302596, + "grad_norm": 0.6756429003384381, + "learning_rate": 7.761839327405105e-07, + "loss": 0.038, + "step": 3144 + }, + { + "epoch": 2.417675312199808, + "grad_norm": 0.7445655284954013, + "learning_rate": 7.722560897777989e-07, + "loss": 0.0378, + "step": 3146 + }, + { + "epoch": 2.4192122958693565, + "grad_norm": 0.780771575546592, + "learning_rate": 7.683369776183342e-07, + "loss": 0.0398, + "step": 3148 + }, + { + "epoch": 2.4207492795389047, + "grad_norm": 0.7856619296684908, + "learning_rate": 7.644266088061811e-07, + "loss": 0.0436, + "step": 3150 + }, + { + "epoch": 2.4222862632084534, + "grad_norm": 0.7727263360626014, + "learning_rate": 7.605249958574199e-07, + "loss": 0.0435, + "step": 3152 + }, + { + "epoch": 2.423823246878002, + "grad_norm": 0.7022557826012432, + "learning_rate": 7.566321512601064e-07, + "loss": 0.039, + "step": 3154 + }, + { + "epoch": 2.4253602305475503, + "grad_norm": 0.7135877902071903, + "learning_rate": 7.527480874742269e-07, + "loss": 0.0404, + "step": 3156 + }, + { + "epoch": 2.426897214217099, + "grad_norm": 0.726091607679137, + "learning_rate": 7.488728169316684e-07, + "loss": 0.039, + "step": 3158 + }, + { + "epoch": 2.4284341978866473, + "grad_norm": 0.6803423609158856, + "learning_rate": 7.450063520361706e-07, + "loss": 0.0382, + "step": 3160 + }, + { + "epoch": 2.429971181556196, + "grad_norm": 0.631085298189217, + "learning_rate": 7.411487051632861e-07, + "loss": 0.0369, + "step": 3162 + }, + { + "epoch": 2.4315081652257446, + "grad_norm": 0.68946022968755, + "learning_rate": 7.372998886603466e-07, + "loss": 0.0383, + "step": 3164 + }, + { + "epoch": 2.433045148895293, + "grad_norm": 0.6831510201370967, + "learning_rate": 7.33459914846419e-07, + "loss": 0.0431, + "step": 3166 + }, + { + "epoch": 2.4345821325648416, + "grad_norm": 0.7684038963667288, + "learning_rate": 7.296287960122672e-07, + "loss": 0.0414, + "step": 3168 + }, + { + "epoch": 2.43611911623439, + "grad_norm": 0.7347215978088572, + "learning_rate": 7.258065444203128e-07, + "loss": 0.0416, + "step": 3170 + }, + { + "epoch": 2.4376560999039385, + "grad_norm": 0.7643085623961072, + "learning_rate": 7.219931723045929e-07, + "loss": 0.0433, + "step": 3172 + }, + { + "epoch": 2.439193083573487, + "grad_norm": 0.7071330978188225, + "learning_rate": 7.181886918707297e-07, + "loss": 0.037, + "step": 3174 + }, + { + "epoch": 2.4407300672430354, + "grad_norm": 0.6841434498931844, + "learning_rate": 7.143931152958791e-07, + "loss": 0.0372, + "step": 3176 + }, + { + "epoch": 2.442267050912584, + "grad_norm": 0.692314572344589, + "learning_rate": 7.106064547287002e-07, + "loss": 0.0345, + "step": 3178 + }, + { + "epoch": 2.443804034582133, + "grad_norm": 0.7363574996455537, + "learning_rate": 7.068287222893179e-07, + "loss": 0.0379, + "step": 3180 + }, + { + "epoch": 2.445341018251681, + "grad_norm": 0.6822850019837609, + "learning_rate": 7.030599300692748e-07, + "loss": 0.0368, + "step": 3182 + }, + { + "epoch": 2.4468780019212297, + "grad_norm": 0.7168612788689864, + "learning_rate": 6.993000901315013e-07, + "loss": 0.0375, + "step": 3184 + }, + { + "epoch": 2.448414985590778, + "grad_norm": 0.7882428320516394, + "learning_rate": 6.955492145102735e-07, + "loss": 0.0411, + "step": 3186 + }, + { + "epoch": 2.4499519692603267, + "grad_norm": 0.7300434190827599, + "learning_rate": 6.918073152111746e-07, + "loss": 0.0411, + "step": 3188 + }, + { + "epoch": 2.451488952929875, + "grad_norm": 0.7866316609425517, + "learning_rate": 6.88074404211058e-07, + "loss": 0.0439, + "step": 3190 + }, + { + "epoch": 2.4530259365994236, + "grad_norm": 0.7171956812535517, + "learning_rate": 6.843504934580055e-07, + "loss": 0.0388, + "step": 3192 + }, + { + "epoch": 2.4545629202689723, + "grad_norm": 0.704134115079434, + "learning_rate": 6.806355948712931e-07, + "loss": 0.0366, + "step": 3194 + }, + { + "epoch": 2.4560999039385205, + "grad_norm": 0.7589267365486366, + "learning_rate": 6.76929720341353e-07, + "loss": 0.0402, + "step": 3196 + }, + { + "epoch": 2.457636887608069, + "grad_norm": 0.7438363100690424, + "learning_rate": 6.732328817297294e-07, + "loss": 0.0383, + "step": 3198 + }, + { + "epoch": 2.459173871277618, + "grad_norm": 0.7366824621405825, + "learning_rate": 6.695450908690482e-07, + "loss": 0.0402, + "step": 3200 + }, + { + "epoch": 2.460710854947166, + "grad_norm": 0.7557689558570855, + "learning_rate": 6.658663595629751e-07, + "loss": 0.0386, + "step": 3202 + }, + { + "epoch": 2.462247838616715, + "grad_norm": 0.7592608254472845, + "learning_rate": 6.621966995861775e-07, + "loss": 0.0383, + "step": 3204 + }, + { + "epoch": 2.463784822286263, + "grad_norm": 0.7295386054315369, + "learning_rate": 6.585361226842899e-07, + "loss": 0.0361, + "step": 3206 + }, + { + "epoch": 2.4653218059558117, + "grad_norm": 0.6625997315002057, + "learning_rate": 6.548846405738713e-07, + "loss": 0.038, + "step": 3208 + }, + { + "epoch": 2.46685878962536, + "grad_norm": 0.7014497663989586, + "learning_rate": 6.512422649423715e-07, + "loss": 0.0412, + "step": 3210 + }, + { + "epoch": 2.4683957732949087, + "grad_norm": 0.783558414213728, + "learning_rate": 6.476090074480966e-07, + "loss": 0.0393, + "step": 3212 + }, + { + "epoch": 2.4699327569644574, + "grad_norm": 0.7244776560799225, + "learning_rate": 6.439848797201623e-07, + "loss": 0.038, + "step": 3214 + }, + { + "epoch": 2.4714697406340056, + "grad_norm": 0.7224162285528907, + "learning_rate": 6.403698933584653e-07, + "loss": 0.0409, + "step": 3216 + }, + { + "epoch": 2.4730067243035543, + "grad_norm": 0.761865817385147, + "learning_rate": 6.367640599336425e-07, + "loss": 0.0397, + "step": 3218 + }, + { + "epoch": 2.474543707973103, + "grad_norm": 0.6853505678703133, + "learning_rate": 6.331673909870353e-07, + "loss": 0.0364, + "step": 3220 + }, + { + "epoch": 2.476080691642651, + "grad_norm": 0.716004689687549, + "learning_rate": 6.295798980306516e-07, + "loss": 0.0384, + "step": 3222 + }, + { + "epoch": 2.4776176753122, + "grad_norm": 0.6629622601789238, + "learning_rate": 6.260015925471279e-07, + "loss": 0.0382, + "step": 3224 + }, + { + "epoch": 2.479154658981748, + "grad_norm": 0.6579568031971793, + "learning_rate": 6.224324859896957e-07, + "loss": 0.0355, + "step": 3226 + }, + { + "epoch": 2.480691642651297, + "grad_norm": 0.7068591652816046, + "learning_rate": 6.18872589782143e-07, + "loss": 0.041, + "step": 3228 + }, + { + "epoch": 2.4822286263208455, + "grad_norm": 0.7304546722170435, + "learning_rate": 6.153219153187772e-07, + "loss": 0.04, + "step": 3230 + }, + { + "epoch": 2.4837656099903938, + "grad_norm": 0.7585585312793138, + "learning_rate": 6.117804739643907e-07, + "loss": 0.0394, + "step": 3232 + }, + { + "epoch": 2.4853025936599424, + "grad_norm": 0.7054671882965444, + "learning_rate": 6.082482770542192e-07, + "loss": 0.0378, + "step": 3234 + }, + { + "epoch": 2.4868395773294907, + "grad_norm": 0.703977106591607, + "learning_rate": 6.047253358939142e-07, + "loss": 0.0383, + "step": 3236 + }, + { + "epoch": 2.4883765609990394, + "grad_norm": 0.6400130530644246, + "learning_rate": 6.01211661759501e-07, + "loss": 0.0397, + "step": 3238 + }, + { + "epoch": 2.489913544668588, + "grad_norm": 0.7278600446549451, + "learning_rate": 5.977072658973393e-07, + "loss": 0.0384, + "step": 3240 + }, + { + "epoch": 2.4914505283381363, + "grad_norm": 0.746204962644141, + "learning_rate": 5.942121595240963e-07, + "loss": 0.0362, + "step": 3242 + }, + { + "epoch": 2.492987512007685, + "grad_norm": 0.7292907227238307, + "learning_rate": 5.907263538267036e-07, + "loss": 0.0386, + "step": 3244 + }, + { + "epoch": 2.4945244956772337, + "grad_norm": 0.7163392398345111, + "learning_rate": 5.872498599623248e-07, + "loss": 0.0401, + "step": 3246 + }, + { + "epoch": 2.496061479346782, + "grad_norm": 0.7435508674073057, + "learning_rate": 5.837826890583188e-07, + "loss": 0.0416, + "step": 3248 + }, + { + "epoch": 2.4975984630163306, + "grad_norm": 0.72066787754061, + "learning_rate": 5.803248522122008e-07, + "loss": 0.0394, + "step": 3250 + }, + { + "epoch": 2.499135446685879, + "grad_norm": 0.732919013969617, + "learning_rate": 5.768763604916157e-07, + "loss": 0.0411, + "step": 3252 + }, + { + "epoch": 2.5006724303554275, + "grad_norm": 0.7405159195872731, + "learning_rate": 5.734372249342942e-07, + "loss": 0.0417, + "step": 3254 + }, + { + "epoch": 2.5022094140249758, + "grad_norm": 0.7197000994936994, + "learning_rate": 5.700074565480184e-07, + "loss": 0.041, + "step": 3256 + }, + { + "epoch": 2.5037463976945245, + "grad_norm": 0.7015912360666993, + "learning_rate": 5.665870663105918e-07, + "loss": 0.0384, + "step": 3258 + }, + { + "epoch": 2.505283381364073, + "grad_norm": 0.7221454603303137, + "learning_rate": 5.631760651697987e-07, + "loss": 0.0406, + "step": 3260 + }, + { + "epoch": 2.5068203650336214, + "grad_norm": 0.6898827529762818, + "learning_rate": 5.59774464043373e-07, + "loss": 0.0347, + "step": 3262 + }, + { + "epoch": 2.50835734870317, + "grad_norm": 0.7561654186730838, + "learning_rate": 5.56382273818961e-07, + "loss": 0.0401, + "step": 3264 + }, + { + "epoch": 2.5098943323727188, + "grad_norm": 0.7634858564317603, + "learning_rate": 5.529995053540845e-07, + "loss": 0.0393, + "step": 3266 + }, + { + "epoch": 2.511431316042267, + "grad_norm": 0.7573259002994046, + "learning_rate": 5.496261694761138e-07, + "loss": 0.0369, + "step": 3268 + }, + { + "epoch": 2.5129682997118157, + "grad_norm": 0.712091317631427, + "learning_rate": 5.46262276982225e-07, + "loss": 0.0364, + "step": 3270 + }, + { + "epoch": 2.514505283381364, + "grad_norm": 0.7452058266292231, + "learning_rate": 5.429078386393659e-07, + "loss": 0.0376, + "step": 3272 + }, + { + "epoch": 2.5160422670509126, + "grad_norm": 0.6945382662735007, + "learning_rate": 5.3956286518423e-07, + "loss": 0.0383, + "step": 3274 + }, + { + "epoch": 2.517579250720461, + "grad_norm": 0.7542364397801513, + "learning_rate": 5.362273673232104e-07, + "loss": 0.0404, + "step": 3276 + }, + { + "epoch": 2.5191162343900095, + "grad_norm": 0.6802822757529954, + "learning_rate": 5.329013557323747e-07, + "loss": 0.0373, + "step": 3278 + }, + { + "epoch": 2.520653218059558, + "grad_norm": 0.6897346854529715, + "learning_rate": 5.295848410574261e-07, + "loss": 0.0371, + "step": 3280 + }, + { + "epoch": 2.5221902017291065, + "grad_norm": 0.7751087080675236, + "learning_rate": 5.262778339136709e-07, + "loss": 0.0395, + "step": 3282 + }, + { + "epoch": 2.523727185398655, + "grad_norm": 0.739466897171713, + "learning_rate": 5.229803448859851e-07, + "loss": 0.0387, + "step": 3284 + }, + { + "epoch": 2.525264169068204, + "grad_norm": 0.7326229919534164, + "learning_rate": 5.196923845287773e-07, + "loss": 0.0399, + "step": 3286 + }, + { + "epoch": 2.526801152737752, + "grad_norm": 0.6978471766129466, + "learning_rate": 5.164139633659586e-07, + "loss": 0.0371, + "step": 3288 + }, + { + "epoch": 2.5283381364073008, + "grad_norm": 0.7419075355790281, + "learning_rate": 5.1314509189091e-07, + "loss": 0.0418, + "step": 3290 + }, + { + "epoch": 2.5298751200768494, + "grad_norm": 0.6611449090491568, + "learning_rate": 5.098857805664424e-07, + "loss": 0.0363, + "step": 3292 + }, + { + "epoch": 2.5314121037463977, + "grad_norm": 0.6982486243686108, + "learning_rate": 5.066360398247698e-07, + "loss": 0.0379, + "step": 3294 + }, + { + "epoch": 2.532949087415946, + "grad_norm": 0.6911500588630615, + "learning_rate": 5.033958800674717e-07, + "loss": 0.0367, + "step": 3296 + }, + { + "epoch": 2.5344860710854946, + "grad_norm": 0.669502871148484, + "learning_rate": 5.001653116654625e-07, + "loss": 0.0345, + "step": 3298 + }, + { + "epoch": 2.5360230547550433, + "grad_norm": 0.6572589542045425, + "learning_rate": 4.969443449589576e-07, + "loss": 0.0366, + "step": 3300 + }, + { + "epoch": 2.5375600384245915, + "grad_norm": 0.6553120661704981, + "learning_rate": 4.937329902574367e-07, + "loss": 0.0382, + "step": 3302 + }, + { + "epoch": 2.5390970220941402, + "grad_norm": 0.7504177803505651, + "learning_rate": 4.90531257839617e-07, + "loss": 0.0376, + "step": 3304 + }, + { + "epoch": 2.540634005763689, + "grad_norm": 0.6822045806022221, + "learning_rate": 4.873391579534186e-07, + "loss": 0.0383, + "step": 3306 + }, + { + "epoch": 2.542170989433237, + "grad_norm": 0.6780574733579846, + "learning_rate": 4.841567008159255e-07, + "loss": 0.0352, + "step": 3308 + }, + { + "epoch": 2.543707973102786, + "grad_norm": 0.721214268014702, + "learning_rate": 4.809838966133612e-07, + "loss": 0.0396, + "step": 3310 + }, + { + "epoch": 2.5452449567723345, + "grad_norm": 0.7202960873503238, + "learning_rate": 4.778207555010525e-07, + "loss": 0.043, + "step": 3312 + }, + { + "epoch": 2.5467819404418828, + "grad_norm": 0.6736522099205745, + "learning_rate": 4.74667287603396e-07, + "loss": 0.0379, + "step": 3314 + }, + { + "epoch": 2.5483189241114315, + "grad_norm": 0.7223414892006996, + "learning_rate": 4.715235030138284e-07, + "loss": 0.036, + "step": 3316 + }, + { + "epoch": 2.5498559077809797, + "grad_norm": 0.6901701228966389, + "learning_rate": 4.6838941179479045e-07, + "loss": 0.0389, + "step": 3318 + }, + { + "epoch": 2.5513928914505284, + "grad_norm": 0.7617514775980445, + "learning_rate": 4.652650239776981e-07, + "loss": 0.0399, + "step": 3320 + }, + { + "epoch": 2.5529298751200766, + "grad_norm": 0.7419234804032083, + "learning_rate": 4.6215034956290966e-07, + "loss": 0.0411, + "step": 3322 + }, + { + "epoch": 2.5544668587896253, + "grad_norm": 0.7045434950164664, + "learning_rate": 4.590453985196933e-07, + "loss": 0.0342, + "step": 3324 + }, + { + "epoch": 2.556003842459174, + "grad_norm": 0.6495176322545015, + "learning_rate": 4.5595018078619483e-07, + "loss": 0.0369, + "step": 3326 + }, + { + "epoch": 2.5575408261287222, + "grad_norm": 0.6788801032608669, + "learning_rate": 4.5286470626940446e-07, + "loss": 0.0351, + "step": 3328 + }, + { + "epoch": 2.559077809798271, + "grad_norm": 0.7617522250263151, + "learning_rate": 4.497889848451303e-07, + "loss": 0.0392, + "step": 3330 + }, + { + "epoch": 2.5606147934678196, + "grad_norm": 0.6989723366985957, + "learning_rate": 4.4672302635796267e-07, + "loss": 0.0385, + "step": 3332 + }, + { + "epoch": 2.562151777137368, + "grad_norm": 0.7054197171986639, + "learning_rate": 4.436668406212402e-07, + "loss": 0.0385, + "step": 3334 + }, + { + "epoch": 2.5636887608069165, + "grad_norm": 0.6582212133744816, + "learning_rate": 4.4062043741702416e-07, + "loss": 0.0357, + "step": 3336 + }, + { + "epoch": 2.565225744476465, + "grad_norm": 0.7280116384325578, + "learning_rate": 4.37583826496064e-07, + "loss": 0.0437, + "step": 3338 + }, + { + "epoch": 2.5667627281460135, + "grad_norm": 0.8381876496092886, + "learning_rate": 4.3455701757776623e-07, + "loss": 0.0388, + "step": 3340 + }, + { + "epoch": 2.5682997118155617, + "grad_norm": 0.7357236219117014, + "learning_rate": 4.315400203501649e-07, + "loss": 0.0389, + "step": 3342 + }, + { + "epoch": 2.5698366954851104, + "grad_norm": 0.7154920208066208, + "learning_rate": 4.2853284446988595e-07, + "loss": 0.0394, + "step": 3344 + }, + { + "epoch": 2.571373679154659, + "grad_norm": 0.7461232486368099, + "learning_rate": 4.2553549956212485e-07, + "loss": 0.0378, + "step": 3346 + }, + { + "epoch": 2.5729106628242073, + "grad_norm": 0.6390926012256907, + "learning_rate": 4.225479952206074e-07, + "loss": 0.0344, + "step": 3348 + }, + { + "epoch": 2.574447646493756, + "grad_norm": 0.6265363948197101, + "learning_rate": 4.195703410075615e-07, + "loss": 0.0358, + "step": 3350 + }, + { + "epoch": 2.5759846301633047, + "grad_norm": 0.7056231822693347, + "learning_rate": 4.1660254645368936e-07, + "loss": 0.0383, + "step": 3352 + }, + { + "epoch": 2.577521613832853, + "grad_norm": 0.7197705899541366, + "learning_rate": 4.1364462105813486e-07, + "loss": 0.0382, + "step": 3354 + }, + { + "epoch": 2.5790585975024016, + "grad_norm": 0.7234355293425967, + "learning_rate": 4.10696574288452e-07, + "loss": 0.0362, + "step": 3356 + }, + { + "epoch": 2.5805955811719503, + "grad_norm": 0.6700554042034669, + "learning_rate": 4.077584155805774e-07, + "loss": 0.0383, + "step": 3358 + }, + { + "epoch": 2.5821325648414986, + "grad_norm": 0.6989541320884164, + "learning_rate": 4.048301543387956e-07, + "loss": 0.0334, + "step": 3360 + }, + { + "epoch": 2.583669548511047, + "grad_norm": 0.7142263192429504, + "learning_rate": 4.0191179993571623e-07, + "loss": 0.0408, + "step": 3362 + }, + { + "epoch": 2.5852065321805955, + "grad_norm": 0.665689407268323, + "learning_rate": 3.990033617122375e-07, + "loss": 0.0343, + "step": 3364 + }, + { + "epoch": 2.586743515850144, + "grad_norm": 0.6706336215208009, + "learning_rate": 3.961048489775169e-07, + "loss": 0.0393, + "step": 3366 + }, + { + "epoch": 2.5882804995196924, + "grad_norm": 0.746265905426555, + "learning_rate": 3.932162710089454e-07, + "loss": 0.0403, + "step": 3368 + }, + { + "epoch": 2.589817483189241, + "grad_norm": 0.7118457765158708, + "learning_rate": 3.903376370521146e-07, + "loss": 0.0381, + "step": 3370 + }, + { + "epoch": 2.59135446685879, + "grad_norm": 0.6708203366211161, + "learning_rate": 3.874689563207873e-07, + "loss": 0.0341, + "step": 3372 + }, + { + "epoch": 2.592891450528338, + "grad_norm": 0.6330517123604927, + "learning_rate": 3.8461023799686917e-07, + "loss": 0.0364, + "step": 3374 + }, + { + "epoch": 2.5944284341978867, + "grad_norm": 0.7349779518025492, + "learning_rate": 3.817614912303794e-07, + "loss": 0.0382, + "step": 3376 + }, + { + "epoch": 2.5959654178674354, + "grad_norm": 0.704462327359088, + "learning_rate": 3.7892272513942003e-07, + "loss": 0.0373, + "step": 3378 + }, + { + "epoch": 2.5975024015369836, + "grad_norm": 0.6642196959244105, + "learning_rate": 3.760939488101463e-07, + "loss": 0.036, + "step": 3380 + }, + { + "epoch": 2.5990393852065323, + "grad_norm": 0.6891895315956058, + "learning_rate": 3.7327517129674055e-07, + "loss": 0.0374, + "step": 3382 + }, + { + "epoch": 2.6005763688760806, + "grad_norm": 0.7644451152560637, + "learning_rate": 3.70466401621383e-07, + "loss": 0.0405, + "step": 3384 + }, + { + "epoch": 2.6021133525456293, + "grad_norm": 0.7737376511884962, + "learning_rate": 3.676676487742172e-07, + "loss": 0.0366, + "step": 3386 + }, + { + "epoch": 2.6036503362151775, + "grad_norm": 0.7181650532484439, + "learning_rate": 3.648789217133282e-07, + "loss": 0.0374, + "step": 3388 + }, + { + "epoch": 2.605187319884726, + "grad_norm": 0.6915226559770837, + "learning_rate": 3.621002293647111e-07, + "loss": 0.0368, + "step": 3390 + }, + { + "epoch": 2.606724303554275, + "grad_norm": 0.7570072346163113, + "learning_rate": 3.593315806222415e-07, + "loss": 0.04, + "step": 3392 + }, + { + "epoch": 2.608261287223823, + "grad_norm": 0.8257096784505046, + "learning_rate": 3.5657298434764884e-07, + "loss": 0.0409, + "step": 3394 + }, + { + "epoch": 2.609798270893372, + "grad_norm": 0.7210127478796208, + "learning_rate": 3.538244493704862e-07, + "loss": 0.0382, + "step": 3396 + }, + { + "epoch": 2.6113352545629205, + "grad_norm": 0.7234247835643788, + "learning_rate": 3.510859844881033e-07, + "loss": 0.0374, + "step": 3398 + }, + { + "epoch": 2.6128722382324687, + "grad_norm": 0.7384845763851767, + "learning_rate": 3.4835759846562047e-07, + "loss": 0.0386, + "step": 3400 + }, + { + "epoch": 2.6144092219020174, + "grad_norm": 0.7147474025303922, + "learning_rate": 3.45639300035894e-07, + "loss": 0.0346, + "step": 3402 + }, + { + "epoch": 2.6159462055715657, + "grad_norm": 0.6463222792959664, + "learning_rate": 3.429310978994955e-07, + "loss": 0.0336, + "step": 3404 + }, + { + "epoch": 2.6174831892411143, + "grad_norm": 0.7095670666778837, + "learning_rate": 3.402330007246798e-07, + "loss": 0.0354, + "step": 3406 + }, + { + "epoch": 2.6190201729106626, + "grad_norm": 0.6506286288306146, + "learning_rate": 3.3754501714735867e-07, + "loss": 0.0373, + "step": 3408 + }, + { + "epoch": 2.6205571565802113, + "grad_norm": 0.7120588233101057, + "learning_rate": 3.34867155771074e-07, + "loss": 0.0387, + "step": 3410 + }, + { + "epoch": 2.62209414024976, + "grad_norm": 0.7586108600316385, + "learning_rate": 3.321994251669659e-07, + "loss": 0.0372, + "step": 3412 + }, + { + "epoch": 2.623631123919308, + "grad_norm": 0.8046131513589455, + "learning_rate": 3.295418338737517e-07, + "loss": 0.0394, + "step": 3414 + }, + { + "epoch": 2.625168107588857, + "grad_norm": 0.6335632790717333, + "learning_rate": 3.2689439039769407e-07, + "loss": 0.0344, + "step": 3416 + }, + { + "epoch": 2.6267050912584056, + "grad_norm": 0.7948767979535154, + "learning_rate": 3.2425710321257503e-07, + "loss": 0.0413, + "step": 3418 + }, + { + "epoch": 2.628242074927954, + "grad_norm": 0.6525048541754009, + "learning_rate": 3.216299807596697e-07, + "loss": 0.0364, + "step": 3420 + }, + { + "epoch": 2.6297790585975025, + "grad_norm": 0.6986942893445027, + "learning_rate": 3.190130314477178e-07, + "loss": 0.0424, + "step": 3422 + }, + { + "epoch": 2.631316042267051, + "grad_norm": 0.6837105014346364, + "learning_rate": 3.164062636528975e-07, + "loss": 0.036, + "step": 3424 + }, + { + "epoch": 2.6328530259365994, + "grad_norm": 0.6890104339174508, + "learning_rate": 3.138096857188001e-07, + "loss": 0.0343, + "step": 3426 + }, + { + "epoch": 2.6343900096061477, + "grad_norm": 0.7400251284066703, + "learning_rate": 3.1122330595639864e-07, + "loss": 0.0338, + "step": 3428 + }, + { + "epoch": 2.6359269932756964, + "grad_norm": 0.7249635097965398, + "learning_rate": 3.0864713264402697e-07, + "loss": 0.0409, + "step": 3430 + }, + { + "epoch": 2.637463976945245, + "grad_norm": 0.6532133989690416, + "learning_rate": 3.060811740273496e-07, + "loss": 0.0381, + "step": 3432 + }, + { + "epoch": 2.6390009606147933, + "grad_norm": 0.7524963132197209, + "learning_rate": 3.03525438319338e-07, + "loss": 0.0403, + "step": 3434 + }, + { + "epoch": 2.640537944284342, + "grad_norm": 0.7653173809713844, + "learning_rate": 3.0097993370024114e-07, + "loss": 0.039, + "step": 3436 + }, + { + "epoch": 2.6420749279538907, + "grad_norm": 0.7480100060268294, + "learning_rate": 2.9844466831756e-07, + "loss": 0.0414, + "step": 3438 + }, + { + "epoch": 2.643611911623439, + "grad_norm": 0.7160584018035305, + "learning_rate": 2.9591965028602584e-07, + "loss": 0.0355, + "step": 3440 + }, + { + "epoch": 2.6451488952929876, + "grad_norm": 0.7617333555835748, + "learning_rate": 2.9340488768756837e-07, + "loss": 0.0402, + "step": 3442 + }, + { + "epoch": 2.6466858789625363, + "grad_norm": 0.6637519028178263, + "learning_rate": 2.909003885712919e-07, + "loss": 0.0391, + "step": 3444 + }, + { + "epoch": 2.6482228626320845, + "grad_norm": 0.6831014037948032, + "learning_rate": 2.8840616095345085e-07, + "loss": 0.0362, + "step": 3446 + }, + { + "epoch": 2.649759846301633, + "grad_norm": 0.6786000704889272, + "learning_rate": 2.859222128174235e-07, + "loss": 0.0378, + "step": 3448 + }, + { + "epoch": 2.6512968299711814, + "grad_norm": 0.8326763730243029, + "learning_rate": 2.83448552113686e-07, + "loss": 0.0405, + "step": 3450 + }, + { + "epoch": 2.65283381364073, + "grad_norm": 0.7015006547909788, + "learning_rate": 2.809851867597875e-07, + "loss": 0.0367, + "step": 3452 + }, + { + "epoch": 2.6543707973102784, + "grad_norm": 0.710615174193323, + "learning_rate": 2.7853212464032146e-07, + "loss": 0.041, + "step": 3454 + }, + { + "epoch": 2.655907780979827, + "grad_norm": 0.6849647259800099, + "learning_rate": 2.7608937360690814e-07, + "loss": 0.0363, + "step": 3456 + }, + { + "epoch": 2.6574447646493757, + "grad_norm": 0.7276694595529292, + "learning_rate": 2.736569414781617e-07, + "loss": 0.0383, + "step": 3458 + }, + { + "epoch": 2.658981748318924, + "grad_norm": 0.774608853311179, + "learning_rate": 2.7123483603966824e-07, + "loss": 0.0402, + "step": 3460 + }, + { + "epoch": 2.6605187319884727, + "grad_norm": 0.7083364735387814, + "learning_rate": 2.6882306504396143e-07, + "loss": 0.0401, + "step": 3462 + }, + { + "epoch": 2.6620557156580213, + "grad_norm": 0.6714099006775984, + "learning_rate": 2.664216362104964e-07, + "loss": 0.0354, + "step": 3464 + }, + { + "epoch": 2.6635926993275696, + "grad_norm": 0.6437816346046543, + "learning_rate": 2.64030557225627e-07, + "loss": 0.035, + "step": 3466 + }, + { + "epoch": 2.6651296829971183, + "grad_norm": 0.7386329454344525, + "learning_rate": 2.6164983574257875e-07, + "loss": 0.0411, + "step": 3468 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.7173981240012108, + "learning_rate": 2.592794793814257e-07, + "loss": 0.0351, + "step": 3470 + }, + { + "epoch": 2.668203650336215, + "grad_norm": 0.7373052648241346, + "learning_rate": 2.569194957290666e-07, + "loss": 0.0367, + "step": 3472 + }, + { + "epoch": 2.6697406340057634, + "grad_norm": 0.7328221093520572, + "learning_rate": 2.5456989233919775e-07, + "loss": 0.0395, + "step": 3474 + }, + { + "epoch": 2.671277617675312, + "grad_norm": 0.6381830501668562, + "learning_rate": 2.5223067673229285e-07, + "loss": 0.0336, + "step": 3476 + }, + { + "epoch": 2.672814601344861, + "grad_norm": 0.7536818768122339, + "learning_rate": 2.4990185639557777e-07, + "loss": 0.0398, + "step": 3478 + }, + { + "epoch": 2.674351585014409, + "grad_norm": 0.7737529199470804, + "learning_rate": 2.475834387830031e-07, + "loss": 0.0379, + "step": 3480 + }, + { + "epoch": 2.6758885686839577, + "grad_norm": 0.7616061727604434, + "learning_rate": 2.452754313152246e-07, + "loss": 0.0403, + "step": 3482 + }, + { + "epoch": 2.6774255523535064, + "grad_norm": 0.7209975392281303, + "learning_rate": 2.429778413795775e-07, + "loss": 0.0383, + "step": 3484 + }, + { + "epoch": 2.6789625360230547, + "grad_norm": 0.6777003415447909, + "learning_rate": 2.4069067633005426e-07, + "loss": 0.0397, + "step": 3486 + }, + { + "epoch": 2.6804995196926034, + "grad_norm": 0.7469038413792495, + "learning_rate": 2.3841394348727856e-07, + "loss": 0.0392, + "step": 3488 + }, + { + "epoch": 2.682036503362152, + "grad_norm": 0.7456099202324231, + "learning_rate": 2.3614765013848365e-07, + "loss": 0.0389, + "step": 3490 + }, + { + "epoch": 2.6835734870317003, + "grad_norm": 0.758937959458986, + "learning_rate": 2.338918035374881e-07, + "loss": 0.0381, + "step": 3492 + }, + { + "epoch": 2.6851104707012485, + "grad_norm": 0.7223222799308419, + "learning_rate": 2.3164641090467535e-07, + "loss": 0.0399, + "step": 3494 + }, + { + "epoch": 2.686647454370797, + "grad_norm": 0.750470365319683, + "learning_rate": 2.29411479426966e-07, + "loss": 0.0389, + "step": 3496 + }, + { + "epoch": 2.688184438040346, + "grad_norm": 0.8119786466865458, + "learning_rate": 2.27187016257798e-07, + "loss": 0.0394, + "step": 3498 + }, + { + "epoch": 2.689721421709894, + "grad_norm": 0.7063154674095581, + "learning_rate": 2.2497302851710354e-07, + "loss": 0.0366, + "step": 3500 + }, + { + "epoch": 2.689721421709894, + "eval_loss": 0.15855057537555695, + "eval_runtime": 360.8492, + "eval_samples_per_second": 51.282, + "eval_steps_per_second": 6.413, + "step": 3500 + }, + { + "epoch": 2.691258405379443, + "grad_norm": 0.7006450588051517, + "learning_rate": 2.2276952329128462e-07, + "loss": 0.0357, + "step": 3502 + }, + { + "epoch": 2.6927953890489915, + "grad_norm": 0.6689362757218708, + "learning_rate": 2.2057650763319235e-07, + "loss": 0.0377, + "step": 3504 + }, + { + "epoch": 2.6943323727185398, + "grad_norm": 0.7529959367992736, + "learning_rate": 2.1839398856210274e-07, + "loss": 0.0381, + "step": 3506 + }, + { + "epoch": 2.6958693563880884, + "grad_norm": 0.7460063989500526, + "learning_rate": 2.1622197306369529e-07, + "loss": 0.0403, + "step": 3508 + }, + { + "epoch": 2.697406340057637, + "grad_norm": 0.7548043108085488, + "learning_rate": 2.1406046809003016e-07, + "loss": 0.0382, + "step": 3510 + }, + { + "epoch": 2.6989433237271854, + "grad_norm": 0.7421499181728671, + "learning_rate": 2.1190948055952634e-07, + "loss": 0.0382, + "step": 3512 + }, + { + "epoch": 2.700480307396734, + "grad_norm": 0.712291075185728, + "learning_rate": 2.097690173569392e-07, + "loss": 0.0374, + "step": 3514 + }, + { + "epoch": 2.7020172910662823, + "grad_norm": 0.6764437704157867, + "learning_rate": 2.0763908533333742e-07, + "loss": 0.0358, + "step": 3516 + }, + { + "epoch": 2.703554274735831, + "grad_norm": 0.7710410887906691, + "learning_rate": 2.0551969130608366e-07, + "loss": 0.0405, + "step": 3518 + }, + { + "epoch": 2.7050912584053792, + "grad_norm": 0.6628607572891497, + "learning_rate": 2.0341084205881088e-07, + "loss": 0.0334, + "step": 3520 + }, + { + "epoch": 2.706628242074928, + "grad_norm": 0.7047102030885535, + "learning_rate": 2.0131254434139894e-07, + "loss": 0.0364, + "step": 3522 + }, + { + "epoch": 2.7081652257444766, + "grad_norm": 0.7511681830084664, + "learning_rate": 1.992248048699576e-07, + "loss": 0.0423, + "step": 3524 + }, + { + "epoch": 2.709702209414025, + "grad_norm": 0.6655250328949831, + "learning_rate": 1.971476303268007e-07, + "loss": 0.0355, + "step": 3526 + }, + { + "epoch": 2.7112391930835735, + "grad_norm": 0.7160428780372823, + "learning_rate": 1.950810273604274e-07, + "loss": 0.0375, + "step": 3528 + }, + { + "epoch": 2.712776176753122, + "grad_norm": 0.7004993169642538, + "learning_rate": 1.930250025855006e-07, + "loss": 0.0381, + "step": 3530 + }, + { + "epoch": 2.7143131604226705, + "grad_norm": 0.6778326775797812, + "learning_rate": 1.90979562582822e-07, + "loss": 0.0359, + "step": 3532 + }, + { + "epoch": 2.715850144092219, + "grad_norm": 0.7164337600080629, + "learning_rate": 1.8894471389931838e-07, + "loss": 0.0373, + "step": 3534 + }, + { + "epoch": 2.7173871277617674, + "grad_norm": 0.7141990651264797, + "learning_rate": 1.869204630480142e-07, + "loss": 0.0375, + "step": 3536 + }, + { + "epoch": 2.718924111431316, + "grad_norm": 0.7001303648061724, + "learning_rate": 1.8490681650801216e-07, + "loss": 0.0355, + "step": 3538 + }, + { + "epoch": 2.7204610951008643, + "grad_norm": 0.7165227897430166, + "learning_rate": 1.829037807244751e-07, + "loss": 0.0361, + "step": 3540 + }, + { + "epoch": 2.721998078770413, + "grad_norm": 0.7103603815004305, + "learning_rate": 1.8091136210860293e-07, + "loss": 0.0394, + "step": 3542 + }, + { + "epoch": 2.7235350624399617, + "grad_norm": 0.7231011724413698, + "learning_rate": 1.789295670376128e-07, + "loss": 0.0408, + "step": 3544 + }, + { + "epoch": 2.72507204610951, + "grad_norm": 0.7187891219293795, + "learning_rate": 1.7695840185471828e-07, + "loss": 0.0353, + "step": 3546 + }, + { + "epoch": 2.7266090297790586, + "grad_norm": 0.7249222842840717, + "learning_rate": 1.7499787286910896e-07, + "loss": 0.0383, + "step": 3548 + }, + { + "epoch": 2.7281460134486073, + "grad_norm": 0.6726610987757241, + "learning_rate": 1.7304798635593227e-07, + "loss": 0.0345, + "step": 3550 + }, + { + "epoch": 2.7296829971181555, + "grad_norm": 0.7798682542487884, + "learning_rate": 1.711087485562714e-07, + "loss": 0.0395, + "step": 3552 + }, + { + "epoch": 2.7312199807877042, + "grad_norm": 0.8489020328845095, + "learning_rate": 1.6918016567712457e-07, + "loss": 0.0388, + "step": 3554 + }, + { + "epoch": 2.732756964457253, + "grad_norm": 0.7675735405799808, + "learning_rate": 1.672622438913869e-07, + "loss": 0.0413, + "step": 3556 + }, + { + "epoch": 2.734293948126801, + "grad_norm": 0.7672370602760468, + "learning_rate": 1.6535498933783083e-07, + "loss": 0.0392, + "step": 3558 + }, + { + "epoch": 2.7358309317963494, + "grad_norm": 0.706536918085323, + "learning_rate": 1.634584081210853e-07, + "loss": 0.0353, + "step": 3560 + }, + { + "epoch": 2.737367915465898, + "grad_norm": 0.6514716793107945, + "learning_rate": 1.6157250631161624e-07, + "loss": 0.029, + "step": 3562 + }, + { + "epoch": 2.7389048991354468, + "grad_norm": 0.8151870302671732, + "learning_rate": 1.5969728994570786e-07, + "loss": 0.0451, + "step": 3564 + }, + { + "epoch": 2.740441882804995, + "grad_norm": 0.7201013965367818, + "learning_rate": 1.5783276502544413e-07, + "loss": 0.0375, + "step": 3566 + }, + { + "epoch": 2.7419788664745437, + "grad_norm": 0.8279664659096065, + "learning_rate": 1.5597893751868574e-07, + "loss": 0.0377, + "step": 3568 + }, + { + "epoch": 2.7435158501440924, + "grad_norm": 0.7556263425034218, + "learning_rate": 1.541358133590562e-07, + "loss": 0.0383, + "step": 3570 + }, + { + "epoch": 2.7450528338136406, + "grad_norm": 0.786743386378996, + "learning_rate": 1.5230339844592033e-07, + "loss": 0.0389, + "step": 3572 + }, + { + "epoch": 2.7465898174831893, + "grad_norm": 0.6751334601351806, + "learning_rate": 1.504816986443635e-07, + "loss": 0.0348, + "step": 3574 + }, + { + "epoch": 2.748126801152738, + "grad_norm": 0.7230502301443176, + "learning_rate": 1.4867071978517626e-07, + "loss": 0.0381, + "step": 3576 + }, + { + "epoch": 2.7496637848222862, + "grad_norm": 0.7424159284142868, + "learning_rate": 1.4687046766483425e-07, + "loss": 0.0377, + "step": 3578 + }, + { + "epoch": 2.751200768491835, + "grad_norm": 0.7033065172321098, + "learning_rate": 1.450809480454787e-07, + "loss": 0.0392, + "step": 3580 + }, + { + "epoch": 2.752737752161383, + "grad_norm": 0.7422740281311779, + "learning_rate": 1.4330216665490024e-07, + "loss": 0.0388, + "step": 3582 + }, + { + "epoch": 2.754274735830932, + "grad_norm": 0.6749501870256193, + "learning_rate": 1.4153412918651736e-07, + "loss": 0.0292, + "step": 3584 + }, + { + "epoch": 2.75581171950048, + "grad_norm": 0.7383402984400056, + "learning_rate": 1.3977684129936095e-07, + "loss": 0.0362, + "step": 3586 + }, + { + "epoch": 2.757348703170029, + "grad_norm": 0.7344177105323607, + "learning_rate": 1.3803030861805686e-07, + "loss": 0.0369, + "step": 3588 + }, + { + "epoch": 2.7588856868395775, + "grad_norm": 0.7788733949519481, + "learning_rate": 1.3629453673280367e-07, + "loss": 0.0386, + "step": 3590 + }, + { + "epoch": 2.7604226705091257, + "grad_norm": 0.7523955526390752, + "learning_rate": 1.3456953119935832e-07, + "loss": 0.0402, + "step": 3592 + }, + { + "epoch": 2.7619596541786744, + "grad_norm": 0.6387136592938574, + "learning_rate": 1.328552975390179e-07, + "loss": 0.0366, + "step": 3594 + }, + { + "epoch": 2.763496637848223, + "grad_norm": 0.7765496538466939, + "learning_rate": 1.3115184123860141e-07, + "loss": 0.038, + "step": 3596 + }, + { + "epoch": 2.7650336215177713, + "grad_norm": 0.6197630907677361, + "learning_rate": 1.2945916775043285e-07, + "loss": 0.0347, + "step": 3598 + }, + { + "epoch": 2.76657060518732, + "grad_norm": 0.6507185122236504, + "learning_rate": 1.2777728249232174e-07, + "loss": 0.0366, + "step": 3600 + }, + { + "epoch": 2.7681075888568683, + "grad_norm": 0.7129555271323699, + "learning_rate": 1.261061908475476e-07, + "loss": 0.0382, + "step": 3602 + }, + { + "epoch": 2.769644572526417, + "grad_norm": 0.7063456528141899, + "learning_rate": 1.244458981648443e-07, + "loss": 0.0357, + "step": 3604 + }, + { + "epoch": 2.771181556195965, + "grad_norm": 0.7313820214800656, + "learning_rate": 1.2279640975837857e-07, + "loss": 0.0361, + "step": 3606 + }, + { + "epoch": 2.772718539865514, + "grad_norm": 0.7892493813976871, + "learning_rate": 1.2115773090773718e-07, + "loss": 0.0383, + "step": 3608 + }, + { + "epoch": 2.7742555235350626, + "grad_norm": 0.7011080024188364, + "learning_rate": 1.1952986685790674e-07, + "loss": 0.0348, + "step": 3610 + }, + { + "epoch": 2.775792507204611, + "grad_norm": 0.7212790327473237, + "learning_rate": 1.1791282281925968e-07, + "loss": 0.0386, + "step": 3612 + }, + { + "epoch": 2.7773294908741595, + "grad_norm": 0.6824883473362624, + "learning_rate": 1.163066039675369e-07, + "loss": 0.0351, + "step": 3614 + }, + { + "epoch": 2.778866474543708, + "grad_norm": 0.6850223544852229, + "learning_rate": 1.1471121544382851e-07, + "loss": 0.0356, + "step": 3616 + }, + { + "epoch": 2.7804034582132564, + "grad_norm": 0.7426536823064712, + "learning_rate": 1.131266623545612e-07, + "loss": 0.0375, + "step": 3618 + }, + { + "epoch": 2.781940441882805, + "grad_norm": 0.7258544058495259, + "learning_rate": 1.1155294977148028e-07, + "loss": 0.0364, + "step": 3620 + }, + { + "epoch": 2.783477425552354, + "grad_norm": 0.7068237112208213, + "learning_rate": 1.0999008273163257e-07, + "loss": 0.0427, + "step": 3622 + }, + { + "epoch": 2.785014409221902, + "grad_norm": 0.723771053564773, + "learning_rate": 1.0843806623735207e-07, + "loss": 0.0378, + "step": 3624 + }, + { + "epoch": 2.7865513928914503, + "grad_norm": 0.8519854888198337, + "learning_rate": 1.0689690525624167e-07, + "loss": 0.04, + "step": 3626 + }, + { + "epoch": 2.788088376560999, + "grad_norm": 0.706127807785433, + "learning_rate": 1.0536660472115993e-07, + "loss": 0.0393, + "step": 3628 + }, + { + "epoch": 2.7896253602305476, + "grad_norm": 0.6705497949051035, + "learning_rate": 1.0384716953020323e-07, + "loss": 0.0333, + "step": 3630 + }, + { + "epoch": 2.791162343900096, + "grad_norm": 0.6886374108827851, + "learning_rate": 1.0233860454669097e-07, + "loss": 0.0354, + "step": 3632 + }, + { + "epoch": 2.7926993275696446, + "grad_norm": 0.6453996780069803, + "learning_rate": 1.0084091459914929e-07, + "loss": 0.0333, + "step": 3634 + }, + { + "epoch": 2.7942363112391932, + "grad_norm": 0.7016064494453276, + "learning_rate": 9.935410448129705e-08, + "loss": 0.0388, + "step": 3636 + }, + { + "epoch": 2.7957732949087415, + "grad_norm": 0.7099217898470163, + "learning_rate": 9.787817895202876e-08, + "loss": 0.0341, + "step": 3638 + }, + { + "epoch": 2.79731027857829, + "grad_norm": 0.7392413254776985, + "learning_rate": 9.641314273540175e-08, + "loss": 0.0412, + "step": 3640 + }, + { + "epoch": 2.798847262247839, + "grad_norm": 0.6659018777001534, + "learning_rate": 9.495900052061629e-08, + "loss": 0.036, + "step": 3642 + }, + { + "epoch": 2.800384245917387, + "grad_norm": 0.7498210451825361, + "learning_rate": 9.351575696200754e-08, + "loss": 0.0379, + "step": 3644 + }, + { + "epoch": 2.801921229586936, + "grad_norm": 0.7188638489637743, + "learning_rate": 9.208341667902487e-08, + "loss": 0.0349, + "step": 3646 + }, + { + "epoch": 2.803458213256484, + "grad_norm": 0.6975773584570585, + "learning_rate": 9.06619842562183e-08, + "loss": 0.0323, + "step": 3648 + }, + { + "epoch": 2.8049951969260327, + "grad_norm": 0.6554005252536124, + "learning_rate": 8.925146424322644e-08, + "loss": 0.0329, + "step": 3650 + }, + { + "epoch": 2.806532180595581, + "grad_norm": 0.7073740246874296, + "learning_rate": 8.78518611547594e-08, + "loss": 0.0386, + "step": 3652 + }, + { + "epoch": 2.8080691642651296, + "grad_norm": 0.6691649384368833, + "learning_rate": 8.64631794705844e-08, + "loss": 0.0399, + "step": 3654 + }, + { + "epoch": 2.8096061479346783, + "grad_norm": 0.7533417497153267, + "learning_rate": 8.508542363551296e-08, + "loss": 0.0376, + "step": 3656 + }, + { + "epoch": 2.8111431316042266, + "grad_norm": 0.7324347703672368, + "learning_rate": 8.371859805938497e-08, + "loss": 0.0364, + "step": 3658 + }, + { + "epoch": 2.8126801152737753, + "grad_norm": 0.6931453128827018, + "learning_rate": 8.236270711705662e-08, + "loss": 0.0326, + "step": 3660 + }, + { + "epoch": 2.814217098943324, + "grad_norm": 0.6612068285353259, + "learning_rate": 8.101775514838372e-08, + "loss": 0.0374, + "step": 3662 + }, + { + "epoch": 2.815754082612872, + "grad_norm": 0.6742383780806924, + "learning_rate": 7.968374645820964e-08, + "loss": 0.0362, + "step": 3664 + }, + { + "epoch": 2.817291066282421, + "grad_norm": 0.7414092505129248, + "learning_rate": 7.836068531635249e-08, + "loss": 0.0403, + "step": 3666 + }, + { + "epoch": 2.818828049951969, + "grad_norm": 0.6917889501190945, + "learning_rate": 7.704857595758802e-08, + "loss": 0.036, + "step": 3668 + }, + { + "epoch": 2.820365033621518, + "grad_norm": 0.7254060046131036, + "learning_rate": 7.574742258163952e-08, + "loss": 0.0403, + "step": 3670 + }, + { + "epoch": 2.821902017291066, + "grad_norm": 0.7112553826453433, + "learning_rate": 7.445722935316307e-08, + "loss": 0.0392, + "step": 3672 + }, + { + "epoch": 2.8234390009606147, + "grad_norm": 0.7231907969490486, + "learning_rate": 7.317800040173311e-08, + "loss": 0.0388, + "step": 3674 + }, + { + "epoch": 2.8249759846301634, + "grad_norm": 0.7143348806863665, + "learning_rate": 7.190973982183124e-08, + "loss": 0.0356, + "step": 3676 + }, + { + "epoch": 2.8265129682997117, + "grad_norm": 0.722256824360624, + "learning_rate": 7.065245167283179e-08, + "loss": 0.0365, + "step": 3678 + }, + { + "epoch": 2.8280499519692603, + "grad_norm": 0.749657038879464, + "learning_rate": 6.940613997898826e-08, + "loss": 0.0377, + "step": 3680 + }, + { + "epoch": 2.829586935638809, + "grad_norm": 0.7506478416183923, + "learning_rate": 6.817080872942393e-08, + "loss": 0.039, + "step": 3682 + }, + { + "epoch": 2.8311239193083573, + "grad_norm": 0.7166949562340486, + "learning_rate": 6.694646187811371e-08, + "loss": 0.0381, + "step": 3684 + }, + { + "epoch": 2.832660902977906, + "grad_norm": 0.6919045066011337, + "learning_rate": 6.573310334387544e-08, + "loss": 0.0359, + "step": 3686 + }, + { + "epoch": 2.8341978866474546, + "grad_norm": 0.7652082696602207, + "learning_rate": 6.453073701035644e-08, + "loss": 0.0387, + "step": 3688 + }, + { + "epoch": 2.835734870317003, + "grad_norm": 0.6875494379565337, + "learning_rate": 6.333936672602058e-08, + "loss": 0.0358, + "step": 3690 + }, + { + "epoch": 2.837271853986551, + "grad_norm": 0.704153070629626, + "learning_rate": 6.215899630413668e-08, + "loss": 0.0399, + "step": 3692 + }, + { + "epoch": 2.8388088376561, + "grad_norm": 0.7683845653293229, + "learning_rate": 6.098962952276449e-08, + "loss": 0.0376, + "step": 3694 + }, + { + "epoch": 2.8403458213256485, + "grad_norm": 0.7444818377336334, + "learning_rate": 5.983127012474498e-08, + "loss": 0.0375, + "step": 3696 + }, + { + "epoch": 2.8418828049951967, + "grad_norm": 0.7131478664262563, + "learning_rate": 5.8683921817687943e-08, + "loss": 0.0377, + "step": 3698 + }, + { + "epoch": 2.8434197886647454, + "grad_norm": 0.6844062973629339, + "learning_rate": 5.7547588273958336e-08, + "loss": 0.0348, + "step": 3700 + }, + { + "epoch": 2.844956772334294, + "grad_norm": 0.6917343381332636, + "learning_rate": 5.6422273130665835e-08, + "loss": 0.0378, + "step": 3702 + }, + { + "epoch": 2.8464937560038424, + "grad_norm": 0.7486578311506366, + "learning_rate": 5.5307979989653534e-08, + "loss": 0.041, + "step": 3704 + }, + { + "epoch": 2.848030739673391, + "grad_norm": 0.7065517002670538, + "learning_rate": 5.420471241748592e-08, + "loss": 0.0344, + "step": 3706 + }, + { + "epoch": 2.8495677233429397, + "grad_norm": 0.7218309216210936, + "learning_rate": 5.311247394543761e-08, + "loss": 0.0396, + "step": 3708 + }, + { + "epoch": 2.851104707012488, + "grad_norm": 0.7344762397348806, + "learning_rate": 5.203126806948127e-08, + "loss": 0.0398, + "step": 3710 + }, + { + "epoch": 2.8526416906820367, + "grad_norm": 0.6978785514396233, + "learning_rate": 5.0961098250277166e-08, + "loss": 0.035, + "step": 3712 + }, + { + "epoch": 2.854178674351585, + "grad_norm": 0.7492370044630394, + "learning_rate": 4.990196791316304e-08, + "loss": 0.0395, + "step": 3714 + }, + { + "epoch": 2.8557156580211336, + "grad_norm": 0.7347118419552384, + "learning_rate": 4.8853880448140876e-08, + "loss": 0.0373, + "step": 3716 + }, + { + "epoch": 2.857252641690682, + "grad_norm": 0.6710823443691567, + "learning_rate": 4.781683920986801e-08, + "loss": 0.0353, + "step": 3718 + }, + { + "epoch": 2.8587896253602305, + "grad_norm": 0.7656067717475032, + "learning_rate": 4.679084751764467e-08, + "loss": 0.0405, + "step": 3720 + }, + { + "epoch": 2.860326609029779, + "grad_norm": 0.6658481972086318, + "learning_rate": 4.5775908655405814e-08, + "loss": 0.0392, + "step": 3722 + }, + { + "epoch": 2.8618635926993274, + "grad_norm": 0.7325672821180539, + "learning_rate": 4.4772025871709085e-08, + "loss": 0.0357, + "step": 3724 + }, + { + "epoch": 2.863400576368876, + "grad_norm": 0.7131675994968449, + "learning_rate": 4.377920237972238e-08, + "loss": 0.0391, + "step": 3726 + }, + { + "epoch": 2.864937560038425, + "grad_norm": 0.735115333347456, + "learning_rate": 4.279744135721764e-08, + "loss": 0.0383, + "step": 3728 + }, + { + "epoch": 2.866474543707973, + "grad_norm": 0.7142932075003074, + "learning_rate": 4.182674594655839e-08, + "loss": 0.0347, + "step": 3730 + }, + { + "epoch": 2.8680115273775217, + "grad_norm": 0.7488967955446748, + "learning_rate": 4.0867119254689664e-08, + "loss": 0.0387, + "step": 3732 + }, + { + "epoch": 2.86954851104707, + "grad_norm": 0.7704536931618361, + "learning_rate": 3.991856435312868e-08, + "loss": 0.0399, + "step": 3734 + }, + { + "epoch": 2.8710854947166187, + "grad_norm": 0.7243897987142581, + "learning_rate": 3.898108427795355e-08, + "loss": 0.0407, + "step": 3736 + }, + { + "epoch": 2.872622478386167, + "grad_norm": 0.7696208552815814, + "learning_rate": 3.805468202979706e-08, + "loss": 0.0377, + "step": 3738 + }, + { + "epoch": 2.8741594620557156, + "grad_norm": 0.732607042331649, + "learning_rate": 3.7139360573832715e-08, + "loss": 0.0398, + "step": 3740 + }, + { + "epoch": 2.8756964457252643, + "grad_norm": 0.6798024620244276, + "learning_rate": 3.6235122839767707e-08, + "loss": 0.0338, + "step": 3742 + }, + { + "epoch": 2.8772334293948125, + "grad_norm": 0.7560881846201208, + "learning_rate": 3.534197172183323e-08, + "loss": 0.0385, + "step": 3744 + }, + { + "epoch": 2.878770413064361, + "grad_norm": 0.7216366213167684, + "learning_rate": 3.4459910078775914e-08, + "loss": 0.0404, + "step": 3746 + }, + { + "epoch": 2.88030739673391, + "grad_norm": 0.7186180485084087, + "learning_rate": 3.358894073384616e-08, + "loss": 0.0388, + "step": 3748 + }, + { + "epoch": 2.881844380403458, + "grad_norm": 0.755800097162076, + "learning_rate": 3.2729066474792734e-08, + "loss": 0.0355, + "step": 3750 + }, + { + "epoch": 2.883381364073007, + "grad_norm": 0.7190348005662992, + "learning_rate": 3.18802900538499e-08, + "loss": 0.0387, + "step": 3752 + }, + { + "epoch": 2.8849183477425555, + "grad_norm": 0.6881759771848993, + "learning_rate": 3.104261418773241e-08, + "loss": 0.0371, + "step": 3754 + }, + { + "epoch": 2.8864553314121038, + "grad_norm": 0.7368031850460476, + "learning_rate": 3.0216041557624196e-08, + "loss": 0.0363, + "step": 3756 + }, + { + "epoch": 2.887992315081652, + "grad_norm": 0.6818452373614833, + "learning_rate": 2.9400574809169856e-08, + "loss": 0.0385, + "step": 3758 + }, + { + "epoch": 2.8895292987512007, + "grad_norm": 0.671665343657444, + "learning_rate": 2.859621655246841e-08, + "loss": 0.0394, + "step": 3760 + }, + { + "epoch": 2.8910662824207494, + "grad_norm": 0.8081746540797051, + "learning_rate": 2.7802969362062057e-08, + "loss": 0.0426, + "step": 3762 + }, + { + "epoch": 2.8926032660902976, + "grad_norm": 0.7953691794225282, + "learning_rate": 2.702083577693071e-08, + "loss": 0.0401, + "step": 3764 + }, + { + "epoch": 2.8941402497598463, + "grad_norm": 0.6780256825935983, + "learning_rate": 2.624981830048151e-08, + "loss": 0.038, + "step": 3766 + }, + { + "epoch": 2.895677233429395, + "grad_norm": 0.7601952372611993, + "learning_rate": 2.5489919400542236e-08, + "loss": 0.0348, + "step": 3768 + }, + { + "epoch": 2.8972142170989432, + "grad_norm": 0.7521761082939742, + "learning_rate": 2.4741141509353136e-08, + "loss": 0.0376, + "step": 3770 + }, + { + "epoch": 2.898751200768492, + "grad_norm": 0.7042258551510416, + "learning_rate": 2.4003487023557978e-08, + "loss": 0.0383, + "step": 3772 + }, + { + "epoch": 2.9002881844380406, + "grad_norm": 0.6754060246689739, + "learning_rate": 2.3276958304198235e-08, + "loss": 0.0356, + "step": 3774 + }, + { + "epoch": 2.901825168107589, + "grad_norm": 0.738067712348994, + "learning_rate": 2.2561557676705314e-08, + "loss": 0.0441, + "step": 3776 + }, + { + "epoch": 2.9033621517771375, + "grad_norm": 0.7057955885412858, + "learning_rate": 2.1857287430891213e-08, + "loss": 0.0378, + "step": 3778 + }, + { + "epoch": 2.9048991354466858, + "grad_norm": 0.6886592794650539, + "learning_rate": 2.1164149820942722e-08, + "loss": 0.0366, + "step": 3780 + }, + { + "epoch": 2.9064361191162345, + "grad_norm": 0.7673640335969266, + "learning_rate": 2.048214706541479e-08, + "loss": 0.0406, + "step": 3782 + }, + { + "epoch": 2.9079731027857827, + "grad_norm": 0.6678527359649526, + "learning_rate": 1.9811281347221597e-08, + "loss": 0.037, + "step": 3784 + }, + { + "epoch": 2.9095100864553314, + "grad_norm": 0.7127890261581394, + "learning_rate": 1.9151554813630734e-08, + "loss": 0.0361, + "step": 3786 + }, + { + "epoch": 2.91104707012488, + "grad_norm": 0.7583876230260865, + "learning_rate": 1.850296957625658e-08, + "loss": 0.0378, + "step": 3788 + }, + { + "epoch": 2.9125840537944283, + "grad_norm": 0.6957251087608395, + "learning_rate": 1.7865527711052932e-08, + "loss": 0.0392, + "step": 3790 + }, + { + "epoch": 2.914121037463977, + "grad_norm": 0.7245984947394084, + "learning_rate": 1.7239231258306397e-08, + "loss": 0.0387, + "step": 3792 + }, + { + "epoch": 2.9156580211335257, + "grad_norm": 0.6799380535511241, + "learning_rate": 1.662408222262979e-08, + "loss": 0.0359, + "step": 3794 + }, + { + "epoch": 2.917195004803074, + "grad_norm": 0.6832560309649182, + "learning_rate": 1.6020082572956674e-08, + "loss": 0.0379, + "step": 3796 + }, + { + "epoch": 2.9187319884726226, + "grad_norm": 0.7506399618941308, + "learning_rate": 1.542723424253323e-08, + "loss": 0.0433, + "step": 3798 + }, + { + "epoch": 2.920268972142171, + "grad_norm": 0.7012670552597634, + "learning_rate": 1.4845539128913954e-08, + "loss": 0.038, + "step": 3800 + }, + { + "epoch": 2.9218059558117195, + "grad_norm": 0.8131855947668669, + "learning_rate": 1.4274999093955077e-08, + "loss": 0.038, + "step": 3802 + }, + { + "epoch": 2.9233429394812678, + "grad_norm": 0.72352053716851, + "learning_rate": 1.371561596380677e-08, + "loss": 0.04, + "step": 3804 + }, + { + "epoch": 2.9248799231508165, + "grad_norm": 0.7277652228743811, + "learning_rate": 1.3167391528910831e-08, + "loss": 0.0404, + "step": 3806 + }, + { + "epoch": 2.926416906820365, + "grad_norm": 0.6608930767109022, + "learning_rate": 1.2630327543991349e-08, + "loss": 0.0333, + "step": 3808 + }, + { + "epoch": 2.9279538904899134, + "grad_norm": 0.7251459149646445, + "learning_rate": 1.2104425728051981e-08, + "loss": 0.035, + "step": 3810 + }, + { + "epoch": 2.929490874159462, + "grad_norm": 0.7280772115836247, + "learning_rate": 1.1589687764367807e-08, + "loss": 0.0366, + "step": 3812 + }, + { + "epoch": 2.9310278578290108, + "grad_norm": 0.7583993906119615, + "learning_rate": 1.1086115300482203e-08, + "loss": 0.0377, + "step": 3814 + }, + { + "epoch": 2.932564841498559, + "grad_norm": 0.6946315289320816, + "learning_rate": 1.0593709948200635e-08, + "loss": 0.0375, + "step": 3816 + }, + { + "epoch": 2.9341018251681077, + "grad_norm": 0.7540614936256114, + "learning_rate": 1.011247328358561e-08, + "loss": 0.0419, + "step": 3818 + }, + { + "epoch": 2.9356388088376564, + "grad_norm": 0.7380916570292597, + "learning_rate": 9.642406846950446e-09, + "loss": 0.0391, + "step": 3820 + }, + { + "epoch": 2.9371757925072046, + "grad_norm": 0.7229342418450578, + "learning_rate": 9.183512142857342e-09, + "loss": 0.0363, + "step": 3822 + }, + { + "epoch": 2.938712776176753, + "grad_norm": 0.7727279695915876, + "learning_rate": 8.73579064010882e-09, + "loss": 0.0395, + "step": 3824 + }, + { + "epoch": 2.9402497598463015, + "grad_norm": 0.741689517658393, + "learning_rate": 8.299243771746179e-09, + "loss": 0.0386, + "step": 3826 + }, + { + "epoch": 2.9417867435158502, + "grad_norm": 0.7231470558103498, + "learning_rate": 7.873872935043269e-09, + "loss": 0.041, + "step": 3828 + }, + { + "epoch": 2.9433237271853985, + "grad_norm": 0.6584072678052216, + "learning_rate": 7.459679491501448e-09, + "loss": 0.0345, + "step": 3830 + }, + { + "epoch": 2.944860710854947, + "grad_norm": 0.6835760459984861, + "learning_rate": 7.0566647668476315e-09, + "loss": 0.0384, + "step": 3832 + }, + { + "epoch": 2.946397694524496, + "grad_norm": 0.7191660536723004, + "learning_rate": 6.6648300510276925e-09, + "loss": 0.0358, + "step": 3834 + }, + { + "epoch": 2.947934678194044, + "grad_norm": 0.7051985526337444, + "learning_rate": 6.284176598202573e-09, + "loss": 0.0369, + "step": 3836 + }, + { + "epoch": 2.9494716618635928, + "grad_norm": 0.6889042075604795, + "learning_rate": 5.914705626746342e-09, + "loss": 0.0346, + "step": 3838 + }, + { + "epoch": 2.9510086455331415, + "grad_norm": 0.6797954961166619, + "learning_rate": 5.556418319239975e-09, + "loss": 0.0358, + "step": 3840 + }, + { + "epoch": 2.9525456292026897, + "grad_norm": 0.7081235515177567, + "learning_rate": 5.209315822468252e-09, + "loss": 0.04, + "step": 3842 + }, + { + "epoch": 2.9540826128722384, + "grad_norm": 0.7586899396756654, + "learning_rate": 4.873399247417032e-09, + "loss": 0.041, + "step": 3844 + }, + { + "epoch": 2.9556195965417866, + "grad_norm": 0.6546459502260559, + "learning_rate": 4.548669669267813e-09, + "loss": 0.0379, + "step": 3846 + }, + { + "epoch": 2.9571565802113353, + "grad_norm": 0.7194117012638997, + "learning_rate": 4.23512812739657e-09, + "loss": 0.0352, + "step": 3848 + }, + { + "epoch": 2.9586935638808836, + "grad_norm": 0.6550475400228738, + "learning_rate": 3.9327756253683125e-09, + "loss": 0.0338, + "step": 3850 + }, + { + "epoch": 2.9602305475504322, + "grad_norm": 0.6471117547944658, + "learning_rate": 3.6416131309355283e-09, + "loss": 0.0327, + "step": 3852 + }, + { + "epoch": 2.961767531219981, + "grad_norm": 0.6643076347264795, + "learning_rate": 3.361641576034302e-09, + "loss": 0.0366, + "step": 3854 + }, + { + "epoch": 2.963304514889529, + "grad_norm": 0.7533191809512965, + "learning_rate": 3.0928618567808153e-09, + "loss": 0.0401, + "step": 3856 + }, + { + "epoch": 2.964841498559078, + "grad_norm": 0.7192365559237837, + "learning_rate": 2.835274833469403e-09, + "loss": 0.0372, + "step": 3858 + }, + { + "epoch": 2.9663784822286265, + "grad_norm": 0.720379151963908, + "learning_rate": 2.588881330570225e-09, + "loss": 0.0362, + "step": 3860 + }, + { + "epoch": 2.967915465898175, + "grad_norm": 0.7348332898518517, + "learning_rate": 2.3536821367246e-09, + "loss": 0.0342, + "step": 3862 + }, + { + "epoch": 2.9694524495677235, + "grad_norm": 0.7451802490285939, + "learning_rate": 2.1296780047446172e-09, + "loss": 0.0393, + "step": 3864 + }, + { + "epoch": 2.9709894332372717, + "grad_norm": 0.5897360637433469, + "learning_rate": 1.9168696516092544e-09, + "loss": 0.035, + "step": 3866 + }, + { + "epoch": 2.9725264169068204, + "grad_norm": 0.6721100987536596, + "learning_rate": 1.7152577584635952e-09, + "loss": 0.0376, + "step": 3868 + }, + { + "epoch": 2.9740634005763686, + "grad_norm": 0.7372791169185593, + "learning_rate": 1.524842970614948e-09, + "loss": 0.0393, + "step": 3870 + }, + { + "epoch": 2.9756003842459173, + "grad_norm": 0.7155596616446167, + "learning_rate": 1.3456258975312885e-09, + "loss": 0.041, + "step": 3872 + }, + { + "epoch": 2.977137367915466, + "grad_norm": 0.7243460282148372, + "learning_rate": 1.1776071128412614e-09, + "loss": 0.0373, + "step": 3874 + }, + { + "epoch": 2.9786743515850143, + "grad_norm": 0.8216882893956271, + "learning_rate": 1.0207871543287394e-09, + "loss": 0.0387, + "step": 3876 + }, + { + "epoch": 2.980211335254563, + "grad_norm": 0.7555366446961992, + "learning_rate": 8.7516652393399e-10, + "loss": 0.0387, + "step": 3878 + }, + { + "epoch": 2.9817483189241116, + "grad_norm": 0.7303863102314947, + "learning_rate": 7.40745687751343e-10, + "loss": 0.0375, + "step": 3880 + }, + { + "epoch": 2.98328530259366, + "grad_norm": 0.7352509297982905, + "learning_rate": 6.175250760268591e-10, + "loss": 0.0398, + "step": 3882 + }, + { + "epoch": 2.9848222862632086, + "grad_norm": 0.7396812555733191, + "learning_rate": 5.055050831579422e-10, + "loss": 0.0454, + "step": 3884 + }, + { + "epoch": 2.986359269932757, + "grad_norm": 0.6935348372155691, + "learning_rate": 4.0468606769178403e-10, + "loss": 0.0353, + "step": 3886 + }, + { + "epoch": 2.9878962536023055, + "grad_norm": 0.7901782130270826, + "learning_rate": 3.150683523238107e-10, + "loss": 0.0376, + "step": 3888 + }, + { + "epoch": 2.9894332372718537, + "grad_norm": 0.6621002431203192, + "learning_rate": 2.366522238972935e-10, + "loss": 0.0356, + "step": 3890 + }, + { + "epoch": 2.9909702209414024, + "grad_norm": 0.7167270045432917, + "learning_rate": 1.6943793340179482e-10, + "loss": 0.0381, + "step": 3892 + }, + { + "epoch": 2.992507204610951, + "grad_norm": 0.70796009257099, + "learning_rate": 1.1342569597277973e-10, + "loss": 0.0376, + "step": 3894 + }, + { + "epoch": 2.9940441882804993, + "grad_norm": 0.725988884787138, + "learning_rate": 6.861569089161578e-11, + "loss": 0.0354, + "step": 3896 + }, + { + "epoch": 2.995581171950048, + "grad_norm": 0.6619948029465499, + "learning_rate": 3.500806158285297e-11, + "loss": 0.0362, + "step": 3898 + }, + { + "epoch": 2.9971181556195967, + "grad_norm": 0.7007360064163597, + "learning_rate": 1.2602915616166665e-11, + "loss": 0.0403, + "step": 3900 + }, + { + "epoch": 2.998655139289145, + "grad_norm": 0.6392213863532241, + "learning_rate": 1.4003247044147572e-12, + "loss": 0.0384, + "step": 3902 + }, + { + "epoch": 2.9994236311239195, + "step": 3903, + "total_flos": 1225143666540544.0, + "train_loss": 0.11350150189982632, + "train_runtime": 22267.9689, + "train_samples_per_second": 22.437, + "train_steps_per_second": 0.175 + } + ], + "logging_steps": 2, + "max_steps": 3903, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1225143666540544.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}