diff --git "a/ARM-7B-Pruned/checkpoint-4800/trainer_state.json" "b/ARM-7B-Pruned/checkpoint-4800/trainer_state.json" new file mode 100644--- /dev/null +++ "b/ARM-7B-Pruned/checkpoint-4800/trainer_state.json" @@ -0,0 +1,16833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5120409632770622, + "eval_steps": 50000, + "global_step": 4800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021335040136544257, + "grad_norm": 42.684086970967954, + "learning_rate": 4.264392324093816e-09, + "loss": 0.5986, + "step": 2 + }, + { + "epoch": 0.00042670080273088514, + "grad_norm": 42.537918257537875, + "learning_rate": 8.528784648187632e-09, + "loss": 0.6957, + "step": 4 + }, + { + "epoch": 0.0006400512040963277, + "grad_norm": 58.765999906046574, + "learning_rate": 1.2793176972281448e-08, + "loss": 0.047, + "step": 6 + }, + { + "epoch": 0.0008534016054617703, + "grad_norm": 154.70890416675192, + "learning_rate": 1.7057569296375264e-08, + "loss": -0.6682, + "step": 8 + }, + { + "epoch": 0.0010667520068272129, + "grad_norm": 315.97286091458943, + "learning_rate": 2.1321961620469082e-08, + "loss": 0.0971, + "step": 10 + }, + { + "epoch": 0.0012801024081926554, + "grad_norm": 102.37154058019046, + "learning_rate": 2.5586353944562897e-08, + "loss": 1.0541, + "step": 12 + }, + { + "epoch": 0.001493452809558098, + "grad_norm": 105.127529829733, + "learning_rate": 2.9850746268656714e-08, + "loss": 0.4648, + "step": 14 + }, + { + "epoch": 0.0017068032109235406, + "grad_norm": 48.013196159706226, + "learning_rate": 3.411513859275053e-08, + "loss": -0.4529, + "step": 16 + }, + { + "epoch": 0.0019201536122889832, + "grad_norm": 55.69084727420936, + "learning_rate": 3.837953091684435e-08, + "loss": -0.1927, + "step": 18 + }, + { + "epoch": 0.0021335040136544257, + "grad_norm": 33.25875332287273, + "learning_rate": 4.2643923240938164e-08, + "loss": 0.2517, + "step": 20 + }, + { + "epoch": 0.002346854415019868, + "grad_norm": 245.94604658609376, + "learning_rate": 4.6908315565031985e-08, + "loss": -0.8646, + "step": 22 + }, + { + "epoch": 0.002560204816385311, + "grad_norm": 33.35694659794871, + "learning_rate": 5.117270788912579e-08, + "loss": 0.7383, + "step": 24 + }, + { + "epoch": 0.0027735552177507532, + "grad_norm": 193.28196705366508, + "learning_rate": 5.5437100213219614e-08, + "loss": -0.9655, + "step": 26 + }, + { + "epoch": 0.002986905619116196, + "grad_norm": 50.147398314622684, + "learning_rate": 5.970149253731343e-08, + "loss": -0.5218, + "step": 28 + }, + { + "epoch": 0.0032002560204816384, + "grad_norm": 75.21413198410397, + "learning_rate": 6.396588486140725e-08, + "loss": 0.1629, + "step": 30 + }, + { + "epoch": 0.003413606421847081, + "grad_norm": 103.01315028563378, + "learning_rate": 6.823027718550106e-08, + "loss": -0.7119, + "step": 32 + }, + { + "epoch": 0.0036269568232125235, + "grad_norm": 36.63545743331723, + "learning_rate": 7.249466950959488e-08, + "loss": 0.3538, + "step": 34 + }, + { + "epoch": 0.0038403072245779663, + "grad_norm": 39.002709650329486, + "learning_rate": 7.67590618336887e-08, + "loss": -0.6754, + "step": 36 + }, + { + "epoch": 0.004053657625943409, + "grad_norm": 23.502647009611, + "learning_rate": 8.102345415778252e-08, + "loss": 0.1865, + "step": 38 + }, + { + "epoch": 0.0042670080273088514, + "grad_norm": 26.139670413986497, + "learning_rate": 8.528784648187633e-08, + "loss": -0.3739, + "step": 40 + }, + { + "epoch": 0.004480358428674294, + "grad_norm": 30.02660527440867, + "learning_rate": 8.955223880597014e-08, + "loss": -0.0359, + "step": 42 + }, + { + "epoch": 0.004693708830039736, + "grad_norm": 13.980962042551367, + "learning_rate": 9.381663113006397e-08, + "loss": 0.4169, + "step": 44 + }, + { + "epoch": 0.004907059231405179, + "grad_norm": 94.42191623111091, + "learning_rate": 9.808102345415778e-08, + "loss": -1.4962, + "step": 46 + }, + { + "epoch": 0.005120409632770622, + "grad_norm": 36.146349198898875, + "learning_rate": 1.0234541577825159e-07, + "loss": 0.8652, + "step": 48 + }, + { + "epoch": 0.0053337600341360645, + "grad_norm": 34.112375662585286, + "learning_rate": 1.0660980810234541e-07, + "loss": 0.4556, + "step": 50 + }, + { + "epoch": 0.0055471104355015064, + "grad_norm": 54.41510979168854, + "learning_rate": 1.1087420042643923e-07, + "loss": 0.2452, + "step": 52 + }, + { + "epoch": 0.005760460836866949, + "grad_norm": 27.31961709504184, + "learning_rate": 1.1513859275053305e-07, + "loss": 0.3131, + "step": 54 + }, + { + "epoch": 0.005973811238232392, + "grad_norm": 77.00896363994629, + "learning_rate": 1.1940298507462686e-07, + "loss": -0.0289, + "step": 56 + }, + { + "epoch": 0.006187161639597835, + "grad_norm": 21.8781170195382, + "learning_rate": 1.2366737739872068e-07, + "loss": -0.6188, + "step": 58 + }, + { + "epoch": 0.006400512040963277, + "grad_norm": 16.368843761461356, + "learning_rate": 1.279317697228145e-07, + "loss": 0.2618, + "step": 60 + }, + { + "epoch": 0.0066138624423287195, + "grad_norm": 17.051752922162727, + "learning_rate": 1.3219616204690832e-07, + "loss": -0.1717, + "step": 62 + }, + { + "epoch": 0.006827212843694162, + "grad_norm": 99.13064855085295, + "learning_rate": 1.3646055437100212e-07, + "loss": -0.449, + "step": 64 + }, + { + "epoch": 0.007040563245059605, + "grad_norm": 136.3486847826033, + "learning_rate": 1.4072494669509594e-07, + "loss": 0.2984, + "step": 66 + }, + { + "epoch": 0.007253913646425047, + "grad_norm": 62.554463250965604, + "learning_rate": 1.4498933901918976e-07, + "loss": -0.2482, + "step": 68 + }, + { + "epoch": 0.00746726404779049, + "grad_norm": 51.515904581548206, + "learning_rate": 1.4925373134328355e-07, + "loss": -0.8223, + "step": 70 + }, + { + "epoch": 0.007680614449155933, + "grad_norm": 113.86732871533368, + "learning_rate": 1.535181236673774e-07, + "loss": -0.0383, + "step": 72 + }, + { + "epoch": 0.007893964850521375, + "grad_norm": 26.087446116741575, + "learning_rate": 1.5778251599147122e-07, + "loss": 0.7253, + "step": 74 + }, + { + "epoch": 0.008107315251886817, + "grad_norm": 24.131096654425452, + "learning_rate": 1.6204690831556504e-07, + "loss": -0.0834, + "step": 76 + }, + { + "epoch": 0.008320665653252261, + "grad_norm": 74.42536774731673, + "learning_rate": 1.6631130063965884e-07, + "loss": -0.2997, + "step": 78 + }, + { + "epoch": 0.008534016054617703, + "grad_norm": 71.52938681964571, + "learning_rate": 1.7057569296375266e-07, + "loss": 0.0944, + "step": 80 + }, + { + "epoch": 0.008747366455983145, + "grad_norm": 29.241243114282817, + "learning_rate": 1.7484008528784648e-07, + "loss": -0.7054, + "step": 82 + }, + { + "epoch": 0.008960716857348588, + "grad_norm": 134.58638764428474, + "learning_rate": 1.7910447761194027e-07, + "loss": -0.338, + "step": 84 + }, + { + "epoch": 0.00917406725871403, + "grad_norm": 109.66885361053902, + "learning_rate": 1.8336886993603412e-07, + "loss": -0.1817, + "step": 86 + }, + { + "epoch": 0.009387417660079472, + "grad_norm": 27.969535103995202, + "learning_rate": 1.8763326226012794e-07, + "loss": -0.9258, + "step": 88 + }, + { + "epoch": 0.009600768061444916, + "grad_norm": 27.49676445944368, + "learning_rate": 1.9189765458422174e-07, + "loss": 0.3309, + "step": 90 + }, + { + "epoch": 0.009814118462810358, + "grad_norm": 25.544238337228855, + "learning_rate": 1.9616204690831556e-07, + "loss": -0.0894, + "step": 92 + }, + { + "epoch": 0.010027468864175802, + "grad_norm": 26.09909186001115, + "learning_rate": 2.0042643923240938e-07, + "loss": -0.1209, + "step": 94 + }, + { + "epoch": 0.010240819265541243, + "grad_norm": 32.632466828944544, + "learning_rate": 2.0469083155650317e-07, + "loss": 0.4736, + "step": 96 + }, + { + "epoch": 0.010454169666906685, + "grad_norm": 50.68481961380247, + "learning_rate": 2.08955223880597e-07, + "loss": -0.6321, + "step": 98 + }, + { + "epoch": 0.010667520068272129, + "grad_norm": 25.00088670078768, + "learning_rate": 2.1321961620469082e-07, + "loss": -0.3787, + "step": 100 + }, + { + "epoch": 0.010880870469637571, + "grad_norm": 21.25939551515176, + "learning_rate": 2.1748400852878466e-07, + "loss": 0.1969, + "step": 102 + }, + { + "epoch": 0.011094220871003013, + "grad_norm": 23.873098687717384, + "learning_rate": 2.2174840085287846e-07, + "loss": -0.0885, + "step": 104 + }, + { + "epoch": 0.011307571272368457, + "grad_norm": 17.01811754764777, + "learning_rate": 2.2601279317697228e-07, + "loss": 0.0203, + "step": 106 + }, + { + "epoch": 0.011520921673733898, + "grad_norm": 31.009663505522784, + "learning_rate": 2.302771855010661e-07, + "loss": -0.817, + "step": 108 + }, + { + "epoch": 0.011734272075099342, + "grad_norm": 33.200887615423774, + "learning_rate": 2.345415778251599e-07, + "loss": 0.0471, + "step": 110 + }, + { + "epoch": 0.011947622476464784, + "grad_norm": 32.63042973424626, + "learning_rate": 2.388059701492537e-07, + "loss": -1.5996, + "step": 112 + }, + { + "epoch": 0.012160972877830226, + "grad_norm": 30.22918533639733, + "learning_rate": 2.4307036247334754e-07, + "loss": -0.2466, + "step": 114 + }, + { + "epoch": 0.01237432327919567, + "grad_norm": 27.145400816572458, + "learning_rate": 2.4733475479744136e-07, + "loss": -0.3294, + "step": 116 + }, + { + "epoch": 0.012587673680561112, + "grad_norm": 19.75203303790041, + "learning_rate": 2.515991471215352e-07, + "loss": -0.7427, + "step": 118 + }, + { + "epoch": 0.012801024081926553, + "grad_norm": 16.441388594403527, + "learning_rate": 2.55863539445629e-07, + "loss": -0.2637, + "step": 120 + }, + { + "epoch": 0.013014374483291997, + "grad_norm": 23.81603976262189, + "learning_rate": 2.601279317697228e-07, + "loss": -0.07, + "step": 122 + }, + { + "epoch": 0.013227724884657439, + "grad_norm": 80.42145460011395, + "learning_rate": 2.6439232409381664e-07, + "loss": -0.1647, + "step": 124 + }, + { + "epoch": 0.013441075286022881, + "grad_norm": 28.328954293896295, + "learning_rate": 2.686567164179104e-07, + "loss": -0.1786, + "step": 126 + }, + { + "epoch": 0.013654425687388325, + "grad_norm": 44.29342497352258, + "learning_rate": 2.7292110874200423e-07, + "loss": 0.6337, + "step": 128 + }, + { + "epoch": 0.013867776088753767, + "grad_norm": 14.247744370026695, + "learning_rate": 2.7718550106609805e-07, + "loss": -0.7468, + "step": 130 + }, + { + "epoch": 0.01408112649011921, + "grad_norm": 30.909935252567514, + "learning_rate": 2.8144989339019187e-07, + "loss": 0.2444, + "step": 132 + }, + { + "epoch": 0.014294476891484652, + "grad_norm": 19.951382058960338, + "learning_rate": 2.857142857142857e-07, + "loss": -0.0888, + "step": 134 + }, + { + "epoch": 0.014507827292850094, + "grad_norm": 19.451243521855947, + "learning_rate": 2.899786780383795e-07, + "loss": 0.0622, + "step": 136 + }, + { + "epoch": 0.014721177694215538, + "grad_norm": 44.543818203511506, + "learning_rate": 2.9424307036247334e-07, + "loss": -0.5024, + "step": 138 + }, + { + "epoch": 0.01493452809558098, + "grad_norm": 33.99708212815377, + "learning_rate": 2.985074626865671e-07, + "loss": 0.3096, + "step": 140 + }, + { + "epoch": 0.015147878496946422, + "grad_norm": 23.05447566893894, + "learning_rate": 3.02771855010661e-07, + "loss": 0.3329, + "step": 142 + }, + { + "epoch": 0.015361228898311865, + "grad_norm": 16.88015370164898, + "learning_rate": 3.070362473347548e-07, + "loss": -0.1371, + "step": 144 + }, + { + "epoch": 0.015574579299677307, + "grad_norm": 26.13573562942401, + "learning_rate": 3.113006396588486e-07, + "loss": -0.4183, + "step": 146 + }, + { + "epoch": 0.01578792970104275, + "grad_norm": 22.309803740611596, + "learning_rate": 3.1556503198294244e-07, + "loss": -0.3246, + "step": 148 + }, + { + "epoch": 0.016001280102408193, + "grad_norm": 22.52244155087573, + "learning_rate": 3.1982942430703626e-07, + "loss": -0.5349, + "step": 150 + }, + { + "epoch": 0.016214630503773635, + "grad_norm": 18.473624086858514, + "learning_rate": 3.240938166311301e-07, + "loss": 0.2352, + "step": 152 + }, + { + "epoch": 0.016427980905139077, + "grad_norm": 7.7748886163241355, + "learning_rate": 3.2835820895522385e-07, + "loss": 0.2151, + "step": 154 + }, + { + "epoch": 0.016641331306504522, + "grad_norm": 10.802467734897878, + "learning_rate": 3.3262260127931767e-07, + "loss": 0.2848, + "step": 156 + }, + { + "epoch": 0.016854681707869964, + "grad_norm": 18.162628483420374, + "learning_rate": 3.368869936034115e-07, + "loss": -0.168, + "step": 158 + }, + { + "epoch": 0.017068032109235406, + "grad_norm": 40.528567461071695, + "learning_rate": 3.411513859275053e-07, + "loss": -0.4126, + "step": 160 + }, + { + "epoch": 0.017281382510600848, + "grad_norm": 35.36550572570942, + "learning_rate": 3.4541577825159914e-07, + "loss": 0.0287, + "step": 162 + }, + { + "epoch": 0.01749473291196629, + "grad_norm": 10.892875403462355, + "learning_rate": 3.4968017057569296e-07, + "loss": -0.613, + "step": 164 + }, + { + "epoch": 0.01770808331333173, + "grad_norm": 27.141686915753116, + "learning_rate": 3.539445628997867e-07, + "loss": -0.9305, + "step": 166 + }, + { + "epoch": 0.017921433714697177, + "grad_norm": 37.37206055456951, + "learning_rate": 3.5820895522388055e-07, + "loss": -0.3178, + "step": 168 + }, + { + "epoch": 0.01813478411606262, + "grad_norm": 52.89469341474386, + "learning_rate": 3.6247334754797437e-07, + "loss": -0.6419, + "step": 170 + }, + { + "epoch": 0.01834813451742806, + "grad_norm": 23.215765487324806, + "learning_rate": 3.6673773987206824e-07, + "loss": -0.0533, + "step": 172 + }, + { + "epoch": 0.018561484918793503, + "grad_norm": 21.171847082083467, + "learning_rate": 3.7100213219616206e-07, + "loss": -1.0652, + "step": 174 + }, + { + "epoch": 0.018774835320158945, + "grad_norm": 17.335722663927694, + "learning_rate": 3.752665245202559e-07, + "loss": -0.169, + "step": 176 + }, + { + "epoch": 0.01898818572152439, + "grad_norm": 13.15039858318736, + "learning_rate": 3.795309168443497e-07, + "loss": 0.4414, + "step": 178 + }, + { + "epoch": 0.019201536122889832, + "grad_norm": 9.293181092571949, + "learning_rate": 3.8379530916844347e-07, + "loss": 0.1714, + "step": 180 + }, + { + "epoch": 0.019414886524255274, + "grad_norm": 27.253514499053658, + "learning_rate": 3.880597014925373e-07, + "loss": -0.3719, + "step": 182 + }, + { + "epoch": 0.019628236925620716, + "grad_norm": 11.179212679495947, + "learning_rate": 3.923240938166311e-07, + "loss": -1.5775, + "step": 184 + }, + { + "epoch": 0.019841587326986158, + "grad_norm": 28.002238164167814, + "learning_rate": 3.9658848614072494e-07, + "loss": -0.9116, + "step": 186 + }, + { + "epoch": 0.020054937728351603, + "grad_norm": 20.304811169854457, + "learning_rate": 4.0085287846481876e-07, + "loss": -0.6315, + "step": 188 + }, + { + "epoch": 0.020268288129717045, + "grad_norm": 43.26972545321108, + "learning_rate": 4.051172707889126e-07, + "loss": 0.0818, + "step": 190 + }, + { + "epoch": 0.020481638531082487, + "grad_norm": 23.280256775268967, + "learning_rate": 4.0938166311300635e-07, + "loss": -0.2754, + "step": 192 + }, + { + "epoch": 0.02069498893244793, + "grad_norm": 25.086595962784372, + "learning_rate": 4.1364605543710017e-07, + "loss": 0.1783, + "step": 194 + }, + { + "epoch": 0.02090833933381337, + "grad_norm": 17.929854339567687, + "learning_rate": 4.17910447761194e-07, + "loss": -0.3967, + "step": 196 + }, + { + "epoch": 0.021121689735178813, + "grad_norm": 26.684280905377257, + "learning_rate": 4.221748400852878e-07, + "loss": -0.2432, + "step": 198 + }, + { + "epoch": 0.021335040136544258, + "grad_norm": 20.811875320249584, + "learning_rate": 4.2643923240938163e-07, + "loss": -0.6011, + "step": 200 + }, + { + "epoch": 0.0215483905379097, + "grad_norm": 28.776367397287462, + "learning_rate": 4.3070362473347545e-07, + "loss": -1.6853, + "step": 202 + }, + { + "epoch": 0.021761740939275142, + "grad_norm": 22.23424675819631, + "learning_rate": 4.349680170575693e-07, + "loss": 0.2222, + "step": 204 + }, + { + "epoch": 0.021975091340640584, + "grad_norm": 15.408003573608147, + "learning_rate": 4.392324093816631e-07, + "loss": -0.0164, + "step": 206 + }, + { + "epoch": 0.022188441742006026, + "grad_norm": 8.60164597442568, + "learning_rate": 4.434968017057569e-07, + "loss": -0.5671, + "step": 208 + }, + { + "epoch": 0.02240179214337147, + "grad_norm": 10.343966416597475, + "learning_rate": 4.4776119402985074e-07, + "loss": -0.815, + "step": 210 + }, + { + "epoch": 0.022615142544736913, + "grad_norm": 15.741330211675757, + "learning_rate": 4.5202558635394456e-07, + "loss": -1.0932, + "step": 212 + }, + { + "epoch": 0.022828492946102355, + "grad_norm": 16.39054521840376, + "learning_rate": 4.562899786780384e-07, + "loss": -1.7177, + "step": 214 + }, + { + "epoch": 0.023041843347467797, + "grad_norm": 11.781875373813582, + "learning_rate": 4.605543710021322e-07, + "loss": 0.3606, + "step": 216 + }, + { + "epoch": 0.02325519374883324, + "grad_norm": 10.097449683599512, + "learning_rate": 4.64818763326226e-07, + "loss": -0.7045, + "step": 218 + }, + { + "epoch": 0.023468544150198684, + "grad_norm": 15.715275415653137, + "learning_rate": 4.690831556503198e-07, + "loss": -0.0658, + "step": 220 + }, + { + "epoch": 0.023681894551564126, + "grad_norm": 26.933594441832767, + "learning_rate": 4.733475479744136e-07, + "loss": -0.1298, + "step": 222 + }, + { + "epoch": 0.023895244952929568, + "grad_norm": 8.712912536861461, + "learning_rate": 4.776119402985074e-07, + "loss": 0.095, + "step": 224 + }, + { + "epoch": 0.02410859535429501, + "grad_norm": 21.120479987680937, + "learning_rate": 4.818763326226012e-07, + "loss": -0.0818, + "step": 226 + }, + { + "epoch": 0.024321945755660452, + "grad_norm": 10.071445207074044, + "learning_rate": 4.861407249466951e-07, + "loss": -0.1419, + "step": 228 + }, + { + "epoch": 0.024535296157025894, + "grad_norm": 10.801668877306875, + "learning_rate": 4.904051172707888e-07, + "loss": -0.3432, + "step": 230 + }, + { + "epoch": 0.02474864655839134, + "grad_norm": 13.692305863205236, + "learning_rate": 4.946695095948827e-07, + "loss": -0.1519, + "step": 232 + }, + { + "epoch": 0.02496199695975678, + "grad_norm": 22.11353305341118, + "learning_rate": 4.989339019189765e-07, + "loss": -0.0445, + "step": 234 + }, + { + "epoch": 0.025175347361122223, + "grad_norm": 25.883156385523233, + "learning_rate": 5.031982942430704e-07, + "loss": -0.4949, + "step": 236 + }, + { + "epoch": 0.025388697762487665, + "grad_norm": 61.080118987133744, + "learning_rate": 5.074626865671642e-07, + "loss": -1.1921, + "step": 238 + }, + { + "epoch": 0.025602048163853107, + "grad_norm": 9.282902236974593, + "learning_rate": 5.11727078891258e-07, + "loss": 0.1989, + "step": 240 + }, + { + "epoch": 0.025815398565218552, + "grad_norm": 9.781179043931736, + "learning_rate": 5.159914712153518e-07, + "loss": 0.9967, + "step": 242 + }, + { + "epoch": 0.026028748966583994, + "grad_norm": 12.216217874660604, + "learning_rate": 5.202558635394456e-07, + "loss": -0.3518, + "step": 244 + }, + { + "epoch": 0.026242099367949436, + "grad_norm": 24.42587736751593, + "learning_rate": 5.245202558635394e-07, + "loss": -1.0814, + "step": 246 + }, + { + "epoch": 0.026455449769314878, + "grad_norm": 38.12352896449889, + "learning_rate": 5.287846481876333e-07, + "loss": 0.0811, + "step": 248 + }, + { + "epoch": 0.02666880017068032, + "grad_norm": 24.52163737825959, + "learning_rate": 5.33049040511727e-07, + "loss": -0.8824, + "step": 250 + }, + { + "epoch": 0.026882150572045762, + "grad_norm": 19.231993611076483, + "learning_rate": 5.373134328358208e-07, + "loss": 0.5571, + "step": 252 + }, + { + "epoch": 0.027095500973411207, + "grad_norm": 17.129505372417224, + "learning_rate": 5.415778251599147e-07, + "loss": -0.6865, + "step": 254 + }, + { + "epoch": 0.02730885137477665, + "grad_norm": 19.406038647540843, + "learning_rate": 5.458422174840085e-07, + "loss": -0.4991, + "step": 256 + }, + { + "epoch": 0.02752220177614209, + "grad_norm": 6.847722005977987, + "learning_rate": 5.501066098081023e-07, + "loss": 0.1979, + "step": 258 + }, + { + "epoch": 0.027735552177507533, + "grad_norm": 13.405288361475698, + "learning_rate": 5.543710021321961e-07, + "loss": -0.3812, + "step": 260 + }, + { + "epoch": 0.027948902578872975, + "grad_norm": 21.529785897303153, + "learning_rate": 5.5863539445629e-07, + "loss": -0.6946, + "step": 262 + }, + { + "epoch": 0.02816225298023842, + "grad_norm": 14.20190258955381, + "learning_rate": 5.628997867803837e-07, + "loss": -0.7182, + "step": 264 + }, + { + "epoch": 0.028375603381603862, + "grad_norm": 20.09792579726273, + "learning_rate": 5.671641791044775e-07, + "loss": -0.2583, + "step": 266 + }, + { + "epoch": 0.028588953782969304, + "grad_norm": 17.213177542585473, + "learning_rate": 5.714285714285714e-07, + "loss": 0.3218, + "step": 268 + }, + { + "epoch": 0.028802304184334746, + "grad_norm": 11.185587231889192, + "learning_rate": 5.756929637526652e-07, + "loss": -0.0663, + "step": 270 + }, + { + "epoch": 0.029015654585700188, + "grad_norm": 27.722334316990437, + "learning_rate": 5.79957356076759e-07, + "loss": 0.1254, + "step": 272 + }, + { + "epoch": 0.029229004987065633, + "grad_norm": 20.49167387185228, + "learning_rate": 5.842217484008528e-07, + "loss": -0.3077, + "step": 274 + }, + { + "epoch": 0.029442355388431075, + "grad_norm": 17.084103781873182, + "learning_rate": 5.884861407249467e-07, + "loss": -1.0397, + "step": 276 + }, + { + "epoch": 0.029655705789796517, + "grad_norm": 23.870809301137577, + "learning_rate": 5.927505330490404e-07, + "loss": -0.401, + "step": 278 + }, + { + "epoch": 0.02986905619116196, + "grad_norm": 11.313699213427757, + "learning_rate": 5.970149253731342e-07, + "loss": -1.1555, + "step": 280 + }, + { + "epoch": 0.0300824065925274, + "grad_norm": 9.331131563295791, + "learning_rate": 6.012793176972282e-07, + "loss": -0.5128, + "step": 282 + }, + { + "epoch": 0.030295756993892843, + "grad_norm": 10.71153796479133, + "learning_rate": 6.05543710021322e-07, + "loss": 0.239, + "step": 284 + }, + { + "epoch": 0.03050910739525829, + "grad_norm": 28.31484795145744, + "learning_rate": 6.098081023454158e-07, + "loss": -0.2084, + "step": 286 + }, + { + "epoch": 0.03072245779662373, + "grad_norm": 11.917796878106765, + "learning_rate": 6.140724946695096e-07, + "loss": -0.2701, + "step": 288 + }, + { + "epoch": 0.030935808197989172, + "grad_norm": 15.31345988143714, + "learning_rate": 6.183368869936035e-07, + "loss": -0.9096, + "step": 290 + }, + { + "epoch": 0.031149158599354614, + "grad_norm": 49.77491988047747, + "learning_rate": 6.226012793176972e-07, + "loss": 0.1465, + "step": 292 + }, + { + "epoch": 0.03136250900072006, + "grad_norm": 12.884205486410131, + "learning_rate": 6.26865671641791e-07, + "loss": 0.5899, + "step": 294 + }, + { + "epoch": 0.0315758594020855, + "grad_norm": 22.77696725291766, + "learning_rate": 6.311300639658849e-07, + "loss": -0.3255, + "step": 296 + }, + { + "epoch": 0.031789209803450943, + "grad_norm": 14.760034845954518, + "learning_rate": 6.353944562899787e-07, + "loss": 0.2191, + "step": 298 + }, + { + "epoch": 0.032002560204816385, + "grad_norm": 16.04873208614678, + "learning_rate": 6.396588486140725e-07, + "loss": 0.9425, + "step": 300 + }, + { + "epoch": 0.03221591060618183, + "grad_norm": 21.948692800323336, + "learning_rate": 6.439232409381663e-07, + "loss": 0.4721, + "step": 302 + }, + { + "epoch": 0.03242926100754727, + "grad_norm": 17.475770533963207, + "learning_rate": 6.481876332622602e-07, + "loss": -0.8516, + "step": 304 + }, + { + "epoch": 0.03264261140891271, + "grad_norm": 21.938554689368157, + "learning_rate": 6.524520255863539e-07, + "loss": 0.1675, + "step": 306 + }, + { + "epoch": 0.03285596181027815, + "grad_norm": 46.95903237109459, + "learning_rate": 6.567164179104477e-07, + "loss": 0.7013, + "step": 308 + }, + { + "epoch": 0.033069312211643595, + "grad_norm": 21.7738424183671, + "learning_rate": 6.609808102345416e-07, + "loss": -0.8641, + "step": 310 + }, + { + "epoch": 0.033282662613009044, + "grad_norm": 20.818736283324778, + "learning_rate": 6.652452025586353e-07, + "loss": -0.2203, + "step": 312 + }, + { + "epoch": 0.033496013014374486, + "grad_norm": 12.191027612540129, + "learning_rate": 6.695095948827292e-07, + "loss": -0.2385, + "step": 314 + }, + { + "epoch": 0.03370936341573993, + "grad_norm": 17.485900290878302, + "learning_rate": 6.73773987206823e-07, + "loss": -0.6482, + "step": 316 + }, + { + "epoch": 0.03392271381710537, + "grad_norm": 12.365985112237825, + "learning_rate": 6.780383795309168e-07, + "loss": -0.0618, + "step": 318 + }, + { + "epoch": 0.03413606421847081, + "grad_norm": 19.889441682727426, + "learning_rate": 6.823027718550106e-07, + "loss": 0.255, + "step": 320 + }, + { + "epoch": 0.034349414619836253, + "grad_norm": 19.5015710993416, + "learning_rate": 6.865671641791044e-07, + "loss": -0.6494, + "step": 322 + }, + { + "epoch": 0.034562765021201695, + "grad_norm": 21.45614945706602, + "learning_rate": 6.908315565031983e-07, + "loss": -0.4695, + "step": 324 + }, + { + "epoch": 0.03477611542256714, + "grad_norm": 11.714892632525226, + "learning_rate": 6.95095948827292e-07, + "loss": -0.6134, + "step": 326 + }, + { + "epoch": 0.03498946582393258, + "grad_norm": 17.08341230963109, + "learning_rate": 6.993603411513859e-07, + "loss": -0.9212, + "step": 328 + }, + { + "epoch": 0.03520281622529802, + "grad_norm": 18.71372293407707, + "learning_rate": 7.036247334754797e-07, + "loss": -0.1783, + "step": 330 + }, + { + "epoch": 0.03541616662666346, + "grad_norm": 40.394000763247476, + "learning_rate": 7.078891257995734e-07, + "loss": -0.1696, + "step": 332 + }, + { + "epoch": 0.03562951702802891, + "grad_norm": 16.851717713795054, + "learning_rate": 7.121535181236673e-07, + "loss": -0.168, + "step": 334 + }, + { + "epoch": 0.035842867429394354, + "grad_norm": 20.980121165459167, + "learning_rate": 7.164179104477611e-07, + "loss": -0.1898, + "step": 336 + }, + { + "epoch": 0.036056217830759796, + "grad_norm": 5.128039309618372, + "learning_rate": 7.20682302771855e-07, + "loss": -0.3406, + "step": 338 + }, + { + "epoch": 0.03626956823212524, + "grad_norm": 17.130506401349365, + "learning_rate": 7.249466950959487e-07, + "loss": -0.2751, + "step": 340 + }, + { + "epoch": 0.03648291863349068, + "grad_norm": 17.383816800029777, + "learning_rate": 7.292110874200426e-07, + "loss": -0.0005, + "step": 342 + }, + { + "epoch": 0.03669626903485612, + "grad_norm": 43.198248806456, + "learning_rate": 7.334754797441365e-07, + "loss": -0.2181, + "step": 344 + }, + { + "epoch": 0.036909619436221563, + "grad_norm": 16.46060851996884, + "learning_rate": 7.377398720682303e-07, + "loss": 0.2075, + "step": 346 + }, + { + "epoch": 0.037122969837587005, + "grad_norm": 12.986350336767204, + "learning_rate": 7.420042643923241e-07, + "loss": -0.5395, + "step": 348 + }, + { + "epoch": 0.03733632023895245, + "grad_norm": 19.883036986347154, + "learning_rate": 7.462686567164179e-07, + "loss": -0.047, + "step": 350 + }, + { + "epoch": 0.03754967064031789, + "grad_norm": 8.289068924690984, + "learning_rate": 7.505330490405118e-07, + "loss": 0.9832, + "step": 352 + }, + { + "epoch": 0.03776302104168334, + "grad_norm": 10.173680365988295, + "learning_rate": 7.547974413646055e-07, + "loss": -0.4718, + "step": 354 + }, + { + "epoch": 0.03797637144304878, + "grad_norm": 29.406476571062164, + "learning_rate": 7.590618336886994e-07, + "loss": -0.3167, + "step": 356 + }, + { + "epoch": 0.03818972184441422, + "grad_norm": 27.934726009197963, + "learning_rate": 7.633262260127932e-07, + "loss": -0.1972, + "step": 358 + }, + { + "epoch": 0.038403072245779664, + "grad_norm": 25.076744001467002, + "learning_rate": 7.675906183368869e-07, + "loss": 0.2433, + "step": 360 + }, + { + "epoch": 0.038616422647145106, + "grad_norm": 17.034753146666695, + "learning_rate": 7.718550106609808e-07, + "loss": -0.6862, + "step": 362 + }, + { + "epoch": 0.03882977304851055, + "grad_norm": 30.28266067136412, + "learning_rate": 7.761194029850746e-07, + "loss": 0.0555, + "step": 364 + }, + { + "epoch": 0.03904312344987599, + "grad_norm": 15.433190582955902, + "learning_rate": 7.803837953091685e-07, + "loss": -0.6999, + "step": 366 + }, + { + "epoch": 0.03925647385124143, + "grad_norm": 16.705763291209895, + "learning_rate": 7.846481876332622e-07, + "loss": 0.145, + "step": 368 + }, + { + "epoch": 0.039469824252606873, + "grad_norm": 7.207700811021795, + "learning_rate": 7.889125799573561e-07, + "loss": 0.4911, + "step": 370 + }, + { + "epoch": 0.039683174653972315, + "grad_norm": 12.58245126026889, + "learning_rate": 7.931769722814499e-07, + "loss": -0.4105, + "step": 372 + }, + { + "epoch": 0.03989652505533776, + "grad_norm": 9.727684047283205, + "learning_rate": 7.974413646055436e-07, + "loss": -0.4456, + "step": 374 + }, + { + "epoch": 0.040109875456703206, + "grad_norm": 37.82883818815438, + "learning_rate": 8.017057569296375e-07, + "loss": -1.003, + "step": 376 + }, + { + "epoch": 0.04032322585806865, + "grad_norm": 9.205831543930358, + "learning_rate": 8.059701492537313e-07, + "loss": -0.9034, + "step": 378 + }, + { + "epoch": 0.04053657625943409, + "grad_norm": 17.5439627015, + "learning_rate": 8.102345415778252e-07, + "loss": -0.3568, + "step": 380 + }, + { + "epoch": 0.04074992666079953, + "grad_norm": 15.109693571978122, + "learning_rate": 8.144989339019189e-07, + "loss": -0.3776, + "step": 382 + }, + { + "epoch": 0.040963277062164974, + "grad_norm": 10.552742457862886, + "learning_rate": 8.187633262260127e-07, + "loss": 0.2015, + "step": 384 + }, + { + "epoch": 0.041176627463530416, + "grad_norm": 19.08793342231466, + "learning_rate": 8.230277185501066e-07, + "loss": -1.4421, + "step": 386 + }, + { + "epoch": 0.04138997786489586, + "grad_norm": 34.056443898662884, + "learning_rate": 8.272921108742003e-07, + "loss": -0.197, + "step": 388 + }, + { + "epoch": 0.0416033282662613, + "grad_norm": 7.384540107176898, + "learning_rate": 8.315565031982942e-07, + "loss": 0.7544, + "step": 390 + }, + { + "epoch": 0.04181667866762674, + "grad_norm": 23.683836639585067, + "learning_rate": 8.35820895522388e-07, + "loss": -0.3735, + "step": 392 + }, + { + "epoch": 0.042030029068992183, + "grad_norm": 19.629725497894157, + "learning_rate": 8.400852878464819e-07, + "loss": -0.6826, + "step": 394 + }, + { + "epoch": 0.042243379470357625, + "grad_norm": 13.85103939830437, + "learning_rate": 8.443496801705756e-07, + "loss": -0.2708, + "step": 396 + }, + { + "epoch": 0.042456729871723074, + "grad_norm": 19.710815160612455, + "learning_rate": 8.486140724946694e-07, + "loss": 0.0877, + "step": 398 + }, + { + "epoch": 0.042670080273088516, + "grad_norm": 4.706110486614041, + "learning_rate": 8.528784648187633e-07, + "loss": 0.1916, + "step": 400 + }, + { + "epoch": 0.04288343067445396, + "grad_norm": 12.864617887211866, + "learning_rate": 8.57142857142857e-07, + "loss": 0.0179, + "step": 402 + }, + { + "epoch": 0.0430967810758194, + "grad_norm": 9.560466147390757, + "learning_rate": 8.614072494669509e-07, + "loss": -0.0813, + "step": 404 + }, + { + "epoch": 0.04331013147718484, + "grad_norm": 21.130991357317583, + "learning_rate": 8.656716417910447e-07, + "loss": -1.1384, + "step": 406 + }, + { + "epoch": 0.043523481878550284, + "grad_norm": 19.392285709372985, + "learning_rate": 8.699360341151387e-07, + "loss": 0.1794, + "step": 408 + }, + { + "epoch": 0.043736832279915726, + "grad_norm": 9.266238784116867, + "learning_rate": 8.742004264392324e-07, + "loss": -1.768, + "step": 410 + }, + { + "epoch": 0.04395018268128117, + "grad_norm": 40.16756332202219, + "learning_rate": 8.784648187633262e-07, + "loss": -0.8232, + "step": 412 + }, + { + "epoch": 0.04416353308264661, + "grad_norm": 39.24330802424692, + "learning_rate": 8.827292110874201e-07, + "loss": -1.6178, + "step": 414 + }, + { + "epoch": 0.04437688348401205, + "grad_norm": 11.88082478788392, + "learning_rate": 8.869936034115138e-07, + "loss": -0.3053, + "step": 416 + }, + { + "epoch": 0.044590233885377493, + "grad_norm": 6.357526673930185, + "learning_rate": 8.912579957356077e-07, + "loss": -1.0471, + "step": 418 + }, + { + "epoch": 0.04480358428674294, + "grad_norm": 16.303453044925423, + "learning_rate": 8.955223880597015e-07, + "loss": -0.1262, + "step": 420 + }, + { + "epoch": 0.045016934688108384, + "grad_norm": 14.6518934802552, + "learning_rate": 8.997867803837953e-07, + "loss": -0.2323, + "step": 422 + }, + { + "epoch": 0.045230285089473826, + "grad_norm": 8.02837999022306, + "learning_rate": 9.040511727078891e-07, + "loss": 0.1657, + "step": 424 + }, + { + "epoch": 0.04544363549083927, + "grad_norm": 15.64582877736265, + "learning_rate": 9.083155650319829e-07, + "loss": -0.1882, + "step": 426 + }, + { + "epoch": 0.04565698589220471, + "grad_norm": 16.363233536467853, + "learning_rate": 9.125799573560768e-07, + "loss": -0.4399, + "step": 428 + }, + { + "epoch": 0.04587033629357015, + "grad_norm": 15.548489913061752, + "learning_rate": 9.168443496801705e-07, + "loss": 0.3602, + "step": 430 + }, + { + "epoch": 0.046083686694935594, + "grad_norm": 7.007708536573836, + "learning_rate": 9.211087420042644e-07, + "loss": -0.2646, + "step": 432 + }, + { + "epoch": 0.046297037096301036, + "grad_norm": 13.687256516704583, + "learning_rate": 9.253731343283582e-07, + "loss": -1.0053, + "step": 434 + }, + { + "epoch": 0.04651038749766648, + "grad_norm": 30.306488264414572, + "learning_rate": 9.29637526652452e-07, + "loss": -0.3203, + "step": 436 + }, + { + "epoch": 0.04672373789903192, + "grad_norm": 26.094659474249983, + "learning_rate": 9.339019189765458e-07, + "loss": -0.6359, + "step": 438 + }, + { + "epoch": 0.04693708830039737, + "grad_norm": 13.796812579064204, + "learning_rate": 9.381663113006396e-07, + "loss": -0.3471, + "step": 440 + }, + { + "epoch": 0.04715043870176281, + "grad_norm": 14.69389520446514, + "learning_rate": 9.424307036247334e-07, + "loss": -0.8145, + "step": 442 + }, + { + "epoch": 0.04736378910312825, + "grad_norm": 16.918575972701632, + "learning_rate": 9.466950959488272e-07, + "loss": 0.2185, + "step": 444 + }, + { + "epoch": 0.047577139504493694, + "grad_norm": 10.422480971977198, + "learning_rate": 9.509594882729211e-07, + "loss": -0.4649, + "step": 446 + }, + { + "epoch": 0.047790489905859136, + "grad_norm": 7.03443725132983, + "learning_rate": 9.552238805970149e-07, + "loss": -0.3627, + "step": 448 + }, + { + "epoch": 0.04800384030722458, + "grad_norm": 18.359610314742397, + "learning_rate": 9.594882729211086e-07, + "loss": -0.9404, + "step": 450 + }, + { + "epoch": 0.04821719070859002, + "grad_norm": 8.277072501759134, + "learning_rate": 9.637526652452024e-07, + "loss": 0.2409, + "step": 452 + }, + { + "epoch": 0.04843054110995546, + "grad_norm": 13.071634752922243, + "learning_rate": 9.680170575692964e-07, + "loss": 0.3506, + "step": 454 + }, + { + "epoch": 0.048643891511320904, + "grad_norm": 36.801112890968085, + "learning_rate": 9.722814498933901e-07, + "loss": -0.3961, + "step": 456 + }, + { + "epoch": 0.048857241912686346, + "grad_norm": 40.38991240984631, + "learning_rate": 9.76545842217484e-07, + "loss": -0.3497, + "step": 458 + }, + { + "epoch": 0.04907059231405179, + "grad_norm": 8.414591188760289, + "learning_rate": 9.808102345415777e-07, + "loss": -0.5775, + "step": 460 + }, + { + "epoch": 0.04928394271541724, + "grad_norm": 8.76108225719733, + "learning_rate": 9.850746268656714e-07, + "loss": 0.0839, + "step": 462 + }, + { + "epoch": 0.04949729311678268, + "grad_norm": 9.275399140799998, + "learning_rate": 9.893390191897654e-07, + "loss": 0.0137, + "step": 464 + }, + { + "epoch": 0.04971064351814812, + "grad_norm": 9.832501914571974, + "learning_rate": 9.936034115138592e-07, + "loss": 0.1014, + "step": 466 + }, + { + "epoch": 0.04992399391951356, + "grad_norm": 19.204369086700787, + "learning_rate": 9.97867803837953e-07, + "loss": -0.31, + "step": 468 + }, + { + "epoch": 0.050137344320879004, + "grad_norm": 6.109980991190154, + "learning_rate": 1.0021321961620467e-06, + "loss": -0.26, + "step": 470 + }, + { + "epoch": 0.050350694722244446, + "grad_norm": 31.865612558858906, + "learning_rate": 1.0063965884861407e-06, + "loss": -0.1208, + "step": 472 + }, + { + "epoch": 0.05056404512360989, + "grad_norm": 10.044184014650897, + "learning_rate": 1.0106609808102345e-06, + "loss": -0.2205, + "step": 474 + }, + { + "epoch": 0.05077739552497533, + "grad_norm": 21.875745094199374, + "learning_rate": 1.0149253731343285e-06, + "loss": -0.0812, + "step": 476 + }, + { + "epoch": 0.05099074592634077, + "grad_norm": 11.995975513229988, + "learning_rate": 1.019189765458422e-06, + "loss": -1.7783, + "step": 478 + }, + { + "epoch": 0.051204096327706214, + "grad_norm": 7.696707350691998, + "learning_rate": 1.023454157782516e-06, + "loss": 0.2012, + "step": 480 + }, + { + "epoch": 0.051417446729071656, + "grad_norm": 72.33916217166278, + "learning_rate": 1.0277185501066098e-06, + "loss": 0.0427, + "step": 482 + }, + { + "epoch": 0.051630797130437105, + "grad_norm": 13.569652394718384, + "learning_rate": 1.0319829424307035e-06, + "loss": 0.0953, + "step": 484 + }, + { + "epoch": 0.05184414753180255, + "grad_norm": 6.640484086255635, + "learning_rate": 1.0362473347547973e-06, + "loss": -1.0771, + "step": 486 + }, + { + "epoch": 0.05205749793316799, + "grad_norm": 16.284392321889953, + "learning_rate": 1.0405117270788913e-06, + "loss": -0.661, + "step": 488 + }, + { + "epoch": 0.05227084833453343, + "grad_norm": 10.396345506952937, + "learning_rate": 1.0447761194029848e-06, + "loss": -0.3095, + "step": 490 + }, + { + "epoch": 0.05248419873589887, + "grad_norm": 12.867700984841619, + "learning_rate": 1.0490405117270788e-06, + "loss": 0.4289, + "step": 492 + }, + { + "epoch": 0.052697549137264314, + "grad_norm": 18.31825168601856, + "learning_rate": 1.0533049040511726e-06, + "loss": 0.1729, + "step": 494 + }, + { + "epoch": 0.052910899538629756, + "grad_norm": 17.248690968213506, + "learning_rate": 1.0575692963752666e-06, + "loss": -1.1607, + "step": 496 + }, + { + "epoch": 0.0531242499399952, + "grad_norm": 8.51928074224459, + "learning_rate": 1.0618336886993601e-06, + "loss": -0.3654, + "step": 498 + }, + { + "epoch": 0.05333760034136064, + "grad_norm": 20.639043194792887, + "learning_rate": 1.066098081023454e-06, + "loss": -0.4893, + "step": 500 + }, + { + "epoch": 0.05355095074272608, + "grad_norm": 14.990753207547238, + "learning_rate": 1.070362473347548e-06, + "loss": -0.468, + "step": 502 + }, + { + "epoch": 0.053764301144091524, + "grad_norm": 24.695774003905925, + "learning_rate": 1.0746268656716416e-06, + "loss": -1.2465, + "step": 504 + }, + { + "epoch": 0.05397765154545697, + "grad_norm": 17.81220746000941, + "learning_rate": 1.0788912579957356e-06, + "loss": -0.232, + "step": 506 + }, + { + "epoch": 0.054191001946822415, + "grad_norm": 7.6840011548352924, + "learning_rate": 1.0831556503198294e-06, + "loss": 0.1394, + "step": 508 + }, + { + "epoch": 0.05440435234818786, + "grad_norm": 17.135280175028917, + "learning_rate": 1.0874200426439234e-06, + "loss": -0.1113, + "step": 510 + }, + { + "epoch": 0.0546177027495533, + "grad_norm": 21.577778290948245, + "learning_rate": 1.091684434968017e-06, + "loss": 0.1389, + "step": 512 + }, + { + "epoch": 0.05483105315091874, + "grad_norm": 26.22719409585662, + "learning_rate": 1.095948827292111e-06, + "loss": -0.1216, + "step": 514 + }, + { + "epoch": 0.05504440355228418, + "grad_norm": 8.763214398721058, + "learning_rate": 1.1002132196162047e-06, + "loss": -0.6157, + "step": 516 + }, + { + "epoch": 0.055257753953649624, + "grad_norm": 14.925448860683707, + "learning_rate": 1.1044776119402984e-06, + "loss": 0.0356, + "step": 518 + }, + { + "epoch": 0.055471104355015066, + "grad_norm": 14.365407282055516, + "learning_rate": 1.1087420042643922e-06, + "loss": -0.0624, + "step": 520 + }, + { + "epoch": 0.05568445475638051, + "grad_norm": 8.802366004297667, + "learning_rate": 1.1130063965884862e-06, + "loss": 0.5658, + "step": 522 + }, + { + "epoch": 0.05589780515774595, + "grad_norm": 24.251203850442998, + "learning_rate": 1.11727078891258e-06, + "loss": -0.0644, + "step": 524 + }, + { + "epoch": 0.0561111555591114, + "grad_norm": 16.632568797956132, + "learning_rate": 1.1215351812366737e-06, + "loss": -1.0315, + "step": 526 + }, + { + "epoch": 0.05632450596047684, + "grad_norm": 15.777059103835, + "learning_rate": 1.1257995735607675e-06, + "loss": 0.6824, + "step": 528 + }, + { + "epoch": 0.05653785636184228, + "grad_norm": 6.521931225844872, + "learning_rate": 1.1300639658848615e-06, + "loss": -0.2547, + "step": 530 + }, + { + "epoch": 0.056751206763207725, + "grad_norm": 16.18907262330339, + "learning_rate": 1.134328358208955e-06, + "loss": -0.714, + "step": 532 + }, + { + "epoch": 0.05696455716457317, + "grad_norm": 17.05716262320914, + "learning_rate": 1.138592750533049e-06, + "loss": -1.1937, + "step": 534 + }, + { + "epoch": 0.05717790756593861, + "grad_norm": 8.560189061728243, + "learning_rate": 1.1428571428571428e-06, + "loss": -0.3703, + "step": 536 + }, + { + "epoch": 0.05739125796730405, + "grad_norm": 11.56228500281614, + "learning_rate": 1.1471215351812368e-06, + "loss": -0.8073, + "step": 538 + }, + { + "epoch": 0.05760460836866949, + "grad_norm": 13.24672543046646, + "learning_rate": 1.1513859275053303e-06, + "loss": -0.3222, + "step": 540 + }, + { + "epoch": 0.057817958770034934, + "grad_norm": 14.035481073008786, + "learning_rate": 1.1556503198294243e-06, + "loss": 0.286, + "step": 542 + }, + { + "epoch": 0.058031309171400376, + "grad_norm": 28.95516126438781, + "learning_rate": 1.159914712153518e-06, + "loss": -1.189, + "step": 544 + }, + { + "epoch": 0.05824465957276582, + "grad_norm": 20.850060598425877, + "learning_rate": 1.1641791044776118e-06, + "loss": -0.6204, + "step": 546 + }, + { + "epoch": 0.05845800997413127, + "grad_norm": 21.211499139014034, + "learning_rate": 1.1684434968017056e-06, + "loss": -0.5428, + "step": 548 + }, + { + "epoch": 0.05867136037549671, + "grad_norm": 16.101447224508938, + "learning_rate": 1.1727078891257996e-06, + "loss": -0.7822, + "step": 550 + }, + { + "epoch": 0.05888471077686215, + "grad_norm": 60.24925771229807, + "learning_rate": 1.1769722814498933e-06, + "loss": -0.5338, + "step": 552 + }, + { + "epoch": 0.05909806117822759, + "grad_norm": 10.911944544305852, + "learning_rate": 1.1812366737739871e-06, + "loss": -0.1546, + "step": 554 + }, + { + "epoch": 0.059311411579593035, + "grad_norm": 20.123496073532923, + "learning_rate": 1.1855010660980809e-06, + "loss": 0.1078, + "step": 556 + }, + { + "epoch": 0.05952476198095848, + "grad_norm": 8.077431875506283, + "learning_rate": 1.1897654584221749e-06, + "loss": 0.6775, + "step": 558 + }, + { + "epoch": 0.05973811238232392, + "grad_norm": 20.69430693950377, + "learning_rate": 1.1940298507462684e-06, + "loss": -0.1531, + "step": 560 + }, + { + "epoch": 0.05995146278368936, + "grad_norm": 9.149229771649699, + "learning_rate": 1.1982942430703624e-06, + "loss": 0.2026, + "step": 562 + }, + { + "epoch": 0.0601648131850548, + "grad_norm": 35.409517307149535, + "learning_rate": 1.2025586353944564e-06, + "loss": 0.2081, + "step": 564 + }, + { + "epoch": 0.060378163586420244, + "grad_norm": 18.732961050530974, + "learning_rate": 1.2068230277185501e-06, + "loss": -0.4564, + "step": 566 + }, + { + "epoch": 0.060591513987785686, + "grad_norm": 18.17521906003365, + "learning_rate": 1.211087420042644e-06, + "loss": -0.4694, + "step": 568 + }, + { + "epoch": 0.060804864389151135, + "grad_norm": 15.950240307469345, + "learning_rate": 1.2153518123667377e-06, + "loss": -0.7607, + "step": 570 + }, + { + "epoch": 0.06101821479051658, + "grad_norm": 27.667171128676177, + "learning_rate": 1.2196162046908317e-06, + "loss": -0.4962, + "step": 572 + }, + { + "epoch": 0.06123156519188202, + "grad_norm": 6.875981558346762, + "learning_rate": 1.2238805970149252e-06, + "loss": 0.1071, + "step": 574 + }, + { + "epoch": 0.06144491559324746, + "grad_norm": 23.330801092034456, + "learning_rate": 1.2281449893390192e-06, + "loss": -1.1081, + "step": 576 + }, + { + "epoch": 0.0616582659946129, + "grad_norm": 18.097087985455573, + "learning_rate": 1.232409381663113e-06, + "loss": 0.292, + "step": 578 + }, + { + "epoch": 0.061871616395978345, + "grad_norm": 44.96075507661794, + "learning_rate": 1.236673773987207e-06, + "loss": -1.8344, + "step": 580 + }, + { + "epoch": 0.06208496679734379, + "grad_norm": 11.33849063045387, + "learning_rate": 1.2409381663113005e-06, + "loss": -0.5339, + "step": 582 + }, + { + "epoch": 0.06229831719870923, + "grad_norm": 25.655929914012017, + "learning_rate": 1.2452025586353945e-06, + "loss": 0.2773, + "step": 584 + }, + { + "epoch": 0.06251166760007468, + "grad_norm": 19.518509371933693, + "learning_rate": 1.2494669509594882e-06, + "loss": -0.3635, + "step": 586 + }, + { + "epoch": 0.06272501800144012, + "grad_norm": 19.337178836658445, + "learning_rate": 1.253731343283582e-06, + "loss": -0.4397, + "step": 588 + }, + { + "epoch": 0.06293836840280556, + "grad_norm": 15.629253302757428, + "learning_rate": 1.2579957356076758e-06, + "loss": -0.3098, + "step": 590 + }, + { + "epoch": 0.063151718804171, + "grad_norm": 35.95259160255183, + "learning_rate": 1.2622601279317698e-06, + "loss": -0.9708, + "step": 592 + }, + { + "epoch": 0.06336506920553645, + "grad_norm": 27.574715730226895, + "learning_rate": 1.2665245202558633e-06, + "loss": -0.5131, + "step": 594 + }, + { + "epoch": 0.06357841960690189, + "grad_norm": 47.74900017442824, + "learning_rate": 1.2707889125799573e-06, + "loss": 0.3332, + "step": 596 + }, + { + "epoch": 0.06379177000826733, + "grad_norm": 20.309365479498744, + "learning_rate": 1.275053304904051e-06, + "loss": -1.2309, + "step": 598 + }, + { + "epoch": 0.06400512040963277, + "grad_norm": 11.40208623029672, + "learning_rate": 1.279317697228145e-06, + "loss": -0.5857, + "step": 600 + }, + { + "epoch": 0.06421847081099821, + "grad_norm": 11.402181219311933, + "learning_rate": 1.2835820895522386e-06, + "loss": -0.3416, + "step": 602 + }, + { + "epoch": 0.06443182121236365, + "grad_norm": 15.010371839635921, + "learning_rate": 1.2878464818763326e-06, + "loss": -0.4071, + "step": 604 + }, + { + "epoch": 0.0646451716137291, + "grad_norm": 9.99633951597125, + "learning_rate": 1.2921108742004264e-06, + "loss": -1.1453, + "step": 606 + }, + { + "epoch": 0.06485852201509454, + "grad_norm": 21.31739921019411, + "learning_rate": 1.2963752665245203e-06, + "loss": 0.3107, + "step": 608 + }, + { + "epoch": 0.06507187241645998, + "grad_norm": 16.7015124046411, + "learning_rate": 1.3006396588486139e-06, + "loss": -0.9792, + "step": 610 + }, + { + "epoch": 0.06528522281782542, + "grad_norm": 17.983069493288237, + "learning_rate": 1.3049040511727079e-06, + "loss": -0.3921, + "step": 612 + }, + { + "epoch": 0.06549857321919086, + "grad_norm": 10.553594812872817, + "learning_rate": 1.3091684434968016e-06, + "loss": -1.2914, + "step": 614 + }, + { + "epoch": 0.0657119236205563, + "grad_norm": 14.041836178013272, + "learning_rate": 1.3134328358208954e-06, + "loss": -0.3353, + "step": 616 + }, + { + "epoch": 0.06592527402192175, + "grad_norm": 15.639430537257736, + "learning_rate": 1.3176972281449892e-06, + "loss": -0.4705, + "step": 618 + }, + { + "epoch": 0.06613862442328719, + "grad_norm": 12.786525687977138, + "learning_rate": 1.3219616204690832e-06, + "loss": -0.0215, + "step": 620 + }, + { + "epoch": 0.06635197482465263, + "grad_norm": 9.934313047441512, + "learning_rate": 1.3262260127931767e-06, + "loss": 0.6287, + "step": 622 + }, + { + "epoch": 0.06656532522601809, + "grad_norm": 9.779454896556423, + "learning_rate": 1.3304904051172707e-06, + "loss": -0.0751, + "step": 624 + }, + { + "epoch": 0.06677867562738353, + "grad_norm": 9.494698563936842, + "learning_rate": 1.3347547974413647e-06, + "loss": 0.1564, + "step": 626 + }, + { + "epoch": 0.06699202602874897, + "grad_norm": 10.740511386568414, + "learning_rate": 1.3390191897654584e-06, + "loss": -0.7072, + "step": 628 + }, + { + "epoch": 0.06720537643011441, + "grad_norm": 5.704581178120186, + "learning_rate": 1.3432835820895522e-06, + "loss": -0.6728, + "step": 630 + }, + { + "epoch": 0.06741872683147986, + "grad_norm": 22.298716006252704, + "learning_rate": 1.347547974413646e-06, + "loss": -0.7213, + "step": 632 + }, + { + "epoch": 0.0676320772328453, + "grad_norm": 15.330297898109906, + "learning_rate": 1.35181236673774e-06, + "loss": 0.2685, + "step": 634 + }, + { + "epoch": 0.06784542763421074, + "grad_norm": 22.69580065155436, + "learning_rate": 1.3560767590618335e-06, + "loss": 0.3992, + "step": 636 + }, + { + "epoch": 0.06805877803557618, + "grad_norm": 28.833859562335263, + "learning_rate": 1.3603411513859275e-06, + "loss": -0.7339, + "step": 638 + }, + { + "epoch": 0.06827212843694162, + "grad_norm": 11.283562143626995, + "learning_rate": 1.3646055437100213e-06, + "loss": -0.7638, + "step": 640 + }, + { + "epoch": 0.06848547883830707, + "grad_norm": 10.725377082827144, + "learning_rate": 1.3688699360341152e-06, + "loss": -0.8662, + "step": 642 + }, + { + "epoch": 0.06869882923967251, + "grad_norm": 26.31029696416489, + "learning_rate": 1.3731343283582088e-06, + "loss": -0.7486, + "step": 644 + }, + { + "epoch": 0.06891217964103795, + "grad_norm": 18.040887121015924, + "learning_rate": 1.3773987206823028e-06, + "loss": 0.3118, + "step": 646 + }, + { + "epoch": 0.06912553004240339, + "grad_norm": 10.96637271492357, + "learning_rate": 1.3816631130063965e-06, + "loss": -0.5209, + "step": 648 + }, + { + "epoch": 0.06933888044376883, + "grad_norm": 8.06597559224624, + "learning_rate": 1.3859275053304903e-06, + "loss": -0.4205, + "step": 650 + }, + { + "epoch": 0.06955223084513427, + "grad_norm": 17.471203906083133, + "learning_rate": 1.390191897654584e-06, + "loss": -0.5424, + "step": 652 + }, + { + "epoch": 0.06976558124649972, + "grad_norm": 11.059775225202081, + "learning_rate": 1.394456289978678e-06, + "loss": -1.2152, + "step": 654 + }, + { + "epoch": 0.06997893164786516, + "grad_norm": 8.31714286189907, + "learning_rate": 1.3987206823027718e-06, + "loss": -0.811, + "step": 656 + }, + { + "epoch": 0.0701922820492306, + "grad_norm": 28.554761860008615, + "learning_rate": 1.4029850746268656e-06, + "loss": -1.0846, + "step": 658 + }, + { + "epoch": 0.07040563245059604, + "grad_norm": 16.176945715631096, + "learning_rate": 1.4072494669509594e-06, + "loss": -0.5155, + "step": 660 + }, + { + "epoch": 0.07061898285196148, + "grad_norm": 21.625404385138676, + "learning_rate": 1.4115138592750533e-06, + "loss": -0.9076, + "step": 662 + }, + { + "epoch": 0.07083233325332693, + "grad_norm": 5.4202622935954405, + "learning_rate": 1.415778251599147e-06, + "loss": 0.2342, + "step": 664 + }, + { + "epoch": 0.07104568365469238, + "grad_norm": 17.194951603060776, + "learning_rate": 1.4200426439232409e-06, + "loss": 0.0548, + "step": 666 + }, + { + "epoch": 0.07125903405605782, + "grad_norm": 11.512096288311646, + "learning_rate": 1.4243070362473346e-06, + "loss": -0.886, + "step": 668 + }, + { + "epoch": 0.07147238445742327, + "grad_norm": 16.05900514880833, + "learning_rate": 1.4285714285714286e-06, + "loss": -0.1307, + "step": 670 + }, + { + "epoch": 0.07168573485878871, + "grad_norm": 9.185127296516999, + "learning_rate": 1.4328358208955222e-06, + "loss": -0.1077, + "step": 672 + }, + { + "epoch": 0.07189908526015415, + "grad_norm": 7.334484358389449, + "learning_rate": 1.4371002132196162e-06, + "loss": 0.1615, + "step": 674 + }, + { + "epoch": 0.07211243566151959, + "grad_norm": 18.964205902532676, + "learning_rate": 1.44136460554371e-06, + "loss": -1.4788, + "step": 676 + }, + { + "epoch": 0.07232578606288503, + "grad_norm": 17.11189372092431, + "learning_rate": 1.4456289978678037e-06, + "loss": 0.0577, + "step": 678 + }, + { + "epoch": 0.07253913646425048, + "grad_norm": 10.275590248100219, + "learning_rate": 1.4498933901918975e-06, + "loss": -0.0749, + "step": 680 + }, + { + "epoch": 0.07275248686561592, + "grad_norm": 25.814533145099567, + "learning_rate": 1.4541577825159914e-06, + "loss": 0.4326, + "step": 682 + }, + { + "epoch": 0.07296583726698136, + "grad_norm": 6.539318927578008, + "learning_rate": 1.4584221748400852e-06, + "loss": 0.0608, + "step": 684 + }, + { + "epoch": 0.0731791876683468, + "grad_norm": 27.253684296985305, + "learning_rate": 1.462686567164179e-06, + "loss": 0.1022, + "step": 686 + }, + { + "epoch": 0.07339253806971224, + "grad_norm": 12.415981292555502, + "learning_rate": 1.466950959488273e-06, + "loss": -0.3008, + "step": 688 + }, + { + "epoch": 0.07360588847107769, + "grad_norm": 21.077155825145116, + "learning_rate": 1.4712153518123667e-06, + "loss": -0.4936, + "step": 690 + }, + { + "epoch": 0.07381923887244313, + "grad_norm": 14.246036421919664, + "learning_rate": 1.4754797441364605e-06, + "loss": 0.0596, + "step": 692 + }, + { + "epoch": 0.07403258927380857, + "grad_norm": 15.479313497531585, + "learning_rate": 1.4797441364605543e-06, + "loss": -0.0542, + "step": 694 + }, + { + "epoch": 0.07424593967517401, + "grad_norm": 21.800805681217618, + "learning_rate": 1.4840085287846482e-06, + "loss": 0.8143, + "step": 696 + }, + { + "epoch": 0.07445929007653945, + "grad_norm": 12.447341265262597, + "learning_rate": 1.488272921108742e-06, + "loss": 0.2514, + "step": 698 + }, + { + "epoch": 0.0746726404779049, + "grad_norm": 15.457895001240969, + "learning_rate": 1.4925373134328358e-06, + "loss": -0.1873, + "step": 700 + }, + { + "epoch": 0.07488599087927034, + "grad_norm": 9.103159220217133, + "learning_rate": 1.4968017057569296e-06, + "loss": -0.1545, + "step": 702 + }, + { + "epoch": 0.07509934128063578, + "grad_norm": 13.892293596081212, + "learning_rate": 1.5010660980810235e-06, + "loss": -1.0031, + "step": 704 + }, + { + "epoch": 0.07531269168200122, + "grad_norm": 4.018531204967449, + "learning_rate": 1.505330490405117e-06, + "loss": -0.5123, + "step": 706 + }, + { + "epoch": 0.07552604208336668, + "grad_norm": 9.26403570634237, + "learning_rate": 1.509594882729211e-06, + "loss": -0.009, + "step": 708 + }, + { + "epoch": 0.07573939248473212, + "grad_norm": 23.108096955629904, + "learning_rate": 1.5138592750533048e-06, + "loss": 0.0175, + "step": 710 + }, + { + "epoch": 0.07595274288609756, + "grad_norm": 28.54965038619342, + "learning_rate": 1.5181236673773988e-06, + "loss": -0.4034, + "step": 712 + }, + { + "epoch": 0.076166093287463, + "grad_norm": 36.93993798487583, + "learning_rate": 1.5223880597014924e-06, + "loss": -1.3652, + "step": 714 + }, + { + "epoch": 0.07637944368882844, + "grad_norm": 9.796656002782315, + "learning_rate": 1.5266524520255864e-06, + "loss": 0.5496, + "step": 716 + }, + { + "epoch": 0.07659279409019389, + "grad_norm": 10.912135520925375, + "learning_rate": 1.5309168443496801e-06, + "loss": -0.3247, + "step": 718 + }, + { + "epoch": 0.07680614449155933, + "grad_norm": 14.84305599240473, + "learning_rate": 1.5351812366737739e-06, + "loss": -1.3197, + "step": 720 + }, + { + "epoch": 0.07701949489292477, + "grad_norm": 25.835978612302938, + "learning_rate": 1.5394456289978677e-06, + "loss": -0.2446, + "step": 722 + }, + { + "epoch": 0.07723284529429021, + "grad_norm": 7.4900190590260545, + "learning_rate": 1.5437100213219616e-06, + "loss": -0.6921, + "step": 724 + }, + { + "epoch": 0.07744619569565565, + "grad_norm": 12.73580854140001, + "learning_rate": 1.5479744136460552e-06, + "loss": 0.1706, + "step": 726 + }, + { + "epoch": 0.0776595460970211, + "grad_norm": 10.90211860054164, + "learning_rate": 1.5522388059701492e-06, + "loss": 0.0817, + "step": 728 + }, + { + "epoch": 0.07787289649838654, + "grad_norm": 6.002849805432761, + "learning_rate": 1.556503198294243e-06, + "loss": 0.0037, + "step": 730 + }, + { + "epoch": 0.07808624689975198, + "grad_norm": 15.988913842632046, + "learning_rate": 1.560767590618337e-06, + "loss": 0.3994, + "step": 732 + }, + { + "epoch": 0.07829959730111742, + "grad_norm": 21.834364945681322, + "learning_rate": 1.5650319829424305e-06, + "loss": -0.4599, + "step": 734 + }, + { + "epoch": 0.07851294770248286, + "grad_norm": 7.432963689652445, + "learning_rate": 1.5692963752665245e-06, + "loss": -0.3263, + "step": 736 + }, + { + "epoch": 0.0787262981038483, + "grad_norm": 16.20259948956077, + "learning_rate": 1.5735607675906182e-06, + "loss": -0.1557, + "step": 738 + }, + { + "epoch": 0.07893964850521375, + "grad_norm": 30.282091888880974, + "learning_rate": 1.5778251599147122e-06, + "loss": -1.2297, + "step": 740 + }, + { + "epoch": 0.07915299890657919, + "grad_norm": 18.86228833994148, + "learning_rate": 1.5820895522388058e-06, + "loss": -0.1715, + "step": 742 + }, + { + "epoch": 0.07936634930794463, + "grad_norm": 26.911000632499032, + "learning_rate": 1.5863539445628997e-06, + "loss": -0.2761, + "step": 744 + }, + { + "epoch": 0.07957969970931007, + "grad_norm": 55.67332241793552, + "learning_rate": 1.5906183368869935e-06, + "loss": -0.1245, + "step": 746 + }, + { + "epoch": 0.07979305011067551, + "grad_norm": 11.64128268726896, + "learning_rate": 1.5948827292110873e-06, + "loss": 0.2172, + "step": 748 + }, + { + "epoch": 0.08000640051204096, + "grad_norm": 7.141781123388319, + "learning_rate": 1.599147121535181e-06, + "loss": -1.0433, + "step": 750 + }, + { + "epoch": 0.08021975091340641, + "grad_norm": 14.355586850326564, + "learning_rate": 1.603411513859275e-06, + "loss": 0.7587, + "step": 752 + }, + { + "epoch": 0.08043310131477185, + "grad_norm": 21.229967868951906, + "learning_rate": 1.607675906183369e-06, + "loss": -1.1211, + "step": 754 + }, + { + "epoch": 0.0806464517161373, + "grad_norm": 11.519887266492887, + "learning_rate": 1.6119402985074626e-06, + "loss": 0.589, + "step": 756 + }, + { + "epoch": 0.08085980211750274, + "grad_norm": 14.339130440951164, + "learning_rate": 1.6162046908315565e-06, + "loss": -0.7255, + "step": 758 + }, + { + "epoch": 0.08107315251886818, + "grad_norm": 14.27705846557268, + "learning_rate": 1.6204690831556503e-06, + "loss": -1.539, + "step": 760 + }, + { + "epoch": 0.08128650292023362, + "grad_norm": 17.0155387877924, + "learning_rate": 1.624733475479744e-06, + "loss": -0.366, + "step": 762 + }, + { + "epoch": 0.08149985332159906, + "grad_norm": 7.661991050372059, + "learning_rate": 1.6289978678038378e-06, + "loss": -0.0189, + "step": 764 + }, + { + "epoch": 0.0817132037229645, + "grad_norm": 8.979485378676086, + "learning_rate": 1.6332622601279318e-06, + "loss": -1.96, + "step": 766 + }, + { + "epoch": 0.08192655412432995, + "grad_norm": 5.161703747401364, + "learning_rate": 1.6375266524520254e-06, + "loss": -0.5528, + "step": 768 + }, + { + "epoch": 0.08213990452569539, + "grad_norm": 8.76501482963132, + "learning_rate": 1.6417910447761194e-06, + "loss": -0.0766, + "step": 770 + }, + { + "epoch": 0.08235325492706083, + "grad_norm": 16.69504549029163, + "learning_rate": 1.6460554371002131e-06, + "loss": 0.3121, + "step": 772 + }, + { + "epoch": 0.08256660532842627, + "grad_norm": 17.43852691547062, + "learning_rate": 1.6503198294243071e-06, + "loss": -0.6262, + "step": 774 + }, + { + "epoch": 0.08277995572979172, + "grad_norm": 7.8082474899976075, + "learning_rate": 1.6545842217484007e-06, + "loss": 0.4352, + "step": 776 + }, + { + "epoch": 0.08299330613115716, + "grad_norm": 23.61906025471836, + "learning_rate": 1.6588486140724946e-06, + "loss": -0.586, + "step": 778 + }, + { + "epoch": 0.0832066565325226, + "grad_norm": 16.906426462089843, + "learning_rate": 1.6631130063965884e-06, + "loss": 0.1607, + "step": 780 + }, + { + "epoch": 0.08342000693388804, + "grad_norm": 12.849146673239158, + "learning_rate": 1.6673773987206822e-06, + "loss": -0.4005, + "step": 782 + }, + { + "epoch": 0.08363335733525348, + "grad_norm": 15.52998781884974, + "learning_rate": 1.671641791044776e-06, + "loss": -0.2504, + "step": 784 + }, + { + "epoch": 0.08384670773661893, + "grad_norm": 7.223884938731002, + "learning_rate": 1.67590618336887e-06, + "loss": -0.9925, + "step": 786 + }, + { + "epoch": 0.08406005813798437, + "grad_norm": 4.8718682784669145, + "learning_rate": 1.6801705756929637e-06, + "loss": -0.3866, + "step": 788 + }, + { + "epoch": 0.08427340853934981, + "grad_norm": 17.60760255975411, + "learning_rate": 1.6844349680170575e-06, + "loss": -0.9417, + "step": 790 + }, + { + "epoch": 0.08448675894071525, + "grad_norm": 27.714848425193637, + "learning_rate": 1.6886993603411512e-06, + "loss": 0.3963, + "step": 792 + }, + { + "epoch": 0.0847001093420807, + "grad_norm": 35.493014709513936, + "learning_rate": 1.6929637526652452e-06, + "loss": -0.4515, + "step": 794 + }, + { + "epoch": 0.08491345974344615, + "grad_norm": 21.400152124030786, + "learning_rate": 1.6972281449893388e-06, + "loss": -0.67, + "step": 796 + }, + { + "epoch": 0.08512681014481159, + "grad_norm": 16.911674814364677, + "learning_rate": 1.7014925373134328e-06, + "loss": -0.1268, + "step": 798 + }, + { + "epoch": 0.08534016054617703, + "grad_norm": 13.009852458798768, + "learning_rate": 1.7057569296375265e-06, + "loss": -0.5053, + "step": 800 + }, + { + "epoch": 0.08555351094754247, + "grad_norm": 16.266588077557774, + "learning_rate": 1.7100213219616205e-06, + "loss": -0.7199, + "step": 802 + }, + { + "epoch": 0.08576686134890792, + "grad_norm": 19.16342878443152, + "learning_rate": 1.714285714285714e-06, + "loss": 0.0409, + "step": 804 + }, + { + "epoch": 0.08598021175027336, + "grad_norm": 19.235257083323877, + "learning_rate": 1.718550106609808e-06, + "loss": -0.3201, + "step": 806 + }, + { + "epoch": 0.0861935621516388, + "grad_norm": 13.755965707142774, + "learning_rate": 1.7228144989339018e-06, + "loss": -0.7414, + "step": 808 + }, + { + "epoch": 0.08640691255300424, + "grad_norm": 9.2611346408095, + "learning_rate": 1.7270788912579956e-06, + "loss": 0.543, + "step": 810 + }, + { + "epoch": 0.08662026295436968, + "grad_norm": 11.293068548525245, + "learning_rate": 1.7313432835820893e-06, + "loss": 0.5024, + "step": 812 + }, + { + "epoch": 0.08683361335573513, + "grad_norm": 8.43885851672634, + "learning_rate": 1.7356076759061833e-06, + "loss": -0.8492, + "step": 814 + }, + { + "epoch": 0.08704696375710057, + "grad_norm": 10.34717338583048, + "learning_rate": 1.7398720682302773e-06, + "loss": -0.8168, + "step": 816 + }, + { + "epoch": 0.08726031415846601, + "grad_norm": 4.346106915743504, + "learning_rate": 1.7441364605543709e-06, + "loss": -0.781, + "step": 818 + }, + { + "epoch": 0.08747366455983145, + "grad_norm": 16.261736385850632, + "learning_rate": 1.7484008528784648e-06, + "loss": 0.3626, + "step": 820 + }, + { + "epoch": 0.0876870149611969, + "grad_norm": 6.451053388102568, + "learning_rate": 1.7526652452025586e-06, + "loss": 0.1613, + "step": 822 + }, + { + "epoch": 0.08790036536256234, + "grad_norm": 13.971393729868783, + "learning_rate": 1.7569296375266524e-06, + "loss": 0.1887, + "step": 824 + }, + { + "epoch": 0.08811371576392778, + "grad_norm": 4.60426711294295, + "learning_rate": 1.7611940298507461e-06, + "loss": -0.0714, + "step": 826 + }, + { + "epoch": 0.08832706616529322, + "grad_norm": 7.788765445569996, + "learning_rate": 1.7654584221748401e-06, + "loss": 0.1057, + "step": 828 + }, + { + "epoch": 0.08854041656665866, + "grad_norm": 16.242118293664294, + "learning_rate": 1.7697228144989339e-06, + "loss": 0.2466, + "step": 830 + }, + { + "epoch": 0.0887537669680241, + "grad_norm": 14.723029688197625, + "learning_rate": 1.7739872068230277e-06, + "loss": -0.6136, + "step": 832 + }, + { + "epoch": 0.08896711736938955, + "grad_norm": 20.958772423479118, + "learning_rate": 1.7782515991471214e-06, + "loss": -1.257, + "step": 834 + }, + { + "epoch": 0.08918046777075499, + "grad_norm": 12.999192030716888, + "learning_rate": 1.7825159914712154e-06, + "loss": 0.7867, + "step": 836 + }, + { + "epoch": 0.08939381817212044, + "grad_norm": 8.749254011513045, + "learning_rate": 1.786780383795309e-06, + "loss": -0.1854, + "step": 838 + }, + { + "epoch": 0.08960716857348588, + "grad_norm": 28.375671216859047, + "learning_rate": 1.791044776119403e-06, + "loss": -1.1224, + "step": 840 + }, + { + "epoch": 0.08982051897485133, + "grad_norm": 11.982995716340508, + "learning_rate": 1.7953091684434967e-06, + "loss": 0.1255, + "step": 842 + }, + { + "epoch": 0.09003386937621677, + "grad_norm": 13.991706133756487, + "learning_rate": 1.7995735607675907e-06, + "loss": -0.8058, + "step": 844 + }, + { + "epoch": 0.09024721977758221, + "grad_norm": 6.919587436813207, + "learning_rate": 1.8038379530916842e-06, + "loss": -0.514, + "step": 846 + }, + { + "epoch": 0.09046057017894765, + "grad_norm": 8.081461324634596, + "learning_rate": 1.8081023454157782e-06, + "loss": -0.4794, + "step": 848 + }, + { + "epoch": 0.0906739205803131, + "grad_norm": 21.839476666247815, + "learning_rate": 1.812366737739872e-06, + "loss": -0.1143, + "step": 850 + }, + { + "epoch": 0.09088727098167854, + "grad_norm": 4.526543841561235, + "learning_rate": 1.8166311300639658e-06, + "loss": -0.2663, + "step": 852 + }, + { + "epoch": 0.09110062138304398, + "grad_norm": 4.910992516997152, + "learning_rate": 1.8208955223880595e-06, + "loss": -0.79, + "step": 854 + }, + { + "epoch": 0.09131397178440942, + "grad_norm": 13.643412575777333, + "learning_rate": 1.8251599147121535e-06, + "loss": 0.3456, + "step": 856 + }, + { + "epoch": 0.09152732218577486, + "grad_norm": 12.207406995564389, + "learning_rate": 1.829424307036247e-06, + "loss": -0.1668, + "step": 858 + }, + { + "epoch": 0.0917406725871403, + "grad_norm": 10.619915846442005, + "learning_rate": 1.833688699360341e-06, + "loss": 0.8485, + "step": 860 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 7.23403783091102, + "learning_rate": 1.8379530916844348e-06, + "loss": 0.0591, + "step": 862 + }, + { + "epoch": 0.09216737338987119, + "grad_norm": 8.49916268019599, + "learning_rate": 1.8422174840085288e-06, + "loss": 0.3622, + "step": 864 + }, + { + "epoch": 0.09238072379123663, + "grad_norm": 17.881493738990475, + "learning_rate": 1.8464818763326224e-06, + "loss": 0.5555, + "step": 866 + }, + { + "epoch": 0.09259407419260207, + "grad_norm": 12.811241804624386, + "learning_rate": 1.8507462686567163e-06, + "loss": 0.1441, + "step": 868 + }, + { + "epoch": 0.09280742459396751, + "grad_norm": 17.08876136574821, + "learning_rate": 1.85501066098081e-06, + "loss": -0.2161, + "step": 870 + }, + { + "epoch": 0.09302077499533296, + "grad_norm": 18.87972131413687, + "learning_rate": 1.859275053304904e-06, + "loss": -0.643, + "step": 872 + }, + { + "epoch": 0.0932341253966984, + "grad_norm": 19.214085243462733, + "learning_rate": 1.8635394456289976e-06, + "loss": -0.3228, + "step": 874 + }, + { + "epoch": 0.09344747579806384, + "grad_norm": 13.107944155833772, + "learning_rate": 1.8678038379530916e-06, + "loss": -0.0718, + "step": 876 + }, + { + "epoch": 0.09366082619942928, + "grad_norm": 11.902725378153237, + "learning_rate": 1.8720682302771856e-06, + "loss": 0.1609, + "step": 878 + }, + { + "epoch": 0.09387417660079474, + "grad_norm": 12.04227842087194, + "learning_rate": 1.8763326226012792e-06, + "loss": 0.542, + "step": 880 + }, + { + "epoch": 0.09408752700216018, + "grad_norm": 10.083129476715843, + "learning_rate": 1.8805970149253731e-06, + "loss": -0.2667, + "step": 882 + }, + { + "epoch": 0.09430087740352562, + "grad_norm": 13.758222232767348, + "learning_rate": 1.884861407249467e-06, + "loss": 0.8839, + "step": 884 + }, + { + "epoch": 0.09451422780489106, + "grad_norm": 26.46812082435916, + "learning_rate": 1.8891257995735609e-06, + "loss": -0.808, + "step": 886 + }, + { + "epoch": 0.0947275782062565, + "grad_norm": 22.4696755994172, + "learning_rate": 1.8933901918976544e-06, + "loss": 0.206, + "step": 888 + }, + { + "epoch": 0.09494092860762195, + "grad_norm": 13.650968128133929, + "learning_rate": 1.8976545842217484e-06, + "loss": -0.2053, + "step": 890 + }, + { + "epoch": 0.09515427900898739, + "grad_norm": 25.295891533832265, + "learning_rate": 1.9019189765458422e-06, + "loss": 0.5393, + "step": 892 + }, + { + "epoch": 0.09536762941035283, + "grad_norm": 4.717180645046515, + "learning_rate": 1.906183368869936e-06, + "loss": -0.1717, + "step": 894 + }, + { + "epoch": 0.09558097981171827, + "grad_norm": 3.980783546053527, + "learning_rate": 1.9104477611940297e-06, + "loss": -0.3342, + "step": 896 + }, + { + "epoch": 0.09579433021308371, + "grad_norm": 17.53966519035405, + "learning_rate": 1.9147121535181237e-06, + "loss": 0.1547, + "step": 898 + }, + { + "epoch": 0.09600768061444916, + "grad_norm": 37.64965817072852, + "learning_rate": 1.9189765458422173e-06, + "loss": -0.1405, + "step": 900 + }, + { + "epoch": 0.0962210310158146, + "grad_norm": 13.214607736783556, + "learning_rate": 1.9232409381663112e-06, + "loss": -0.6144, + "step": 902 + }, + { + "epoch": 0.09643438141718004, + "grad_norm": 13.755458504610731, + "learning_rate": 1.927505330490405e-06, + "loss": -0.5176, + "step": 904 + }, + { + "epoch": 0.09664773181854548, + "grad_norm": 13.160266215659087, + "learning_rate": 1.9317697228144988e-06, + "loss": -0.3805, + "step": 906 + }, + { + "epoch": 0.09686108221991092, + "grad_norm": 8.56427462579027, + "learning_rate": 1.9360341151385928e-06, + "loss": -0.3108, + "step": 908 + }, + { + "epoch": 0.09707443262127637, + "grad_norm": 7.732136866046191, + "learning_rate": 1.9402985074626867e-06, + "loss": -0.7029, + "step": 910 + }, + { + "epoch": 0.09728778302264181, + "grad_norm": 3.674972386186435, + "learning_rate": 1.9445628997867803e-06, + "loss": -0.171, + "step": 912 + }, + { + "epoch": 0.09750113342400725, + "grad_norm": 5.033223647050098, + "learning_rate": 1.9488272921108743e-06, + "loss": -0.3051, + "step": 914 + }, + { + "epoch": 0.09771448382537269, + "grad_norm": 6.185985978437299, + "learning_rate": 1.953091684434968e-06, + "loss": -0.2443, + "step": 916 + }, + { + "epoch": 0.09792783422673813, + "grad_norm": 12.390573810874344, + "learning_rate": 1.957356076759062e-06, + "loss": -0.9419, + "step": 918 + }, + { + "epoch": 0.09814118462810358, + "grad_norm": 33.36024010674656, + "learning_rate": 1.9616204690831554e-06, + "loss": -1.5106, + "step": 920 + }, + { + "epoch": 0.09835453502946902, + "grad_norm": 8.028565562422017, + "learning_rate": 1.9658848614072493e-06, + "loss": -0.0367, + "step": 922 + }, + { + "epoch": 0.09856788543083447, + "grad_norm": 11.244074568741834, + "learning_rate": 1.970149253731343e-06, + "loss": -0.198, + "step": 924 + }, + { + "epoch": 0.09878123583219992, + "grad_norm": 8.155488460294738, + "learning_rate": 1.974413646055437e-06, + "loss": -0.4533, + "step": 926 + }, + { + "epoch": 0.09899458623356536, + "grad_norm": 8.455296272398023, + "learning_rate": 1.978678038379531e-06, + "loss": -0.2075, + "step": 928 + }, + { + "epoch": 0.0992079366349308, + "grad_norm": 19.66763752484763, + "learning_rate": 1.982942430703625e-06, + "loss": -0.8438, + "step": 930 + }, + { + "epoch": 0.09942128703629624, + "grad_norm": 16.941346284552324, + "learning_rate": 1.9872068230277184e-06, + "loss": 0.2668, + "step": 932 + }, + { + "epoch": 0.09963463743766168, + "grad_norm": 30.19526185636887, + "learning_rate": 1.9914712153518124e-06, + "loss": 0.7882, + "step": 934 + }, + { + "epoch": 0.09984798783902712, + "grad_norm": 7.7652033417557185, + "learning_rate": 1.995735607675906e-06, + "loss": -0.5121, + "step": 936 + }, + { + "epoch": 0.10006133824039257, + "grad_norm": 10.002945250026167, + "learning_rate": 2e-06, + "loss": -0.4891, + "step": 938 + }, + { + "epoch": 0.10027468864175801, + "grad_norm": 9.562548994737613, + "learning_rate": 1.999999722631857e-06, + "loss": -0.1357, + "step": 940 + }, + { + "epoch": 0.10048803904312345, + "grad_norm": 9.18438104835198, + "learning_rate": 1.999998890527582e-06, + "loss": -1.313, + "step": 942 + }, + { + "epoch": 0.10070138944448889, + "grad_norm": 19.038858970547153, + "learning_rate": 1.9999975036876364e-06, + "loss": 0.5632, + "step": 944 + }, + { + "epoch": 0.10091473984585433, + "grad_norm": 8.880460982287538, + "learning_rate": 1.9999955621127898e-06, + "loss": -0.1845, + "step": 946 + }, + { + "epoch": 0.10112809024721978, + "grad_norm": 27.328430670871857, + "learning_rate": 1.999993065804119e-06, + "loss": -1.1547, + "step": 948 + }, + { + "epoch": 0.10134144064858522, + "grad_norm": 3.7529838788448946, + "learning_rate": 1.9999900147630093e-06, + "loss": 0.4223, + "step": 950 + }, + { + "epoch": 0.10155479104995066, + "grad_norm": 20.065464957241378, + "learning_rate": 1.9999864089911524e-06, + "loss": -0.6613, + "step": 952 + }, + { + "epoch": 0.1017681414513161, + "grad_norm": 6.44845480619213, + "learning_rate": 1.9999822484905493e-06, + "loss": -0.1856, + "step": 954 + }, + { + "epoch": 0.10198149185268154, + "grad_norm": 15.587441612646101, + "learning_rate": 1.9999775332635073e-06, + "loss": -0.659, + "step": 956 + }, + { + "epoch": 0.10219484225404699, + "grad_norm": 10.95597370533827, + "learning_rate": 1.9999722633126426e-06, + "loss": 0.1653, + "step": 958 + }, + { + "epoch": 0.10240819265541243, + "grad_norm": 8.043054511297433, + "learning_rate": 1.9999664386408786e-06, + "loss": -0.5782, + "step": 960 + }, + { + "epoch": 0.10262154305677787, + "grad_norm": 19.158675843982078, + "learning_rate": 1.9999600592514464e-06, + "loss": -0.2572, + "step": 962 + }, + { + "epoch": 0.10283489345814331, + "grad_norm": 6.444667549256743, + "learning_rate": 1.9999531251478848e-06, + "loss": -0.3405, + "step": 964 + }, + { + "epoch": 0.10304824385950877, + "grad_norm": 59.77200968326433, + "learning_rate": 1.9999456363340406e-06, + "loss": -0.3214, + "step": 966 + }, + { + "epoch": 0.10326159426087421, + "grad_norm": 6.48727133868538, + "learning_rate": 1.999937592814068e-06, + "loss": 0.193, + "step": 968 + }, + { + "epoch": 0.10347494466223965, + "grad_norm": 9.899885529844648, + "learning_rate": 1.999928994592429e-06, + "loss": -0.8417, + "step": 970 + }, + { + "epoch": 0.1036882950636051, + "grad_norm": 10.671301533657493, + "learning_rate": 1.999919841673893e-06, + "loss": 0.6551, + "step": 972 + }, + { + "epoch": 0.10390164546497054, + "grad_norm": 18.54534340015214, + "learning_rate": 1.999910134063538e-06, + "loss": -0.2989, + "step": 974 + }, + { + "epoch": 0.10411499586633598, + "grad_norm": 4.6709829991723835, + "learning_rate": 1.999899871766749e-06, + "loss": 0.0672, + "step": 976 + }, + { + "epoch": 0.10432834626770142, + "grad_norm": 5.48735260106571, + "learning_rate": 1.9998890547892183e-06, + "loss": -0.6261, + "step": 978 + }, + { + "epoch": 0.10454169666906686, + "grad_norm": 16.54236866901039, + "learning_rate": 1.9998776831369476e-06, + "loss": -0.2955, + "step": 980 + }, + { + "epoch": 0.1047550470704323, + "grad_norm": 18.85187898982959, + "learning_rate": 1.9998657568162446e-06, + "loss": 0.1206, + "step": 982 + }, + { + "epoch": 0.10496839747179774, + "grad_norm": 12.054223122838927, + "learning_rate": 1.999853275833725e-06, + "loss": 0.4311, + "step": 984 + }, + { + "epoch": 0.10518174787316319, + "grad_norm": 12.198982090572391, + "learning_rate": 1.9998402401963128e-06, + "loss": -0.442, + "step": 986 + }, + { + "epoch": 0.10539509827452863, + "grad_norm": 37.98649241207374, + "learning_rate": 1.999826649911239e-06, + "loss": -1.3668, + "step": 988 + }, + { + "epoch": 0.10560844867589407, + "grad_norm": 11.038918775818233, + "learning_rate": 1.9998125049860433e-06, + "loss": 0.6653, + "step": 990 + }, + { + "epoch": 0.10582179907725951, + "grad_norm": 6.476149411837476, + "learning_rate": 1.999797805428572e-06, + "loss": 0.2109, + "step": 992 + }, + { + "epoch": 0.10603514947862495, + "grad_norm": 15.406789152771664, + "learning_rate": 1.999782551246979e-06, + "loss": -0.5677, + "step": 994 + }, + { + "epoch": 0.1062484998799904, + "grad_norm": 29.873738526715083, + "learning_rate": 1.999766742449727e-06, + "loss": -1.3612, + "step": 996 + }, + { + "epoch": 0.10646185028135584, + "grad_norm": 12.480796329993506, + "learning_rate": 1.999750379045585e-06, + "loss": -0.3667, + "step": 998 + }, + { + "epoch": 0.10667520068272128, + "grad_norm": 13.76196266074063, + "learning_rate": 1.9997334610436318e-06, + "loss": 0.1408, + "step": 1000 + }, + { + "epoch": 0.10688855108408672, + "grad_norm": 7.342086872809186, + "learning_rate": 1.999715988453251e-06, + "loss": -0.1217, + "step": 1002 + }, + { + "epoch": 0.10710190148545216, + "grad_norm": 24.793407128549426, + "learning_rate": 1.9996979612841357e-06, + "loss": -0.2125, + "step": 1004 + }, + { + "epoch": 0.1073152518868176, + "grad_norm": 12.462548496218666, + "learning_rate": 1.9996793795462862e-06, + "loss": -0.9109, + "step": 1006 + }, + { + "epoch": 0.10752860228818305, + "grad_norm": 22.456483713157958, + "learning_rate": 1.9996602432500107e-06, + "loss": -0.8032, + "step": 1008 + }, + { + "epoch": 0.1077419526895485, + "grad_norm": 8.279610471346707, + "learning_rate": 1.999640552405925e-06, + "loss": 0.5294, + "step": 1010 + }, + { + "epoch": 0.10795530309091395, + "grad_norm": 8.492474727142435, + "learning_rate": 1.9996203070249514e-06, + "loss": 0.345, + "step": 1012 + }, + { + "epoch": 0.10816865349227939, + "grad_norm": 10.855775876716889, + "learning_rate": 1.9995995071183215e-06, + "loss": -0.0323, + "step": 1014 + }, + { + "epoch": 0.10838200389364483, + "grad_norm": 13.044679276302682, + "learning_rate": 1.999578152697574e-06, + "loss": -0.9284, + "step": 1016 + }, + { + "epoch": 0.10859535429501027, + "grad_norm": 13.041568910636132, + "learning_rate": 1.999556243774554e-06, + "loss": 0.0727, + "step": 1018 + }, + { + "epoch": 0.10880870469637571, + "grad_norm": 8.840285871574821, + "learning_rate": 1.9995337803614165e-06, + "loss": -0.8054, + "step": 1020 + }, + { + "epoch": 0.10902205509774116, + "grad_norm": 11.045434971938723, + "learning_rate": 1.999510762470621e-06, + "loss": -0.4123, + "step": 1022 + }, + { + "epoch": 0.1092354054991066, + "grad_norm": 9.987536744093788, + "learning_rate": 1.999487190114938e-06, + "loss": -0.276, + "step": 1024 + }, + { + "epoch": 0.10944875590047204, + "grad_norm": 21.660050733355092, + "learning_rate": 1.9994630633074433e-06, + "loss": 0.0614, + "step": 1026 + }, + { + "epoch": 0.10966210630183748, + "grad_norm": 13.372785558415854, + "learning_rate": 1.9994383820615212e-06, + "loss": -0.5344, + "step": 1028 + }, + { + "epoch": 0.10987545670320292, + "grad_norm": 12.356702814123958, + "learning_rate": 1.9994131463908624e-06, + "loss": -0.1296, + "step": 1030 + }, + { + "epoch": 0.11008880710456836, + "grad_norm": 4.739222690282173, + "learning_rate": 1.999387356309467e-06, + "loss": -0.0715, + "step": 1032 + }, + { + "epoch": 0.1103021575059338, + "grad_norm": 2.353063878699358, + "learning_rate": 1.9993610118316415e-06, + "loss": -0.0108, + "step": 1034 + }, + { + "epoch": 0.11051550790729925, + "grad_norm": 9.018922760973295, + "learning_rate": 1.9993341129719997e-06, + "loss": 0.1682, + "step": 1036 + }, + { + "epoch": 0.11072885830866469, + "grad_norm": 11.347020997940431, + "learning_rate": 1.9993066597454637e-06, + "loss": 0.503, + "step": 1038 + }, + { + "epoch": 0.11094220871003013, + "grad_norm": 6.599406732799808, + "learning_rate": 1.9992786521672633e-06, + "loss": -1.0294, + "step": 1040 + }, + { + "epoch": 0.11115555911139557, + "grad_norm": 10.664927908662923, + "learning_rate": 1.999250090252934e-06, + "loss": -0.0274, + "step": 1042 + }, + { + "epoch": 0.11136890951276102, + "grad_norm": 12.59123280862704, + "learning_rate": 1.9992209740183212e-06, + "loss": -1.3216, + "step": 1044 + }, + { + "epoch": 0.11158225991412646, + "grad_norm": 12.964624626643758, + "learning_rate": 1.9991913034795767e-06, + "loss": 0.4125, + "step": 1046 + }, + { + "epoch": 0.1117956103154919, + "grad_norm": 4.273338653928415, + "learning_rate": 1.9991610786531593e-06, + "loss": -0.4966, + "step": 1048 + }, + { + "epoch": 0.11200896071685734, + "grad_norm": 5.088476515803206, + "learning_rate": 1.999130299555836e-06, + "loss": 0.2971, + "step": 1050 + }, + { + "epoch": 0.1122223111182228, + "grad_norm": 6.974108763676147, + "learning_rate": 1.9990989662046816e-06, + "loss": -0.7651, + "step": 1052 + }, + { + "epoch": 0.11243566151958824, + "grad_norm": 5.249430049582833, + "learning_rate": 1.999067078617077e-06, + "loss": -0.5608, + "step": 1054 + }, + { + "epoch": 0.11264901192095368, + "grad_norm": 15.65538195259475, + "learning_rate": 1.999034636810712e-06, + "loss": -0.5415, + "step": 1056 + }, + { + "epoch": 0.11286236232231912, + "grad_norm": 5.252233548968885, + "learning_rate": 1.999001640803583e-06, + "loss": -0.7343, + "step": 1058 + }, + { + "epoch": 0.11307571272368457, + "grad_norm": 8.691749940228224, + "learning_rate": 1.998968090613994e-06, + "loss": -0.418, + "step": 1060 + }, + { + "epoch": 0.11328906312505001, + "grad_norm": 13.48795251406841, + "learning_rate": 1.998933986260557e-06, + "loss": -0.7516, + "step": 1062 + }, + { + "epoch": 0.11350241352641545, + "grad_norm": 19.397334567221574, + "learning_rate": 1.99889932776219e-06, + "loss": 0.3052, + "step": 1064 + }, + { + "epoch": 0.11371576392778089, + "grad_norm": 13.172606242995984, + "learning_rate": 1.99886411513812e-06, + "loss": -0.4304, + "step": 1066 + }, + { + "epoch": 0.11392911432914633, + "grad_norm": 28.550101024674554, + "learning_rate": 1.9988283484078813e-06, + "loss": 0.2051, + "step": 1068 + }, + { + "epoch": 0.11414246473051178, + "grad_norm": 11.786086088715077, + "learning_rate": 1.9987920275913135e-06, + "loss": 0.5976, + "step": 1070 + }, + { + "epoch": 0.11435581513187722, + "grad_norm": 20.47705125595093, + "learning_rate": 1.9987551527085665e-06, + "loss": -0.6878, + "step": 1072 + }, + { + "epoch": 0.11456916553324266, + "grad_norm": 17.256084624740367, + "learning_rate": 1.9987177237800954e-06, + "loss": -1.1181, + "step": 1074 + }, + { + "epoch": 0.1147825159346081, + "grad_norm": 20.121126793429465, + "learning_rate": 1.9986797408266633e-06, + "loss": -1.1092, + "step": 1076 + }, + { + "epoch": 0.11499586633597354, + "grad_norm": 9.787270565538346, + "learning_rate": 1.998641203869341e-06, + "loss": -0.0358, + "step": 1078 + }, + { + "epoch": 0.11520921673733898, + "grad_norm": 10.072654902116856, + "learning_rate": 1.9986021129295067e-06, + "loss": -1.121, + "step": 1080 + }, + { + "epoch": 0.11542256713870443, + "grad_norm": 26.902121573368476, + "learning_rate": 1.9985624680288445e-06, + "loss": -0.1283, + "step": 1082 + }, + { + "epoch": 0.11563591754006987, + "grad_norm": 6.9266710792537936, + "learning_rate": 1.998522269189348e-06, + "loss": -0.1049, + "step": 1084 + }, + { + "epoch": 0.11584926794143531, + "grad_norm": 11.110711462850878, + "learning_rate": 1.998481516433316e-06, + "loss": -0.51, + "step": 1086 + }, + { + "epoch": 0.11606261834280075, + "grad_norm": 12.887226914453029, + "learning_rate": 1.9984402097833563e-06, + "loss": -0.1358, + "step": 1088 + }, + { + "epoch": 0.1162759687441662, + "grad_norm": 11.597183300597381, + "learning_rate": 1.998398349262383e-06, + "loss": -1.1465, + "step": 1090 + }, + { + "epoch": 0.11648931914553164, + "grad_norm": 9.612007934917326, + "learning_rate": 1.9983559348936175e-06, + "loss": -0.4243, + "step": 1092 + }, + { + "epoch": 0.11670266954689708, + "grad_norm": 7.480442846976291, + "learning_rate": 1.9983129667005884e-06, + "loss": -0.8296, + "step": 1094 + }, + { + "epoch": 0.11691601994826253, + "grad_norm": 11.901507316301505, + "learning_rate": 1.998269444707132e-06, + "loss": 0.2059, + "step": 1096 + }, + { + "epoch": 0.11712937034962798, + "grad_norm": 11.84934594902733, + "learning_rate": 1.9982253689373918e-06, + "loss": -0.7015, + "step": 1098 + }, + { + "epoch": 0.11734272075099342, + "grad_norm": 18.98160723910663, + "learning_rate": 1.9981807394158177e-06, + "loss": 0.3484, + "step": 1100 + }, + { + "epoch": 0.11755607115235886, + "grad_norm": 9.999100561355233, + "learning_rate": 1.9981355561671677e-06, + "loss": -0.0919, + "step": 1102 + }, + { + "epoch": 0.1177694215537243, + "grad_norm": 8.253006466198785, + "learning_rate": 1.9980898192165063e-06, + "loss": -0.3401, + "step": 1104 + }, + { + "epoch": 0.11798277195508974, + "grad_norm": 21.38855116207188, + "learning_rate": 1.998043528589205e-06, + "loss": -0.2649, + "step": 1106 + }, + { + "epoch": 0.11819612235645519, + "grad_norm": 17.581276449720193, + "learning_rate": 1.9979966843109445e-06, + "loss": -0.7882, + "step": 1108 + }, + { + "epoch": 0.11840947275782063, + "grad_norm": 17.881994136606902, + "learning_rate": 1.9979492864077094e-06, + "loss": -0.9395, + "step": 1110 + }, + { + "epoch": 0.11862282315918607, + "grad_norm": 27.450698807590868, + "learning_rate": 1.9979013349057932e-06, + "loss": -1.1812, + "step": 1112 + }, + { + "epoch": 0.11883617356055151, + "grad_norm": 11.85016744188475, + "learning_rate": 1.997852829831797e-06, + "loss": -0.2322, + "step": 1114 + }, + { + "epoch": 0.11904952396191695, + "grad_norm": 19.57162822354579, + "learning_rate": 1.997803771212629e-06, + "loss": -0.2706, + "step": 1116 + }, + { + "epoch": 0.1192628743632824, + "grad_norm": 10.830735208685383, + "learning_rate": 1.997754159075502e-06, + "loss": -1.1194, + "step": 1118 + }, + { + "epoch": 0.11947622476464784, + "grad_norm": 4.730754320481347, + "learning_rate": 1.9977039934479385e-06, + "loss": -0.7898, + "step": 1120 + }, + { + "epoch": 0.11968957516601328, + "grad_norm": 8.745657757788235, + "learning_rate": 1.9976532743577673e-06, + "loss": 0.0379, + "step": 1122 + }, + { + "epoch": 0.11990292556737872, + "grad_norm": 28.813446860024147, + "learning_rate": 1.9976020018331243e-06, + "loss": 0.3645, + "step": 1124 + }, + { + "epoch": 0.12011627596874416, + "grad_norm": 10.440814030501423, + "learning_rate": 1.9975501759024517e-06, + "loss": -0.0548, + "step": 1126 + }, + { + "epoch": 0.1203296263701096, + "grad_norm": 11.416071472081667, + "learning_rate": 1.9974977965945e-06, + "loss": 0.2929, + "step": 1128 + }, + { + "epoch": 0.12054297677147505, + "grad_norm": 8.441152040615334, + "learning_rate": 1.9974448639383244e-06, + "loss": -0.6188, + "step": 1130 + }, + { + "epoch": 0.12075632717284049, + "grad_norm": 5.663303106994784, + "learning_rate": 1.9973913779632904e-06, + "loss": -0.4657, + "step": 1132 + }, + { + "epoch": 0.12096967757420593, + "grad_norm": 18.35892778992275, + "learning_rate": 1.9973373386990674e-06, + "loss": 0.2044, + "step": 1134 + }, + { + "epoch": 0.12118302797557137, + "grad_norm": 23.684412769651576, + "learning_rate": 1.9972827461756334e-06, + "loss": -0.4693, + "step": 1136 + }, + { + "epoch": 0.12139637837693683, + "grad_norm": 14.171496402604255, + "learning_rate": 1.997227600423273e-06, + "loss": -0.6943, + "step": 1138 + }, + { + "epoch": 0.12160972877830227, + "grad_norm": 13.922460920458215, + "learning_rate": 1.9971719014725768e-06, + "loss": 0.8172, + "step": 1140 + }, + { + "epoch": 0.12182307917966771, + "grad_norm": 21.72636134013763, + "learning_rate": 1.9971156493544437e-06, + "loss": -0.4408, + "step": 1142 + }, + { + "epoch": 0.12203642958103315, + "grad_norm": 20.82069946728396, + "learning_rate": 1.9970588441000787e-06, + "loss": -0.3291, + "step": 1144 + }, + { + "epoch": 0.1222497799823986, + "grad_norm": 15.957243371369229, + "learning_rate": 1.9970014857409936e-06, + "loss": -0.3534, + "step": 1146 + }, + { + "epoch": 0.12246313038376404, + "grad_norm": 13.228394989772614, + "learning_rate": 1.9969435743090076e-06, + "loss": -0.4047, + "step": 1148 + }, + { + "epoch": 0.12267648078512948, + "grad_norm": 9.336630176087915, + "learning_rate": 1.9968851098362455e-06, + "loss": -0.2281, + "step": 1150 + }, + { + "epoch": 0.12288983118649492, + "grad_norm": 21.372600065820446, + "learning_rate": 1.9968260923551405e-06, + "loss": -0.1527, + "step": 1152 + }, + { + "epoch": 0.12310318158786036, + "grad_norm": 10.49080827854095, + "learning_rate": 1.9967665218984306e-06, + "loss": 0.2273, + "step": 1154 + }, + { + "epoch": 0.1233165319892258, + "grad_norm": 26.54581332312223, + "learning_rate": 1.996706398499163e-06, + "loss": -0.0176, + "step": 1156 + }, + { + "epoch": 0.12352988239059125, + "grad_norm": 10.60684954766727, + "learning_rate": 1.9966457221906893e-06, + "loss": -0.7239, + "step": 1158 + }, + { + "epoch": 0.12374323279195669, + "grad_norm": 18.45210289211234, + "learning_rate": 1.9965844930066696e-06, + "loss": -0.4982, + "step": 1160 + }, + { + "epoch": 0.12395658319332213, + "grad_norm": 20.290379711344247, + "learning_rate": 1.9965227109810694e-06, + "loss": -0.1957, + "step": 1162 + }, + { + "epoch": 0.12416993359468757, + "grad_norm": 10.365692652900515, + "learning_rate": 1.9964603761481623e-06, + "loss": 0.3669, + "step": 1164 + }, + { + "epoch": 0.12438328399605302, + "grad_norm": 13.125311380390214, + "learning_rate": 1.996397488542526e-06, + "loss": -0.3221, + "step": 1166 + }, + { + "epoch": 0.12459663439741846, + "grad_norm": 14.361058641237294, + "learning_rate": 1.9963340481990486e-06, + "loss": 0.1227, + "step": 1168 + }, + { + "epoch": 0.1248099847987839, + "grad_norm": 7.056112272117484, + "learning_rate": 1.9962700551529214e-06, + "loss": 0.0827, + "step": 1170 + }, + { + "epoch": 0.12502333520014935, + "grad_norm": 20.11483934762056, + "learning_rate": 1.996205509439644e-06, + "loss": -1.1759, + "step": 1172 + }, + { + "epoch": 0.1252366856015148, + "grad_norm": 9.441979966620615, + "learning_rate": 1.996140411095022e-06, + "loss": -0.2787, + "step": 1174 + }, + { + "epoch": 0.12545003600288024, + "grad_norm": 6.0147967546848395, + "learning_rate": 1.9960747601551686e-06, + "loss": -0.791, + "step": 1176 + }, + { + "epoch": 0.12566338640424568, + "grad_norm": 27.231381943588612, + "learning_rate": 1.9960085566565015e-06, + "loss": -0.3671, + "step": 1178 + }, + { + "epoch": 0.12587673680561112, + "grad_norm": 9.81504324283188, + "learning_rate": 1.9959418006357476e-06, + "loss": 0.398, + "step": 1180 + }, + { + "epoch": 0.12609008720697656, + "grad_norm": 9.244327583663429, + "learning_rate": 1.995874492129938e-06, + "loss": -0.4957, + "step": 1182 + }, + { + "epoch": 0.126303437608342, + "grad_norm": 12.948378749756142, + "learning_rate": 1.995806631176411e-06, + "loss": -0.6401, + "step": 1184 + }, + { + "epoch": 0.12651678800970745, + "grad_norm": 14.684805388151638, + "learning_rate": 1.9957382178128122e-06, + "loss": -0.7437, + "step": 1186 + }, + { + "epoch": 0.1267301384110729, + "grad_norm": 12.31300809828398, + "learning_rate": 1.995669252077093e-06, + "loss": 0.1044, + "step": 1188 + }, + { + "epoch": 0.12694348881243833, + "grad_norm": 11.76491596226188, + "learning_rate": 1.9955997340075107e-06, + "loss": -0.4917, + "step": 1190 + }, + { + "epoch": 0.12715683921380377, + "grad_norm": 10.89131589218926, + "learning_rate": 1.9955296636426294e-06, + "loss": -0.0883, + "step": 1192 + }, + { + "epoch": 0.12737018961516922, + "grad_norm": 7.320449762607573, + "learning_rate": 1.9954590410213204e-06, + "loss": -0.4345, + "step": 1194 + }, + { + "epoch": 0.12758354001653466, + "grad_norm": 7.738748707298395, + "learning_rate": 1.9953878661827603e-06, + "loss": -0.1405, + "step": 1196 + }, + { + "epoch": 0.1277968904179001, + "grad_norm": 29.041058801442883, + "learning_rate": 1.9953161391664314e-06, + "loss": -0.0113, + "step": 1198 + }, + { + "epoch": 0.12801024081926554, + "grad_norm": 18.64878794716449, + "learning_rate": 1.9952438600121247e-06, + "loss": -0.3338, + "step": 1200 + }, + { + "epoch": 0.12822359122063098, + "grad_norm": 12.138740310190034, + "learning_rate": 1.995171028759936e-06, + "loss": 0.1787, + "step": 1202 + }, + { + "epoch": 0.12843694162199643, + "grad_norm": 9.910448331113573, + "learning_rate": 1.9950976454502658e-06, + "loss": -0.1987, + "step": 1204 + }, + { + "epoch": 0.12865029202336187, + "grad_norm": 8.303854195961756, + "learning_rate": 1.995023710123824e-06, + "loss": -0.3242, + "step": 1206 + }, + { + "epoch": 0.1288636424247273, + "grad_norm": 2.8280360825998887, + "learning_rate": 1.9949492228216255e-06, + "loss": -0.5063, + "step": 1208 + }, + { + "epoch": 0.12907699282609275, + "grad_norm": 24.78519545923043, + "learning_rate": 1.99487418358499e-06, + "loss": -0.6436, + "step": 1210 + }, + { + "epoch": 0.1292903432274582, + "grad_norm": 5.247991941700894, + "learning_rate": 1.994798592455545e-06, + "loss": -0.3478, + "step": 1212 + }, + { + "epoch": 0.12950369362882364, + "grad_norm": 12.131817051335103, + "learning_rate": 1.9947224494752233e-06, + "loss": -0.4232, + "step": 1214 + }, + { + "epoch": 0.12971704403018908, + "grad_norm": 9.201909172979075, + "learning_rate": 1.9946457546862645e-06, + "loss": -0.1835, + "step": 1216 + }, + { + "epoch": 0.12993039443155452, + "grad_norm": 4.964833051326164, + "learning_rate": 1.994568508131214e-06, + "loss": -0.7964, + "step": 1218 + }, + { + "epoch": 0.13014374483291996, + "grad_norm": 9.786105618411568, + "learning_rate": 1.9944907098529234e-06, + "loss": -1.2816, + "step": 1220 + }, + { + "epoch": 0.1303570952342854, + "grad_norm": 12.270192445358681, + "learning_rate": 1.9944123598945498e-06, + "loss": 0.3211, + "step": 1222 + }, + { + "epoch": 0.13057044563565084, + "grad_norm": 16.548756210800725, + "learning_rate": 1.994333458299557e-06, + "loss": 0.202, + "step": 1224 + }, + { + "epoch": 0.1307837960370163, + "grad_norm": 9.753353434876177, + "learning_rate": 1.9942540051117148e-06, + "loss": 0.1426, + "step": 1226 + }, + { + "epoch": 0.13099714643838173, + "grad_norm": 11.58213155194585, + "learning_rate": 1.9941740003750984e-06, + "loss": -0.2848, + "step": 1228 + }, + { + "epoch": 0.13121049683974717, + "grad_norm": 13.290613366769495, + "learning_rate": 1.994093444134089e-06, + "loss": -0.0448, + "step": 1230 + }, + { + "epoch": 0.1314238472411126, + "grad_norm": 12.964788671743946, + "learning_rate": 1.9940123364333753e-06, + "loss": -0.1976, + "step": 1232 + }, + { + "epoch": 0.13163719764247805, + "grad_norm": 26.125246341050705, + "learning_rate": 1.9939306773179494e-06, + "loss": 0.1018, + "step": 1234 + }, + { + "epoch": 0.1318505480438435, + "grad_norm": 21.460777062383563, + "learning_rate": 1.9938484668331112e-06, + "loss": -0.8494, + "step": 1236 + }, + { + "epoch": 0.13206389844520894, + "grad_norm": 14.475281026483652, + "learning_rate": 1.993765705024466e-06, + "loss": -0.0824, + "step": 1238 + }, + { + "epoch": 0.13227724884657438, + "grad_norm": 19.194731333848527, + "learning_rate": 1.993682391937924e-06, + "loss": -0.8551, + "step": 1240 + }, + { + "epoch": 0.13249059924793982, + "grad_norm": 8.025263166829147, + "learning_rate": 1.993598527619703e-06, + "loss": -0.277, + "step": 1242 + }, + { + "epoch": 0.13270394964930526, + "grad_norm": 7.417238847002507, + "learning_rate": 1.9935141121163247e-06, + "loss": -1.7068, + "step": 1244 + }, + { + "epoch": 0.13291730005067073, + "grad_norm": 6.013712861217545, + "learning_rate": 1.993429145474618e-06, + "loss": -0.0375, + "step": 1246 + }, + { + "epoch": 0.13313065045203618, + "grad_norm": 6.222089997970306, + "learning_rate": 1.9933436277417167e-06, + "loss": -0.2544, + "step": 1248 + }, + { + "epoch": 0.13334400085340162, + "grad_norm": 8.644621723582299, + "learning_rate": 1.9932575589650607e-06, + "loss": -0.6859, + "step": 1250 + }, + { + "epoch": 0.13355735125476706, + "grad_norm": 8.81918821940543, + "learning_rate": 1.993170939192396e-06, + "loss": 0.3251, + "step": 1252 + }, + { + "epoch": 0.1337707016561325, + "grad_norm": 9.151176110701087, + "learning_rate": 1.9930837684717724e-06, + "loss": 0.5739, + "step": 1254 + }, + { + "epoch": 0.13398405205749794, + "grad_norm": 20.48823759322151, + "learning_rate": 1.9929960468515477e-06, + "loss": 0.2761, + "step": 1256 + }, + { + "epoch": 0.13419740245886339, + "grad_norm": 12.154278550654555, + "learning_rate": 1.9929077743803843e-06, + "loss": -0.3458, + "step": 1258 + }, + { + "epoch": 0.13441075286022883, + "grad_norm": 14.265983416521257, + "learning_rate": 1.9928189511072497e-06, + "loss": 0.8493, + "step": 1260 + }, + { + "epoch": 0.13462410326159427, + "grad_norm": 4.676628174710065, + "learning_rate": 1.9927295770814173e-06, + "loss": 0.2909, + "step": 1262 + }, + { + "epoch": 0.1348374536629597, + "grad_norm": 9.90417389583978, + "learning_rate": 1.9926396523524665e-06, + "loss": -1.2819, + "step": 1264 + }, + { + "epoch": 0.13505080406432515, + "grad_norm": 9.304209273056328, + "learning_rate": 1.9925491769702822e-06, + "loss": -0.5668, + "step": 1266 + }, + { + "epoch": 0.1352641544656906, + "grad_norm": 7.4993572899829335, + "learning_rate": 1.992458150985053e-06, + "loss": -0.2587, + "step": 1268 + }, + { + "epoch": 0.13547750486705604, + "grad_norm": 8.723865402043476, + "learning_rate": 1.992366574447276e-06, + "loss": 0.1551, + "step": 1270 + }, + { + "epoch": 0.13569085526842148, + "grad_norm": 5.057075668658937, + "learning_rate": 1.9922744474077508e-06, + "loss": -0.3648, + "step": 1272 + }, + { + "epoch": 0.13590420566978692, + "grad_norm": 16.770112625398436, + "learning_rate": 1.9921817699175844e-06, + "loss": -1.0814, + "step": 1274 + }, + { + "epoch": 0.13611755607115236, + "grad_norm": 33.36589858143468, + "learning_rate": 1.9920885420281872e-06, + "loss": -0.4457, + "step": 1276 + }, + { + "epoch": 0.1363309064725178, + "grad_norm": 6.946967661217, + "learning_rate": 1.9919947637912777e-06, + "loss": -1.0567, + "step": 1278 + }, + { + "epoch": 0.13654425687388325, + "grad_norm": 8.519674061007464, + "learning_rate": 1.9919004352588765e-06, + "loss": -0.2459, + "step": 1280 + }, + { + "epoch": 0.1367576072752487, + "grad_norm": 17.53676710023234, + "learning_rate": 1.9918055564833123e-06, + "loss": 0.1388, + "step": 1282 + }, + { + "epoch": 0.13697095767661413, + "grad_norm": 20.708957064208715, + "learning_rate": 1.9917101275172173e-06, + "loss": -0.1265, + "step": 1284 + }, + { + "epoch": 0.13718430807797957, + "grad_norm": 8.901672321260623, + "learning_rate": 1.9916141484135297e-06, + "loss": -0.4476, + "step": 1286 + }, + { + "epoch": 0.13739765847934501, + "grad_norm": 7.343042152992284, + "learning_rate": 1.991517619225492e-06, + "loss": -0.5943, + "step": 1288 + }, + { + "epoch": 0.13761100888071046, + "grad_norm": 27.76539957519422, + "learning_rate": 1.9914205400066527e-06, + "loss": -0.3234, + "step": 1290 + }, + { + "epoch": 0.1378243592820759, + "grad_norm": 50.99311574078685, + "learning_rate": 1.9913229108108657e-06, + "loss": -1.0094, + "step": 1292 + }, + { + "epoch": 0.13803770968344134, + "grad_norm": 14.695157878092305, + "learning_rate": 1.991224731692288e-06, + "loss": -1.3651, + "step": 1294 + }, + { + "epoch": 0.13825106008480678, + "grad_norm": 3.960187280725374, + "learning_rate": 1.9911260027053853e-06, + "loss": -0.5186, + "step": 1296 + }, + { + "epoch": 0.13846441048617222, + "grad_norm": 12.529647758062435, + "learning_rate": 1.9910267239049244e-06, + "loss": -0.368, + "step": 1298 + }, + { + "epoch": 0.13867776088753767, + "grad_norm": 20.251391963231992, + "learning_rate": 1.9909268953459796e-06, + "loss": 0.193, + "step": 1300 + }, + { + "epoch": 0.1388911112889031, + "grad_norm": 8.363591199706423, + "learning_rate": 1.9908265170839287e-06, + "loss": -1.2309, + "step": 1302 + }, + { + "epoch": 0.13910446169026855, + "grad_norm": 19.744142730100023, + "learning_rate": 1.990725589174456e-06, + "loss": -0.8554, + "step": 1304 + }, + { + "epoch": 0.139317812091634, + "grad_norm": 12.061297088872797, + "learning_rate": 1.99062411167355e-06, + "loss": -0.0811, + "step": 1306 + }, + { + "epoch": 0.13953116249299943, + "grad_norm": 11.228939123696309, + "learning_rate": 1.990522084637503e-06, + "loss": -0.0931, + "step": 1308 + }, + { + "epoch": 0.13974451289436488, + "grad_norm": 12.144929824556511, + "learning_rate": 1.990419508122914e-06, + "loss": -0.4765, + "step": 1310 + }, + { + "epoch": 0.13995786329573032, + "grad_norm": 20.656060496728355, + "learning_rate": 1.9903163821866854e-06, + "loss": 0.2659, + "step": 1312 + }, + { + "epoch": 0.14017121369709576, + "grad_norm": 14.649962220479885, + "learning_rate": 1.9902127068860254e-06, + "loss": 0.4123, + "step": 1314 + }, + { + "epoch": 0.1403845640984612, + "grad_norm": 10.2750178410792, + "learning_rate": 1.9901084822784454e-06, + "loss": 0.0955, + "step": 1316 + }, + { + "epoch": 0.14059791449982664, + "grad_norm": 5.90043808850163, + "learning_rate": 1.9900037084217634e-06, + "loss": -1.1914, + "step": 1318 + }, + { + "epoch": 0.14081126490119208, + "grad_norm": 15.885552307589213, + "learning_rate": 1.9898983853741016e-06, + "loss": 0.1728, + "step": 1320 + }, + { + "epoch": 0.14102461530255753, + "grad_norm": 9.891069798654515, + "learning_rate": 1.989792513193886e-06, + "loss": 0.1657, + "step": 1322 + }, + { + "epoch": 0.14123796570392297, + "grad_norm": 7.782182874303118, + "learning_rate": 1.9896860919398477e-06, + "loss": -0.3856, + "step": 1324 + }, + { + "epoch": 0.1414513161052884, + "grad_norm": 11.641176185781765, + "learning_rate": 1.989579121671022e-06, + "loss": -0.3068, + "step": 1326 + }, + { + "epoch": 0.14166466650665385, + "grad_norm": 4.805403193069003, + "learning_rate": 1.98947160244675e-06, + "loss": -0.4986, + "step": 1328 + }, + { + "epoch": 0.14187801690801932, + "grad_norm": 16.862956588429178, + "learning_rate": 1.9893635343266765e-06, + "loss": 0.3268, + "step": 1330 + }, + { + "epoch": 0.14209136730938476, + "grad_norm": 7.248484194373781, + "learning_rate": 1.9892549173707506e-06, + "loss": -0.0999, + "step": 1332 + }, + { + "epoch": 0.1423047177107502, + "grad_norm": 14.217945557585596, + "learning_rate": 1.9891457516392255e-06, + "loss": 0.2878, + "step": 1334 + }, + { + "epoch": 0.14251806811211565, + "grad_norm": 9.841439441353879, + "learning_rate": 1.9890360371926603e-06, + "loss": -0.6387, + "step": 1336 + }, + { + "epoch": 0.1427314185134811, + "grad_norm": 15.714978276816597, + "learning_rate": 1.9889257740919173e-06, + "loss": -0.0362, + "step": 1338 + }, + { + "epoch": 0.14294476891484653, + "grad_norm": 24.38558873057061, + "learning_rate": 1.988814962398163e-06, + "loss": -0.1845, + "step": 1340 + }, + { + "epoch": 0.14315811931621197, + "grad_norm": 19.37524940612685, + "learning_rate": 1.988703602172869e-06, + "loss": -0.051, + "step": 1342 + }, + { + "epoch": 0.14337146971757742, + "grad_norm": 17.53928719020653, + "learning_rate": 1.988591693477811e-06, + "loss": 0.3123, + "step": 1344 + }, + { + "epoch": 0.14358482011894286, + "grad_norm": 7.651959842433343, + "learning_rate": 1.988479236375068e-06, + "loss": 0.6345, + "step": 1346 + }, + { + "epoch": 0.1437981705203083, + "grad_norm": 12.849121747612804, + "learning_rate": 1.9883662309270255e-06, + "loss": -0.3795, + "step": 1348 + }, + { + "epoch": 0.14401152092167374, + "grad_norm": 10.42307066045314, + "learning_rate": 1.9882526771963705e-06, + "loss": 0.0273, + "step": 1350 + }, + { + "epoch": 0.14422487132303918, + "grad_norm": 10.998417457164084, + "learning_rate": 1.988138575246096e-06, + "loss": 0.068, + "step": 1352 + }, + { + "epoch": 0.14443822172440463, + "grad_norm": 8.217438827795817, + "learning_rate": 1.9880239251394984e-06, + "loss": 0.1393, + "step": 1354 + }, + { + "epoch": 0.14465157212577007, + "grad_norm": 7.09193880673836, + "learning_rate": 1.987908726940178e-06, + "loss": -0.4732, + "step": 1356 + }, + { + "epoch": 0.1448649225271355, + "grad_norm": 6.9152413451529045, + "learning_rate": 1.9877929807120394e-06, + "loss": -0.5944, + "step": 1358 + }, + { + "epoch": 0.14507827292850095, + "grad_norm": 21.317287958934948, + "learning_rate": 1.9876766865192917e-06, + "loss": -0.7226, + "step": 1360 + }, + { + "epoch": 0.1452916233298664, + "grad_norm": 7.0748819724154055, + "learning_rate": 1.987559844426447e-06, + "loss": 0.0925, + "step": 1362 + }, + { + "epoch": 0.14550497373123183, + "grad_norm": 8.277532517958686, + "learning_rate": 1.987442454498322e-06, + "loss": -0.1877, + "step": 1364 + }, + { + "epoch": 0.14571832413259728, + "grad_norm": 7.796349199058688, + "learning_rate": 1.9873245168000374e-06, + "loss": -1.2002, + "step": 1366 + }, + { + "epoch": 0.14593167453396272, + "grad_norm": 9.519972662407087, + "learning_rate": 1.9872060313970172e-06, + "loss": 0.1226, + "step": 1368 + }, + { + "epoch": 0.14614502493532816, + "grad_norm": 51.22794539984388, + "learning_rate": 1.98708699835499e-06, + "loss": 0.0132, + "step": 1370 + }, + { + "epoch": 0.1463583753366936, + "grad_norm": 9.89616190200503, + "learning_rate": 1.9869674177399875e-06, + "loss": -0.9277, + "step": 1372 + }, + { + "epoch": 0.14657172573805904, + "grad_norm": 11.215904956332496, + "learning_rate": 1.9868472896183447e-06, + "loss": -0.4874, + "step": 1374 + }, + { + "epoch": 0.1467850761394245, + "grad_norm": 13.872064184762836, + "learning_rate": 1.9867266140567022e-06, + "loss": -0.3183, + "step": 1376 + }, + { + "epoch": 0.14699842654078993, + "grad_norm": 7.995930683354116, + "learning_rate": 1.9866053911220023e-06, + "loss": -0.5589, + "step": 1378 + }, + { + "epoch": 0.14721177694215537, + "grad_norm": 16.378366719543713, + "learning_rate": 1.986483620881492e-06, + "loss": -0.2039, + "step": 1380 + }, + { + "epoch": 0.1474251273435208, + "grad_norm": 7.493508160443857, + "learning_rate": 1.9863613034027223e-06, + "loss": -0.0562, + "step": 1382 + }, + { + "epoch": 0.14763847774488625, + "grad_norm": 12.664830255838945, + "learning_rate": 1.986238438753546e-06, + "loss": -0.7556, + "step": 1384 + }, + { + "epoch": 0.1478518281462517, + "grad_norm": 19.397178687021295, + "learning_rate": 1.9861150270021217e-06, + "loss": -0.6734, + "step": 1386 + }, + { + "epoch": 0.14806517854761714, + "grad_norm": 5.704825687189972, + "learning_rate": 1.9859910682169094e-06, + "loss": 0.0273, + "step": 1388 + }, + { + "epoch": 0.14827852894898258, + "grad_norm": 18.74154359230761, + "learning_rate": 1.9858665624666736e-06, + "loss": -0.4775, + "step": 1390 + }, + { + "epoch": 0.14849187935034802, + "grad_norm": 5.773692619924285, + "learning_rate": 1.985741509820483e-06, + "loss": -0.6141, + "step": 1392 + }, + { + "epoch": 0.14870522975171346, + "grad_norm": 19.282598779403987, + "learning_rate": 1.9856159103477083e-06, + "loss": -0.8339, + "step": 1394 + }, + { + "epoch": 0.1489185801530789, + "grad_norm": 5.2700845447883005, + "learning_rate": 1.9854897641180243e-06, + "loss": 0.0385, + "step": 1396 + }, + { + "epoch": 0.14913193055444435, + "grad_norm": 9.026756714744467, + "learning_rate": 1.9853630712014084e-06, + "loss": -0.4472, + "step": 1398 + }, + { + "epoch": 0.1493452809558098, + "grad_norm": 10.545420164055413, + "learning_rate": 1.9852358316681423e-06, + "loss": -0.622, + "step": 1400 + }, + { + "epoch": 0.14955863135717523, + "grad_norm": 8.733561962250285, + "learning_rate": 1.98510804558881e-06, + "loss": 0.4292, + "step": 1402 + }, + { + "epoch": 0.14977198175854067, + "grad_norm": 6.668865041230664, + "learning_rate": 1.9849797130342994e-06, + "loss": -0.4685, + "step": 1404 + }, + { + "epoch": 0.14998533215990612, + "grad_norm": 3.5886436542959625, + "learning_rate": 1.984850834075801e-06, + "loss": 0.105, + "step": 1406 + }, + { + "epoch": 0.15019868256127156, + "grad_norm": 5.202738707402359, + "learning_rate": 1.9847214087848086e-06, + "loss": -0.2023, + "step": 1408 + }, + { + "epoch": 0.150412032962637, + "grad_norm": 15.567853060306508, + "learning_rate": 1.9845914372331193e-06, + "loss": 0.1152, + "step": 1410 + }, + { + "epoch": 0.15062538336400244, + "grad_norm": 8.300291377654755, + "learning_rate": 1.984460919492833e-06, + "loss": 0.4582, + "step": 1412 + }, + { + "epoch": 0.15083873376536788, + "grad_norm": 12.462553911552765, + "learning_rate": 1.9843298556363528e-06, + "loss": -1.0064, + "step": 1414 + }, + { + "epoch": 0.15105208416673335, + "grad_norm": 7.664251809863465, + "learning_rate": 1.9841982457363836e-06, + "loss": -0.6192, + "step": 1416 + }, + { + "epoch": 0.1512654345680988, + "grad_norm": 15.092728868230994, + "learning_rate": 1.9840660898659357e-06, + "loss": -1.0103, + "step": 1418 + }, + { + "epoch": 0.15147878496946424, + "grad_norm": 11.503676789333593, + "learning_rate": 1.98393338809832e-06, + "loss": 0.4677, + "step": 1420 + }, + { + "epoch": 0.15169213537082968, + "grad_norm": 4.562471025499068, + "learning_rate": 1.9838001405071504e-06, + "loss": -0.5226, + "step": 1422 + }, + { + "epoch": 0.15190548577219512, + "grad_norm": 4.567933913154999, + "learning_rate": 1.983666347166345e-06, + "loss": -0.4826, + "step": 1424 + }, + { + "epoch": 0.15211883617356056, + "grad_norm": 8.818307342546014, + "learning_rate": 1.983532008150124e-06, + "loss": 0.1747, + "step": 1426 + }, + { + "epoch": 0.152332186574926, + "grad_norm": 13.988100683233654, + "learning_rate": 1.9833971235330092e-06, + "loss": 0.0731, + "step": 1428 + }, + { + "epoch": 0.15254553697629145, + "grad_norm": 8.85657833332133, + "learning_rate": 1.9832616933898266e-06, + "loss": 0.171, + "step": 1430 + }, + { + "epoch": 0.1527588873776569, + "grad_norm": 6.619556052601107, + "learning_rate": 1.983125717795704e-06, + "loss": -0.1667, + "step": 1432 + }, + { + "epoch": 0.15297223777902233, + "grad_norm": 3.9770093006568827, + "learning_rate": 1.9829891968260724e-06, + "loss": -0.3355, + "step": 1434 + }, + { + "epoch": 0.15318558818038777, + "grad_norm": 10.59606320986383, + "learning_rate": 1.9828521305566644e-06, + "loss": 0.2674, + "step": 1436 + }, + { + "epoch": 0.1533989385817532, + "grad_norm": 13.074191446717647, + "learning_rate": 1.982714519063516e-06, + "loss": -0.6665, + "step": 1438 + }, + { + "epoch": 0.15361228898311866, + "grad_norm": 10.66118503206778, + "learning_rate": 1.9825763624229654e-06, + "loss": -0.0641, + "step": 1440 + }, + { + "epoch": 0.1538256393844841, + "grad_norm": 39.14537892546988, + "learning_rate": 1.9824376607116526e-06, + "loss": -0.0822, + "step": 1442 + }, + { + "epoch": 0.15403898978584954, + "grad_norm": 6.254113572658406, + "learning_rate": 1.9822984140065205e-06, + "loss": -0.2203, + "step": 1444 + }, + { + "epoch": 0.15425234018721498, + "grad_norm": 11.512879109806274, + "learning_rate": 1.982158622384815e-06, + "loss": 0.0883, + "step": 1446 + }, + { + "epoch": 0.15446569058858042, + "grad_norm": 10.839138101904458, + "learning_rate": 1.9820182859240824e-06, + "loss": -0.2504, + "step": 1448 + }, + { + "epoch": 0.15467904098994587, + "grad_norm": 27.771953229709435, + "learning_rate": 1.981877404702174e-06, + "loss": -0.4597, + "step": 1450 + }, + { + "epoch": 0.1548923913913113, + "grad_norm": 15.71893609797128, + "learning_rate": 1.9817359787972404e-06, + "loss": -0.5152, + "step": 1452 + }, + { + "epoch": 0.15510574179267675, + "grad_norm": 19.049058206585627, + "learning_rate": 1.9815940082877363e-06, + "loss": 0.2758, + "step": 1454 + }, + { + "epoch": 0.1553190921940422, + "grad_norm": 10.809514750469875, + "learning_rate": 1.9814514932524176e-06, + "loss": 0.4074, + "step": 1456 + }, + { + "epoch": 0.15553244259540763, + "grad_norm": 12.124473119383847, + "learning_rate": 1.981308433770343e-06, + "loss": -0.7027, + "step": 1458 + }, + { + "epoch": 0.15574579299677307, + "grad_norm": 11.324041223470877, + "learning_rate": 1.9811648299208726e-06, + "loss": -1.1957, + "step": 1460 + }, + { + "epoch": 0.15595914339813852, + "grad_norm": 12.422283117147003, + "learning_rate": 1.9810206817836682e-06, + "loss": 0.0484, + "step": 1462 + }, + { + "epoch": 0.15617249379950396, + "grad_norm": 12.624745930225414, + "learning_rate": 1.9808759894386945e-06, + "loss": -0.8822, + "step": 1464 + }, + { + "epoch": 0.1563858442008694, + "grad_norm": 13.78310812890328, + "learning_rate": 1.9807307529662174e-06, + "loss": -0.3963, + "step": 1466 + }, + { + "epoch": 0.15659919460223484, + "grad_norm": 4.4208276670995135, + "learning_rate": 1.9805849724468046e-06, + "loss": -0.0069, + "step": 1468 + }, + { + "epoch": 0.15681254500360028, + "grad_norm": 7.565092495263654, + "learning_rate": 1.9804386479613267e-06, + "loss": 0.3157, + "step": 1470 + }, + { + "epoch": 0.15702589540496573, + "grad_norm": 12.825713095133725, + "learning_rate": 1.980291779590954e-06, + "loss": 0.5112, + "step": 1472 + }, + { + "epoch": 0.15723924580633117, + "grad_norm": 7.5102366021430464, + "learning_rate": 1.980144367417161e-06, + "loss": -0.2039, + "step": 1474 + }, + { + "epoch": 0.1574525962076966, + "grad_norm": 6.8029536488511555, + "learning_rate": 1.979996411521722e-06, + "loss": -0.7558, + "step": 1476 + }, + { + "epoch": 0.15766594660906205, + "grad_norm": 15.641742410552869, + "learning_rate": 1.9798479119867133e-06, + "loss": -0.7574, + "step": 1478 + }, + { + "epoch": 0.1578792970104275, + "grad_norm": 12.63337979627095, + "learning_rate": 1.9796988688945125e-06, + "loss": -0.7699, + "step": 1480 + }, + { + "epoch": 0.15809264741179294, + "grad_norm": 11.011081565801963, + "learning_rate": 1.9795492823278006e-06, + "loss": -0.8634, + "step": 1482 + }, + { + "epoch": 0.15830599781315838, + "grad_norm": 8.113917603381717, + "learning_rate": 1.9793991523695575e-06, + "loss": 0.1113, + "step": 1484 + }, + { + "epoch": 0.15851934821452382, + "grad_norm": 5.879735279694428, + "learning_rate": 1.9792484791030664e-06, + "loss": -0.312, + "step": 1486 + }, + { + "epoch": 0.15873269861588926, + "grad_norm": 7.522005806855097, + "learning_rate": 1.979097262611911e-06, + "loss": 0.0032, + "step": 1488 + }, + { + "epoch": 0.1589460490172547, + "grad_norm": 13.65889902407673, + "learning_rate": 1.9789455029799764e-06, + "loss": -0.2951, + "step": 1490 + }, + { + "epoch": 0.15915939941862015, + "grad_norm": 8.700882293835516, + "learning_rate": 1.9787932002914495e-06, + "loss": -0.7851, + "step": 1492 + }, + { + "epoch": 0.1593727498199856, + "grad_norm": 7.9538304857949305, + "learning_rate": 1.978640354630818e-06, + "loss": 0.1009, + "step": 1494 + }, + { + "epoch": 0.15958610022135103, + "grad_norm": 10.861290314706679, + "learning_rate": 1.978486966082871e-06, + "loss": 0.3189, + "step": 1496 + }, + { + "epoch": 0.15979945062271647, + "grad_norm": 7.444611191380993, + "learning_rate": 1.9783330347326983e-06, + "loss": 0.515, + "step": 1498 + }, + { + "epoch": 0.1600128010240819, + "grad_norm": 11.912608153339749, + "learning_rate": 1.9781785606656914e-06, + "loss": 0.053, + "step": 1500 + }, + { + "epoch": 0.16022615142544738, + "grad_norm": 10.988092252697443, + "learning_rate": 1.978023543967543e-06, + "loss": -0.3173, + "step": 1502 + }, + { + "epoch": 0.16043950182681282, + "grad_norm": 19.7730500472891, + "learning_rate": 1.9778679847242463e-06, + "loss": -0.4978, + "step": 1504 + }, + { + "epoch": 0.16065285222817827, + "grad_norm": 7.565552992288537, + "learning_rate": 1.9777118830220954e-06, + "loss": -0.2651, + "step": 1506 + }, + { + "epoch": 0.1608662026295437, + "grad_norm": 7.04611081329321, + "learning_rate": 1.9775552389476863e-06, + "loss": -0.2007, + "step": 1508 + }, + { + "epoch": 0.16107955303090915, + "grad_norm": 21.76340794506112, + "learning_rate": 1.977398052587914e-06, + "loss": -0.3304, + "step": 1510 + }, + { + "epoch": 0.1612929034322746, + "grad_norm": 11.941091552743108, + "learning_rate": 1.9772403240299765e-06, + "loss": -0.5119, + "step": 1512 + }, + { + "epoch": 0.16150625383364003, + "grad_norm": 13.448296358290223, + "learning_rate": 1.977082053361371e-06, + "loss": -0.6099, + "step": 1514 + }, + { + "epoch": 0.16171960423500548, + "grad_norm": 8.098957923702608, + "learning_rate": 1.9769232406698964e-06, + "loss": -0.2868, + "step": 1516 + }, + { + "epoch": 0.16193295463637092, + "grad_norm": 15.658109835847851, + "learning_rate": 1.9767638860436518e-06, + "loss": -1.877, + "step": 1518 + }, + { + "epoch": 0.16214630503773636, + "grad_norm": 10.196319213005983, + "learning_rate": 1.9766039895710364e-06, + "loss": -0.0268, + "step": 1520 + }, + { + "epoch": 0.1623596554391018, + "grad_norm": 14.344746364689398, + "learning_rate": 1.9764435513407516e-06, + "loss": -0.676, + "step": 1522 + }, + { + "epoch": 0.16257300584046724, + "grad_norm": 14.616404912722821, + "learning_rate": 1.976282571441797e-06, + "loss": 0.1502, + "step": 1524 + }, + { + "epoch": 0.16278635624183269, + "grad_norm": 14.30221231141689, + "learning_rate": 1.976121049963475e-06, + "loss": -0.0454, + "step": 1526 + }, + { + "epoch": 0.16299970664319813, + "grad_norm": 13.915829726975026, + "learning_rate": 1.975958986995387e-06, + "loss": -0.5742, + "step": 1528 + }, + { + "epoch": 0.16321305704456357, + "grad_norm": 13.686767546017077, + "learning_rate": 1.9757963826274354e-06, + "loss": -0.1671, + "step": 1530 + }, + { + "epoch": 0.163426407445929, + "grad_norm": 3.9764711677563103, + "learning_rate": 1.975633236949823e-06, + "loss": -0.3207, + "step": 1532 + }, + { + "epoch": 0.16363975784729445, + "grad_norm": 3.1340618346223548, + "learning_rate": 1.9754695500530516e-06, + "loss": 0.0105, + "step": 1534 + }, + { + "epoch": 0.1638531082486599, + "grad_norm": 6.283022078366911, + "learning_rate": 1.975305322027926e-06, + "loss": -0.6137, + "step": 1536 + }, + { + "epoch": 0.16406645865002534, + "grad_norm": 20.324734801308242, + "learning_rate": 1.9751405529655473e-06, + "loss": -0.1634, + "step": 1538 + }, + { + "epoch": 0.16427980905139078, + "grad_norm": 9.296359495009186, + "learning_rate": 1.9749752429573204e-06, + "loss": 0.4357, + "step": 1540 + }, + { + "epoch": 0.16449315945275622, + "grad_norm": 18.66220522940308, + "learning_rate": 1.9748093920949485e-06, + "loss": -0.8267, + "step": 1542 + }, + { + "epoch": 0.16470650985412166, + "grad_norm": 36.09313965747721, + "learning_rate": 1.974643000470435e-06, + "loss": -0.535, + "step": 1544 + }, + { + "epoch": 0.1649198602554871, + "grad_norm": 14.698967454534976, + "learning_rate": 1.9744760681760832e-06, + "loss": 0.5589, + "step": 1546 + }, + { + "epoch": 0.16513321065685255, + "grad_norm": 12.989032273668574, + "learning_rate": 1.9743085953044963e-06, + "loss": -0.6752, + "step": 1548 + }, + { + "epoch": 0.165346561058218, + "grad_norm": 12.687629449396232, + "learning_rate": 1.9741405819485782e-06, + "loss": -0.2574, + "step": 1550 + }, + { + "epoch": 0.16555991145958343, + "grad_norm": 12.16900146230176, + "learning_rate": 1.973972028201532e-06, + "loss": 0.5595, + "step": 1552 + }, + { + "epoch": 0.16577326186094887, + "grad_norm": 7.84906480756588, + "learning_rate": 1.97380293415686e-06, + "loss": 0.3431, + "step": 1554 + }, + { + "epoch": 0.16598661226231431, + "grad_norm": 13.810151633893732, + "learning_rate": 1.9736332999083647e-06, + "loss": -0.1136, + "step": 1556 + }, + { + "epoch": 0.16619996266367976, + "grad_norm": 23.64843089807792, + "learning_rate": 1.973463125550149e-06, + "loss": -0.2038, + "step": 1558 + }, + { + "epoch": 0.1664133130650452, + "grad_norm": 6.1814145775741745, + "learning_rate": 1.9732924111766148e-06, + "loss": -0.0059, + "step": 1560 + }, + { + "epoch": 0.16662666346641064, + "grad_norm": 9.735355549985982, + "learning_rate": 1.973121156882463e-06, + "loss": -0.1945, + "step": 1562 + }, + { + "epoch": 0.16684001386777608, + "grad_norm": 10.150551002834774, + "learning_rate": 1.972949362762695e-06, + "loss": -0.9193, + "step": 1564 + }, + { + "epoch": 0.16705336426914152, + "grad_norm": 12.368494093848792, + "learning_rate": 1.972777028912611e-06, + "loss": -0.9772, + "step": 1566 + }, + { + "epoch": 0.16726671467050697, + "grad_norm": 20.463056118932432, + "learning_rate": 1.972604155427811e-06, + "loss": 0.3249, + "step": 1568 + }, + { + "epoch": 0.1674800650718724, + "grad_norm": 4.719164605200082, + "learning_rate": 1.9724307424041943e-06, + "loss": 0.4832, + "step": 1570 + }, + { + "epoch": 0.16769341547323785, + "grad_norm": 13.753871214297394, + "learning_rate": 1.972256789937959e-06, + "loss": 0.2147, + "step": 1572 + }, + { + "epoch": 0.1679067658746033, + "grad_norm": 8.370963490532764, + "learning_rate": 1.9720822981256032e-06, + "loss": -0.3386, + "step": 1574 + }, + { + "epoch": 0.16812011627596873, + "grad_norm": 6.477965817350643, + "learning_rate": 1.971907267063924e-06, + "loss": -0.5137, + "step": 1576 + }, + { + "epoch": 0.16833346667733418, + "grad_norm": 8.997371018929906, + "learning_rate": 1.9717316968500165e-06, + "loss": -0.0199, + "step": 1578 + }, + { + "epoch": 0.16854681707869962, + "grad_norm": 6.994891692529196, + "learning_rate": 1.971555587581277e-06, + "loss": -0.9777, + "step": 1580 + }, + { + "epoch": 0.16876016748006506, + "grad_norm": 12.156101613671975, + "learning_rate": 1.971378939355399e-06, + "loss": -0.0555, + "step": 1582 + }, + { + "epoch": 0.1689735178814305, + "grad_norm": 19.837439965100955, + "learning_rate": 1.971201752270376e-06, + "loss": -0.2187, + "step": 1584 + }, + { + "epoch": 0.16918686828279594, + "grad_norm": 23.06678942310109, + "learning_rate": 1.9710240264245003e-06, + "loss": -1.3332, + "step": 1586 + }, + { + "epoch": 0.1694002186841614, + "grad_norm": 6.863429399480339, + "learning_rate": 1.9708457619163627e-06, + "loss": 0.2579, + "step": 1588 + }, + { + "epoch": 0.16961356908552686, + "grad_norm": 3.240255593370213, + "learning_rate": 1.970666958844853e-06, + "loss": -0.2973, + "step": 1590 + }, + { + "epoch": 0.1698269194868923, + "grad_norm": 10.38048415473619, + "learning_rate": 1.9704876173091593e-06, + "loss": -1.2771, + "step": 1592 + }, + { + "epoch": 0.17004026988825774, + "grad_norm": 9.3643177662047, + "learning_rate": 1.9703077374087692e-06, + "loss": -0.8776, + "step": 1594 + }, + { + "epoch": 0.17025362028962318, + "grad_norm": 11.223809862732512, + "learning_rate": 1.9701273192434687e-06, + "loss": 0.1148, + "step": 1596 + }, + { + "epoch": 0.17046697069098862, + "grad_norm": 3.1834765213057814, + "learning_rate": 1.9699463629133423e-06, + "loss": 0.4166, + "step": 1598 + }, + { + "epoch": 0.17068032109235406, + "grad_norm": 18.265604789498173, + "learning_rate": 1.969764868518773e-06, + "loss": -1.0235, + "step": 1600 + }, + { + "epoch": 0.1708936714937195, + "grad_norm": 8.13932852787834, + "learning_rate": 1.9695828361604426e-06, + "loss": -0.2186, + "step": 1602 + }, + { + "epoch": 0.17110702189508495, + "grad_norm": 5.110377199191138, + "learning_rate": 1.9694002659393305e-06, + "loss": -1.507, + "step": 1604 + }, + { + "epoch": 0.1713203722964504, + "grad_norm": 9.437111240728154, + "learning_rate": 1.9692171579567153e-06, + "loss": -0.4275, + "step": 1606 + }, + { + "epoch": 0.17153372269781583, + "grad_norm": 6.317175247246368, + "learning_rate": 1.9690335123141736e-06, + "loss": -0.4688, + "step": 1608 + }, + { + "epoch": 0.17174707309918127, + "grad_norm": 8.211655480975368, + "learning_rate": 1.96884932911358e-06, + "loss": -1.2091, + "step": 1610 + }, + { + "epoch": 0.17196042350054672, + "grad_norm": 9.167271377226179, + "learning_rate": 1.9686646084571088e-06, + "loss": 0.0305, + "step": 1612 + }, + { + "epoch": 0.17217377390191216, + "grad_norm": 18.132861007920468, + "learning_rate": 1.9684793504472297e-06, + "loss": 0.9484, + "step": 1614 + }, + { + "epoch": 0.1723871243032776, + "grad_norm": 9.030723482308117, + "learning_rate": 1.9682935551867126e-06, + "loss": -0.6449, + "step": 1616 + }, + { + "epoch": 0.17260047470464304, + "grad_norm": 7.535160574600248, + "learning_rate": 1.9681072227786257e-06, + "loss": -0.6224, + "step": 1618 + }, + { + "epoch": 0.17281382510600848, + "grad_norm": 6.8318680978024595, + "learning_rate": 1.9679203533263333e-06, + "loss": -0.466, + "step": 1620 + }, + { + "epoch": 0.17302717550737393, + "grad_norm": 11.853415128133399, + "learning_rate": 1.967732946933499e-06, + "loss": -0.3621, + "step": 1622 + }, + { + "epoch": 0.17324052590873937, + "grad_norm": 8.62809433845374, + "learning_rate": 1.9675450037040835e-06, + "loss": -0.7561, + "step": 1624 + }, + { + "epoch": 0.1734538763101048, + "grad_norm": 10.182519096586946, + "learning_rate": 1.967356523742347e-06, + "loss": -0.0543, + "step": 1626 + }, + { + "epoch": 0.17366722671147025, + "grad_norm": 4.9289897884289635, + "learning_rate": 1.967167507152845e-06, + "loss": 0.1213, + "step": 1628 + }, + { + "epoch": 0.1738805771128357, + "grad_norm": 13.67496482188253, + "learning_rate": 1.9669779540404317e-06, + "loss": 0.0458, + "step": 1630 + }, + { + "epoch": 0.17409392751420114, + "grad_norm": 3.9691478655360264, + "learning_rate": 1.9667878645102602e-06, + "loss": -0.7859, + "step": 1632 + }, + { + "epoch": 0.17430727791556658, + "grad_norm": 8.028314392529628, + "learning_rate": 1.9665972386677795e-06, + "loss": -0.1328, + "step": 1634 + }, + { + "epoch": 0.17452062831693202, + "grad_norm": 23.734954082296028, + "learning_rate": 1.9664060766187363e-06, + "loss": 0.0432, + "step": 1636 + }, + { + "epoch": 0.17473397871829746, + "grad_norm": 2.062739370041361, + "learning_rate": 1.9662143784691755e-06, + "loss": -0.421, + "step": 1638 + }, + { + "epoch": 0.1749473291196629, + "grad_norm": 10.085952838950082, + "learning_rate": 1.966022144325439e-06, + "loss": -1.5764, + "step": 1640 + }, + { + "epoch": 0.17516067952102835, + "grad_norm": 12.13344771384856, + "learning_rate": 1.9658293742941664e-06, + "loss": -0.715, + "step": 1642 + }, + { + "epoch": 0.1753740299223938, + "grad_norm": 5.349791815496225, + "learning_rate": 1.9656360684822936e-06, + "loss": -0.0556, + "step": 1644 + }, + { + "epoch": 0.17558738032375923, + "grad_norm": 25.20028676441013, + "learning_rate": 1.9654422269970543e-06, + "loss": 0.1781, + "step": 1646 + }, + { + "epoch": 0.17580073072512467, + "grad_norm": 10.844013728581611, + "learning_rate": 1.96524784994598e-06, + "loss": -0.1389, + "step": 1648 + }, + { + "epoch": 0.1760140811264901, + "grad_norm": 6.66565646129455, + "learning_rate": 1.9650529374368986e-06, + "loss": -0.6699, + "step": 1650 + }, + { + "epoch": 0.17622743152785555, + "grad_norm": 8.012849338836023, + "learning_rate": 1.9648574895779347e-06, + "loss": -0.2213, + "step": 1652 + }, + { + "epoch": 0.176440781929221, + "grad_norm": 17.49951854490355, + "learning_rate": 1.9646615064775105e-06, + "loss": -0.121, + "step": 1654 + }, + { + "epoch": 0.17665413233058644, + "grad_norm": 23.843475618352024, + "learning_rate": 1.9644649882443453e-06, + "loss": -0.7095, + "step": 1656 + }, + { + "epoch": 0.17686748273195188, + "grad_norm": 10.069359913823204, + "learning_rate": 1.9642679349874544e-06, + "loss": -0.2746, + "step": 1658 + }, + { + "epoch": 0.17708083313331732, + "grad_norm": 8.436240471923892, + "learning_rate": 1.9640703468161507e-06, + "loss": -0.2465, + "step": 1660 + }, + { + "epoch": 0.17729418353468276, + "grad_norm": 19.20945740161944, + "learning_rate": 1.9638722238400433e-06, + "loss": -0.4992, + "step": 1662 + }, + { + "epoch": 0.1775075339360482, + "grad_norm": 15.246967161527042, + "learning_rate": 1.963673566169038e-06, + "loss": 0.0541, + "step": 1664 + }, + { + "epoch": 0.17772088433741365, + "grad_norm": 22.910616685380933, + "learning_rate": 1.9634743739133387e-06, + "loss": -0.0508, + "step": 1666 + }, + { + "epoch": 0.1779342347387791, + "grad_norm": 9.253592958142804, + "learning_rate": 1.963274647183443e-06, + "loss": -0.6378, + "step": 1668 + }, + { + "epoch": 0.17814758514014453, + "grad_norm": 7.223034507517141, + "learning_rate": 1.963074386090147e-06, + "loss": -0.7481, + "step": 1670 + }, + { + "epoch": 0.17836093554150997, + "grad_norm": 20.0730233250035, + "learning_rate": 1.9628735907445437e-06, + "loss": -1.1028, + "step": 1672 + }, + { + "epoch": 0.17857428594287544, + "grad_norm": 11.506960048731827, + "learning_rate": 1.96267226125802e-06, + "loss": -0.1797, + "step": 1674 + }, + { + "epoch": 0.17878763634424089, + "grad_norm": 8.61513532588195, + "learning_rate": 1.962470397742262e-06, + "loss": 0.3524, + "step": 1676 + }, + { + "epoch": 0.17900098674560633, + "grad_norm": 8.862740228622737, + "learning_rate": 1.9622680003092503e-06, + "loss": -0.146, + "step": 1678 + }, + { + "epoch": 0.17921433714697177, + "grad_norm": 12.139553131432608, + "learning_rate": 1.9620650690712618e-06, + "loss": -1.4315, + "step": 1680 + }, + { + "epoch": 0.1794276875483372, + "grad_norm": 10.44652923791041, + "learning_rate": 1.9618616041408703e-06, + "loss": 0.3068, + "step": 1682 + }, + { + "epoch": 0.17964103794970265, + "grad_norm": 14.831352041276144, + "learning_rate": 1.9616576056309447e-06, + "loss": 0.1342, + "step": 1684 + }, + { + "epoch": 0.1798543883510681, + "grad_norm": 6.713153774669659, + "learning_rate": 1.9614530736546507e-06, + "loss": -0.3189, + "step": 1686 + }, + { + "epoch": 0.18006773875243354, + "grad_norm": 12.095730762690835, + "learning_rate": 1.9612480083254496e-06, + "loss": -0.659, + "step": 1688 + }, + { + "epoch": 0.18028108915379898, + "grad_norm": 5.424181191521463, + "learning_rate": 1.9610424097570983e-06, + "loss": 0.4658, + "step": 1690 + }, + { + "epoch": 0.18049443955516442, + "grad_norm": 7.526381273568718, + "learning_rate": 1.9608362780636503e-06, + "loss": -0.1184, + "step": 1692 + }, + { + "epoch": 0.18070778995652986, + "grad_norm": 50.05765764524042, + "learning_rate": 1.9606296133594538e-06, + "loss": 0.3701, + "step": 1694 + }, + { + "epoch": 0.1809211403578953, + "grad_norm": 20.147742107794826, + "learning_rate": 1.9604224157591537e-06, + "loss": 0.2118, + "step": 1696 + }, + { + "epoch": 0.18113449075926075, + "grad_norm": 7.219813897301699, + "learning_rate": 1.960214685377689e-06, + "loss": -0.3109, + "step": 1698 + }, + { + "epoch": 0.1813478411606262, + "grad_norm": 8.540044296402842, + "learning_rate": 1.960006422330297e-06, + "loss": -0.6274, + "step": 1700 + }, + { + "epoch": 0.18156119156199163, + "grad_norm": 8.159546369231789, + "learning_rate": 1.9597976267325072e-06, + "loss": -0.4328, + "step": 1702 + }, + { + "epoch": 0.18177454196335707, + "grad_norm": 5.447742906774788, + "learning_rate": 1.959588298700147e-06, + "loss": -0.4139, + "step": 1704 + }, + { + "epoch": 0.18198789236472251, + "grad_norm": 4.784512936635263, + "learning_rate": 1.9593784383493377e-06, + "loss": -0.2258, + "step": 1706 + }, + { + "epoch": 0.18220124276608796, + "grad_norm": 12.694443490406826, + "learning_rate": 1.959168045796497e-06, + "loss": -0.402, + "step": 1708 + }, + { + "epoch": 0.1824145931674534, + "grad_norm": 20.749368077941035, + "learning_rate": 1.9589571211583367e-06, + "loss": -0.8768, + "step": 1710 + }, + { + "epoch": 0.18262794356881884, + "grad_norm": 10.332727657869777, + "learning_rate": 1.958745664551865e-06, + "loss": -0.4763, + "step": 1712 + }, + { + "epoch": 0.18284129397018428, + "grad_norm": 14.081473828895792, + "learning_rate": 1.9585336760943838e-06, + "loss": -0.7508, + "step": 1714 + }, + { + "epoch": 0.18305464437154972, + "grad_norm": 8.083033120627904, + "learning_rate": 1.9583211559034912e-06, + "loss": 0.2669, + "step": 1716 + }, + { + "epoch": 0.18326799477291517, + "grad_norm": 12.716253561595414, + "learning_rate": 1.9581081040970803e-06, + "loss": -0.3489, + "step": 1718 + }, + { + "epoch": 0.1834813451742806, + "grad_norm": 14.831224862239058, + "learning_rate": 1.9578945207933378e-06, + "loss": -0.257, + "step": 1720 + }, + { + "epoch": 0.18369469557564605, + "grad_norm": 10.0598638106194, + "learning_rate": 1.9576804061107468e-06, + "loss": -0.2818, + "step": 1722 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 11.916352138065047, + "learning_rate": 1.9574657601680837e-06, + "loss": -0.3157, + "step": 1724 + }, + { + "epoch": 0.18412139637837693, + "grad_norm": 13.293812004739776, + "learning_rate": 1.957250583084421e-06, + "loss": -1.3926, + "step": 1726 + }, + { + "epoch": 0.18433474677974238, + "grad_norm": 17.673810422897805, + "learning_rate": 1.9570348749791258e-06, + "loss": 0.3864, + "step": 1728 + }, + { + "epoch": 0.18454809718110782, + "grad_norm": 8.56725810020144, + "learning_rate": 1.956818635971858e-06, + "loss": 0.1284, + "step": 1730 + }, + { + "epoch": 0.18476144758247326, + "grad_norm": 7.539384803035816, + "learning_rate": 1.9566018661825735e-06, + "loss": -0.0348, + "step": 1732 + }, + { + "epoch": 0.1849747979838387, + "grad_norm": 9.139522421612835, + "learning_rate": 1.9563845657315233e-06, + "loss": -0.44, + "step": 1734 + }, + { + "epoch": 0.18518814838520414, + "grad_norm": 5.987861862159247, + "learning_rate": 1.956166734739251e-06, + "loss": -0.1993, + "step": 1736 + }, + { + "epoch": 0.18540149878656959, + "grad_norm": 8.191304855710381, + "learning_rate": 1.955948373326595e-06, + "loss": 0.0556, + "step": 1738 + }, + { + "epoch": 0.18561484918793503, + "grad_norm": 10.731204984938158, + "learning_rate": 1.9557294816146896e-06, + "loss": -0.3543, + "step": 1740 + }, + { + "epoch": 0.18582819958930047, + "grad_norm": 19.367490367006926, + "learning_rate": 1.9555100597249606e-06, + "loss": 0.349, + "step": 1742 + }, + { + "epoch": 0.1860415499906659, + "grad_norm": 26.972692980980362, + "learning_rate": 1.9552901077791305e-06, + "loss": -0.7377, + "step": 1744 + }, + { + "epoch": 0.18625490039203135, + "grad_norm": 15.656004899623815, + "learning_rate": 1.9550696258992135e-06, + "loss": -0.0474, + "step": 1746 + }, + { + "epoch": 0.1864682507933968, + "grad_norm": 7.872925492553763, + "learning_rate": 1.9548486142075195e-06, + "loss": -1.9635, + "step": 1748 + }, + { + "epoch": 0.18668160119476224, + "grad_norm": 11.0120416086597, + "learning_rate": 1.954627072826652e-06, + "loss": 0.3609, + "step": 1750 + }, + { + "epoch": 0.18689495159612768, + "grad_norm": 16.318707337248572, + "learning_rate": 1.9544050018795075e-06, + "loss": -0.2341, + "step": 1752 + }, + { + "epoch": 0.18710830199749312, + "grad_norm": 6.793061500721003, + "learning_rate": 1.9541824014892766e-06, + "loss": -0.8057, + "step": 1754 + }, + { + "epoch": 0.18732165239885856, + "grad_norm": 8.257666721715685, + "learning_rate": 1.953959271779445e-06, + "loss": -1.125, + "step": 1756 + }, + { + "epoch": 0.187535002800224, + "grad_norm": 2.2312803347930865, + "learning_rate": 1.953735612873789e-06, + "loss": -0.529, + "step": 1758 + }, + { + "epoch": 0.18774835320158947, + "grad_norm": 6.494227985980778, + "learning_rate": 1.9535114248963823e-06, + "loss": 0.4977, + "step": 1760 + }, + { + "epoch": 0.18796170360295492, + "grad_norm": 2.6225868521760884, + "learning_rate": 1.9532867079715885e-06, + "loss": 0.1818, + "step": 1762 + }, + { + "epoch": 0.18817505400432036, + "grad_norm": 13.169535473158854, + "learning_rate": 1.953061462224067e-06, + "loss": -1.5464, + "step": 1764 + }, + { + "epoch": 0.1883884044056858, + "grad_norm": 5.961894477938251, + "learning_rate": 1.95283568777877e-06, + "loss": -0.0843, + "step": 1766 + }, + { + "epoch": 0.18860175480705124, + "grad_norm": 17.790751191487725, + "learning_rate": 1.9526093847609425e-06, + "loss": -0.193, + "step": 1768 + }, + { + "epoch": 0.18881510520841668, + "grad_norm": 4.894851849522952, + "learning_rate": 1.9523825532961226e-06, + "loss": 0.3596, + "step": 1770 + }, + { + "epoch": 0.18902845560978213, + "grad_norm": 26.609231468601866, + "learning_rate": 1.9521551935101422e-06, + "loss": -1.0124, + "step": 1772 + }, + { + "epoch": 0.18924180601114757, + "grad_norm": 11.973906564469583, + "learning_rate": 1.9519273055291264e-06, + "loss": -0.7964, + "step": 1774 + }, + { + "epoch": 0.189455156412513, + "grad_norm": 5.530101076170713, + "learning_rate": 1.9516988894794926e-06, + "loss": -0.6191, + "step": 1776 + }, + { + "epoch": 0.18966850681387845, + "grad_norm": 8.679591909640877, + "learning_rate": 1.9514699454879515e-06, + "loss": -0.4146, + "step": 1778 + }, + { + "epoch": 0.1898818572152439, + "grad_norm": 11.728585457641236, + "learning_rate": 1.951240473681507e-06, + "loss": -0.4557, + "step": 1780 + }, + { + "epoch": 0.19009520761660934, + "grad_norm": 10.564389393242319, + "learning_rate": 1.9510104741874544e-06, + "loss": 0.2524, + "step": 1782 + }, + { + "epoch": 0.19030855801797478, + "grad_norm": 9.41590267221953, + "learning_rate": 1.9507799471333838e-06, + "loss": -0.6369, + "step": 1784 + }, + { + "epoch": 0.19052190841934022, + "grad_norm": 37.30175714716622, + "learning_rate": 1.950548892647177e-06, + "loss": -0.248, + "step": 1786 + }, + { + "epoch": 0.19073525882070566, + "grad_norm": 3.3075363360363474, + "learning_rate": 1.9503173108570073e-06, + "loss": 0.5654, + "step": 1788 + }, + { + "epoch": 0.1909486092220711, + "grad_norm": 24.48449478152585, + "learning_rate": 1.9500852018913423e-06, + "loss": -0.2383, + "step": 1790 + }, + { + "epoch": 0.19116195962343654, + "grad_norm": 15.853128801306614, + "learning_rate": 1.9498525658789415e-06, + "loss": -0.0832, + "step": 1792 + }, + { + "epoch": 0.191375310024802, + "grad_norm": 12.829677105255076, + "learning_rate": 1.9496194029488554e-06, + "loss": -0.9211, + "step": 1794 + }, + { + "epoch": 0.19158866042616743, + "grad_norm": 12.212923633405069, + "learning_rate": 1.949385713230429e-06, + "loss": -0.7124, + "step": 1796 + }, + { + "epoch": 0.19180201082753287, + "grad_norm": 9.920453632812915, + "learning_rate": 1.949151496853298e-06, + "loss": -0.0601, + "step": 1798 + }, + { + "epoch": 0.1920153612288983, + "grad_norm": 28.477961536263486, + "learning_rate": 1.9489167539473913e-06, + "loss": -0.2516, + "step": 1800 + }, + { + "epoch": 0.19222871163026375, + "grad_norm": 9.29396040625149, + "learning_rate": 1.9486814846429283e-06, + "loss": -0.5495, + "step": 1802 + }, + { + "epoch": 0.1924420620316292, + "grad_norm": 10.643418633887284, + "learning_rate": 1.948445689070422e-06, + "loss": -0.0461, + "step": 1804 + }, + { + "epoch": 0.19265541243299464, + "grad_norm": 6.505476361510747, + "learning_rate": 1.9482093673606772e-06, + "loss": -0.5896, + "step": 1806 + }, + { + "epoch": 0.19286876283436008, + "grad_norm": 10.585505862392617, + "learning_rate": 1.9479725196447896e-06, + "loss": 0.2174, + "step": 1808 + }, + { + "epoch": 0.19308211323572552, + "grad_norm": 10.74347579890419, + "learning_rate": 1.9477351460541472e-06, + "loss": 0.1333, + "step": 1810 + }, + { + "epoch": 0.19329546363709096, + "grad_norm": 7.644940311892833, + "learning_rate": 1.9474972467204294e-06, + "loss": -0.2822, + "step": 1812 + }, + { + "epoch": 0.1935088140384564, + "grad_norm": 4.730509511046018, + "learning_rate": 1.9472588217756086e-06, + "loss": 0.3581, + "step": 1814 + }, + { + "epoch": 0.19372216443982185, + "grad_norm": 5.724269421319521, + "learning_rate": 1.947019871351947e-06, + "loss": 0.567, + "step": 1816 + }, + { + "epoch": 0.1939355148411873, + "grad_norm": 11.725234173450394, + "learning_rate": 1.946780395581999e-06, + "loss": -0.1575, + "step": 1818 + }, + { + "epoch": 0.19414886524255273, + "grad_norm": 13.16677586483919, + "learning_rate": 1.946540394598611e-06, + "loss": -0.3832, + "step": 1820 + }, + { + "epoch": 0.19436221564391817, + "grad_norm": 12.990609192207351, + "learning_rate": 1.94629986853492e-06, + "loss": -0.4566, + "step": 1822 + }, + { + "epoch": 0.19457556604528362, + "grad_norm": 9.364327074112744, + "learning_rate": 1.9460588175243548e-06, + "loss": -0.1461, + "step": 1824 + }, + { + "epoch": 0.19478891644664906, + "grad_norm": 7.775551872545612, + "learning_rate": 1.9458172417006346e-06, + "loss": -0.9538, + "step": 1826 + }, + { + "epoch": 0.1950022668480145, + "grad_norm": 9.883774622479446, + "learning_rate": 1.9455751411977707e-06, + "loss": -0.6082, + "step": 1828 + }, + { + "epoch": 0.19521561724937994, + "grad_norm": 3.7575112892591123, + "learning_rate": 1.9453325161500646e-06, + "loss": -0.5966, + "step": 1830 + }, + { + "epoch": 0.19542896765074538, + "grad_norm": 5.420267155635353, + "learning_rate": 1.9450893666921098e-06, + "loss": 0.5389, + "step": 1832 + }, + { + "epoch": 0.19564231805211083, + "grad_norm": 8.197562118545699, + "learning_rate": 1.9448456929587898e-06, + "loss": 0.44, + "step": 1834 + }, + { + "epoch": 0.19585566845347627, + "grad_norm": 8.873455599519605, + "learning_rate": 1.9446014950852793e-06, + "loss": -0.6835, + "step": 1836 + }, + { + "epoch": 0.1960690188548417, + "grad_norm": 14.915885818798811, + "learning_rate": 1.9443567732070434e-06, + "loss": -0.6561, + "step": 1838 + }, + { + "epoch": 0.19628236925620715, + "grad_norm": 4.877252604264543, + "learning_rate": 1.944111527459839e-06, + "loss": -1.046, + "step": 1840 + }, + { + "epoch": 0.1964957196575726, + "grad_norm": 12.056248833737559, + "learning_rate": 1.9438657579797125e-06, + "loss": 0.0056, + "step": 1842 + }, + { + "epoch": 0.19670907005893803, + "grad_norm": 13.13224191691086, + "learning_rate": 1.9436194649030004e-06, + "loss": -0.0449, + "step": 1844 + }, + { + "epoch": 0.1969224204603035, + "grad_norm": 17.727545929984956, + "learning_rate": 1.9433726483663314e-06, + "loss": -0.6246, + "step": 1846 + }, + { + "epoch": 0.19713577086166895, + "grad_norm": 17.78918925352426, + "learning_rate": 1.943125308506623e-06, + "loss": -0.824, + "step": 1848 + }, + { + "epoch": 0.1973491212630344, + "grad_norm": 14.969243154757692, + "learning_rate": 1.942877445461084e-06, + "loss": 0.241, + "step": 1850 + }, + { + "epoch": 0.19756247166439983, + "grad_norm": 6.31904123376743, + "learning_rate": 1.942629059367213e-06, + "loss": -0.2239, + "step": 1852 + }, + { + "epoch": 0.19777582206576527, + "grad_norm": 8.57957389022486, + "learning_rate": 1.942380150362798e-06, + "loss": -0.3675, + "step": 1854 + }, + { + "epoch": 0.19798917246713071, + "grad_norm": 9.365987948081294, + "learning_rate": 1.9421307185859188e-06, + "loss": -0.4777, + "step": 1856 + }, + { + "epoch": 0.19820252286849616, + "grad_norm": 8.654835539439935, + "learning_rate": 1.941880764174944e-06, + "loss": -0.7596, + "step": 1858 + }, + { + "epoch": 0.1984158732698616, + "grad_norm": 8.098551922582622, + "learning_rate": 1.941630287268532e-06, + "loss": -0.8832, + "step": 1860 + }, + { + "epoch": 0.19862922367122704, + "grad_norm": 18.988381161750784, + "learning_rate": 1.941379288005632e-06, + "loss": 0.4242, + "step": 1862 + }, + { + "epoch": 0.19884257407259248, + "grad_norm": 16.079436898990718, + "learning_rate": 1.941127766525482e-06, + "loss": -0.3194, + "step": 1864 + }, + { + "epoch": 0.19905592447395792, + "grad_norm": 4.718296280940598, + "learning_rate": 1.9408757229676104e-06, + "loss": -0.324, + "step": 1866 + }, + { + "epoch": 0.19926927487532337, + "grad_norm": 5.673677060566251, + "learning_rate": 1.9406231574718343e-06, + "loss": -0.672, + "step": 1868 + }, + { + "epoch": 0.1994826252766888, + "grad_norm": 10.416325499823452, + "learning_rate": 1.9403700701782616e-06, + "loss": -0.1505, + "step": 1870 + }, + { + "epoch": 0.19969597567805425, + "grad_norm": 12.46215797459305, + "learning_rate": 1.9401164612272888e-06, + "loss": -0.9961, + "step": 1872 + }, + { + "epoch": 0.1999093260794197, + "grad_norm": 6.846444168934443, + "learning_rate": 1.939862330759602e-06, + "loss": 0.1682, + "step": 1874 + }, + { + "epoch": 0.20012267648078513, + "grad_norm": 9.365315641036963, + "learning_rate": 1.939607678916176e-06, + "loss": -0.4999, + "step": 1876 + }, + { + "epoch": 0.20033602688215058, + "grad_norm": 9.893208766151144, + "learning_rate": 1.9393525058382767e-06, + "loss": -0.3294, + "step": 1878 + }, + { + "epoch": 0.20054937728351602, + "grad_norm": 5.540154917275656, + "learning_rate": 1.939096811667457e-06, + "loss": 0.812, + "step": 1880 + }, + { + "epoch": 0.20076272768488146, + "grad_norm": 9.249926531646922, + "learning_rate": 1.9388405965455594e-06, + "loss": -1.6695, + "step": 1882 + }, + { + "epoch": 0.2009760780862469, + "grad_norm": 5.89678233946676, + "learning_rate": 1.9385838606147167e-06, + "loss": -0.0591, + "step": 1884 + }, + { + "epoch": 0.20118942848761234, + "grad_norm": 13.14256888070574, + "learning_rate": 1.938326604017349e-06, + "loss": -1.4884, + "step": 1886 + }, + { + "epoch": 0.20140277888897778, + "grad_norm": 7.94507307842013, + "learning_rate": 1.938068826896166e-06, + "loss": -0.6971, + "step": 1888 + }, + { + "epoch": 0.20161612929034323, + "grad_norm": 8.240327403029974, + "learning_rate": 1.9378105293941654e-06, + "loss": -0.0613, + "step": 1890 + }, + { + "epoch": 0.20182947969170867, + "grad_norm": 9.818936849267306, + "learning_rate": 1.9375517116546355e-06, + "loss": 0.3111, + "step": 1892 + }, + { + "epoch": 0.2020428300930741, + "grad_norm": 17.41443650473765, + "learning_rate": 1.9372923738211513e-06, + "loss": 0.5569, + "step": 1894 + }, + { + "epoch": 0.20225618049443955, + "grad_norm": 14.009083870989118, + "learning_rate": 1.9370325160375765e-06, + "loss": -0.4879, + "step": 1896 + }, + { + "epoch": 0.202469530895805, + "grad_norm": 6.68968872808115, + "learning_rate": 1.936772138448064e-06, + "loss": 0.165, + "step": 1898 + }, + { + "epoch": 0.20268288129717044, + "grad_norm": 11.073425085887829, + "learning_rate": 1.9365112411970546e-06, + "loss": -0.4699, + "step": 1900 + }, + { + "epoch": 0.20289623169853588, + "grad_norm": 13.72806606533039, + "learning_rate": 1.9362498244292777e-06, + "loss": -0.4613, + "step": 1902 + }, + { + "epoch": 0.20310958209990132, + "grad_norm": 7.420626928846953, + "learning_rate": 1.9359878882897504e-06, + "loss": 0.1518, + "step": 1904 + }, + { + "epoch": 0.20332293250126676, + "grad_norm": 24.333305223572363, + "learning_rate": 1.9357254329237782e-06, + "loss": -2.3156, + "step": 1906 + }, + { + "epoch": 0.2035362829026322, + "grad_norm": 9.517310195720865, + "learning_rate": 1.935462458476955e-06, + "loss": -0.1623, + "step": 1908 + }, + { + "epoch": 0.20374963330399765, + "grad_norm": 6.549306906184669, + "learning_rate": 1.9351989650951617e-06, + "loss": -0.0076, + "step": 1910 + }, + { + "epoch": 0.2039629837053631, + "grad_norm": 12.084117634395213, + "learning_rate": 1.934934952924568e-06, + "loss": -0.8675, + "step": 1912 + }, + { + "epoch": 0.20417633410672853, + "grad_norm": 23.006346032724178, + "learning_rate": 1.9346704221116304e-06, + "loss": 0.6388, + "step": 1914 + }, + { + "epoch": 0.20438968450809397, + "grad_norm": 11.25152656365477, + "learning_rate": 1.934405372803095e-06, + "loss": -0.2131, + "step": 1916 + }, + { + "epoch": 0.2046030349094594, + "grad_norm": 15.449233578378294, + "learning_rate": 1.934139805145993e-06, + "loss": -0.0512, + "step": 1918 + }, + { + "epoch": 0.20481638531082486, + "grad_norm": 25.967275303877503, + "learning_rate": 1.9338737192876455e-06, + "loss": -0.6831, + "step": 1920 + }, + { + "epoch": 0.2050297357121903, + "grad_norm": 22.999602017359916, + "learning_rate": 1.933607115375659e-06, + "loss": -0.1199, + "step": 1922 + }, + { + "epoch": 0.20524308611355574, + "grad_norm": 4.605511082587932, + "learning_rate": 1.933339993557929e-06, + "loss": 0.4724, + "step": 1924 + }, + { + "epoch": 0.20545643651492118, + "grad_norm": 11.515265673965152, + "learning_rate": 1.933072353982637e-06, + "loss": -0.601, + "step": 1926 + }, + { + "epoch": 0.20566978691628662, + "grad_norm": 5.596883504256079, + "learning_rate": 1.9328041967982535e-06, + "loss": 0.0537, + "step": 1928 + }, + { + "epoch": 0.20588313731765207, + "grad_norm": 9.822112001208648, + "learning_rate": 1.932535522153534e-06, + "loss": -0.0105, + "step": 1930 + }, + { + "epoch": 0.20609648771901753, + "grad_norm": 7.789424086801903, + "learning_rate": 1.9322663301975227e-06, + "loss": -0.9059, + "step": 1932 + }, + { + "epoch": 0.20630983812038298, + "grad_norm": 5.653484007709651, + "learning_rate": 1.9319966210795497e-06, + "loss": -0.6785, + "step": 1934 + }, + { + "epoch": 0.20652318852174842, + "grad_norm": 5.746107769290736, + "learning_rate": 1.9317263949492324e-06, + "loss": -0.5469, + "step": 1936 + }, + { + "epoch": 0.20673653892311386, + "grad_norm": 21.10901273760898, + "learning_rate": 1.9314556519564753e-06, + "loss": -0.6998, + "step": 1938 + }, + { + "epoch": 0.2069498893244793, + "grad_norm": 7.46443143657138, + "learning_rate": 1.93118439225147e-06, + "loss": -0.4524, + "step": 1940 + }, + { + "epoch": 0.20716323972584474, + "grad_norm": 7.024088389819032, + "learning_rate": 1.930912615984693e-06, + "loss": -0.3906, + "step": 1942 + }, + { + "epoch": 0.2073765901272102, + "grad_norm": 5.505478739537021, + "learning_rate": 1.9306403233069085e-06, + "loss": -0.4535, + "step": 1944 + }, + { + "epoch": 0.20758994052857563, + "grad_norm": 8.886336660167451, + "learning_rate": 1.930367514369168e-06, + "loss": -0.3084, + "step": 1946 + }, + { + "epoch": 0.20780329092994107, + "grad_norm": 5.907548024119052, + "learning_rate": 1.930094189322808e-06, + "loss": -0.6485, + "step": 1948 + }, + { + "epoch": 0.2080166413313065, + "grad_norm": 7.122391838967096, + "learning_rate": 1.9298203483194515e-06, + "loss": -0.2278, + "step": 1950 + }, + { + "epoch": 0.20822999173267195, + "grad_norm": 11.029026530105963, + "learning_rate": 1.929545991511009e-06, + "loss": -0.2532, + "step": 1952 + }, + { + "epoch": 0.2084433421340374, + "grad_norm": 14.189880824138287, + "learning_rate": 1.929271119049675e-06, + "loss": -0.3456, + "step": 1954 + }, + { + "epoch": 0.20865669253540284, + "grad_norm": 4.676708758764112, + "learning_rate": 1.9289957310879317e-06, + "loss": -0.7007, + "step": 1956 + }, + { + "epoch": 0.20887004293676828, + "grad_norm": 7.005194914139183, + "learning_rate": 1.9287198277785472e-06, + "loss": -0.417, + "step": 1958 + }, + { + "epoch": 0.20908339333813372, + "grad_norm": 11.66782653144503, + "learning_rate": 1.928443409274575e-06, + "loss": 0.045, + "step": 1960 + }, + { + "epoch": 0.20929674373949916, + "grad_norm": 8.522371273190833, + "learning_rate": 1.9281664757293535e-06, + "loss": -0.3577, + "step": 1962 + }, + { + "epoch": 0.2095100941408646, + "grad_norm": 7.8639416248062615, + "learning_rate": 1.9278890272965093e-06, + "loss": -0.3496, + "step": 1964 + }, + { + "epoch": 0.20972344454223005, + "grad_norm": 19.19411125509426, + "learning_rate": 1.927611064129952e-06, + "loss": -0.2211, + "step": 1966 + }, + { + "epoch": 0.2099367949435955, + "grad_norm": 13.62906798006574, + "learning_rate": 1.927332586383878e-06, + "loss": -0.3254, + "step": 1968 + }, + { + "epoch": 0.21015014534496093, + "grad_norm": 8.84634286062035, + "learning_rate": 1.9270535942127693e-06, + "loss": -0.9648, + "step": 1970 + }, + { + "epoch": 0.21036349574632637, + "grad_norm": 12.82625400789908, + "learning_rate": 1.9267740877713934e-06, + "loss": -0.9653, + "step": 1972 + }, + { + "epoch": 0.21057684614769182, + "grad_norm": 29.386125708909894, + "learning_rate": 1.9264940672148015e-06, + "loss": -0.2959, + "step": 1974 + }, + { + "epoch": 0.21079019654905726, + "grad_norm": 6.726999887691662, + "learning_rate": 1.9262135326983323e-06, + "loss": 0.0497, + "step": 1976 + }, + { + "epoch": 0.2110035469504227, + "grad_norm": 19.572986593090715, + "learning_rate": 1.925932484377608e-06, + "loss": -0.5565, + "step": 1978 + }, + { + "epoch": 0.21121689735178814, + "grad_norm": 10.420268612115242, + "learning_rate": 1.925650922408536e-06, + "loss": -0.2276, + "step": 1980 + }, + { + "epoch": 0.21143024775315358, + "grad_norm": 14.918445885705127, + "learning_rate": 1.925368846947309e-06, + "loss": -0.7972, + "step": 1982 + }, + { + "epoch": 0.21164359815451902, + "grad_norm": 15.183450954373223, + "learning_rate": 1.9250862581504054e-06, + "loss": -0.9765, + "step": 1984 + }, + { + "epoch": 0.21185694855588447, + "grad_norm": 12.655482376031465, + "learning_rate": 1.924803156174586e-06, + "loss": 0.2892, + "step": 1986 + }, + { + "epoch": 0.2120702989572499, + "grad_norm": 12.319565480053717, + "learning_rate": 1.924519541176899e-06, + "loss": 0.0639, + "step": 1988 + }, + { + "epoch": 0.21228364935861535, + "grad_norm": 11.682148008802292, + "learning_rate": 1.9242354133146755e-06, + "loss": -0.2449, + "step": 1990 + }, + { + "epoch": 0.2124969997599808, + "grad_norm": 10.683369380156918, + "learning_rate": 1.923950772745531e-06, + "loss": -0.6252, + "step": 1992 + }, + { + "epoch": 0.21271035016134623, + "grad_norm": 8.342190814941272, + "learning_rate": 1.9236656196273675e-06, + "loss": -0.3836, + "step": 1994 + }, + { + "epoch": 0.21292370056271168, + "grad_norm": 9.132897947243281, + "learning_rate": 1.9233799541183673e-06, + "loss": 0.242, + "step": 1996 + }, + { + "epoch": 0.21313705096407712, + "grad_norm": 8.302317272541245, + "learning_rate": 1.923093776377002e-06, + "loss": 0.2858, + "step": 1998 + }, + { + "epoch": 0.21335040136544256, + "grad_norm": 15.276476769675117, + "learning_rate": 1.922807086562023e-06, + "loss": -0.3019, + "step": 2000 + }, + { + "epoch": 0.213563751766808, + "grad_norm": 11.619082047088398, + "learning_rate": 1.9225198848324686e-06, + "loss": 0.7847, + "step": 2002 + }, + { + "epoch": 0.21377710216817344, + "grad_norm": 3.525780465406695, + "learning_rate": 1.922232171347659e-06, + "loss": -0.0267, + "step": 2004 + }, + { + "epoch": 0.21399045256953889, + "grad_norm": 13.017357127057464, + "learning_rate": 1.9219439462672005e-06, + "loss": -0.0434, + "step": 2006 + }, + { + "epoch": 0.21420380297090433, + "grad_norm": 8.743693252679565, + "learning_rate": 1.9216552097509813e-06, + "loss": -0.5469, + "step": 2008 + }, + { + "epoch": 0.21441715337226977, + "grad_norm": 8.531453317652758, + "learning_rate": 1.921365961959174e-06, + "loss": -0.5015, + "step": 2010 + }, + { + "epoch": 0.2146305037736352, + "grad_norm": 25.86829560721428, + "learning_rate": 1.921076203052235e-06, + "loss": -1.4061, + "step": 2012 + }, + { + "epoch": 0.21484385417500065, + "grad_norm": 6.313096967252886, + "learning_rate": 1.920785933190904e-06, + "loss": 0.4307, + "step": 2014 + }, + { + "epoch": 0.2150572045763661, + "grad_norm": 12.031438307717737, + "learning_rate": 1.9204951525362043e-06, + "loss": -0.1798, + "step": 2016 + }, + { + "epoch": 0.21527055497773157, + "grad_norm": 8.961712843572498, + "learning_rate": 1.9202038612494425e-06, + "loss": 0.5622, + "step": 2018 + }, + { + "epoch": 0.215483905379097, + "grad_norm": 10.317606430884625, + "learning_rate": 1.9199120594922086e-06, + "loss": 0.2731, + "step": 2020 + }, + { + "epoch": 0.21569725578046245, + "grad_norm": 6.844568645485288, + "learning_rate": 1.919619747426375e-06, + "loss": -0.2694, + "step": 2022 + }, + { + "epoch": 0.2159106061818279, + "grad_norm": 9.827733315135845, + "learning_rate": 1.9193269252140987e-06, + "loss": 0.1818, + "step": 2024 + }, + { + "epoch": 0.21612395658319333, + "grad_norm": 11.343018497482833, + "learning_rate": 1.919033593017818e-06, + "loss": -1.0073, + "step": 2026 + }, + { + "epoch": 0.21633730698455877, + "grad_norm": 28.273270087787377, + "learning_rate": 1.9187397510002556e-06, + "loss": -0.4873, + "step": 2028 + }, + { + "epoch": 0.21655065738592422, + "grad_norm": 8.284068681666643, + "learning_rate": 1.918445399324416e-06, + "loss": 0.2736, + "step": 2030 + }, + { + "epoch": 0.21676400778728966, + "grad_norm": 14.638969215885052, + "learning_rate": 1.918150538153586e-06, + "loss": 0.2588, + "step": 2032 + }, + { + "epoch": 0.2169773581886551, + "grad_norm": 22.94032037481521, + "learning_rate": 1.9178551676513374e-06, + "loss": -0.4289, + "step": 2034 + }, + { + "epoch": 0.21719070859002054, + "grad_norm": 6.7471627466004005, + "learning_rate": 1.9175592879815217e-06, + "loss": -0.5378, + "step": 2036 + }, + { + "epoch": 0.21740405899138598, + "grad_norm": 8.94487355277408, + "learning_rate": 1.9172628993082743e-06, + "loss": -0.3464, + "step": 2038 + }, + { + "epoch": 0.21761740939275143, + "grad_norm": 6.462015369560506, + "learning_rate": 1.9169660017960134e-06, + "loss": -0.3104, + "step": 2040 + }, + { + "epoch": 0.21783075979411687, + "grad_norm": 10.741585094567146, + "learning_rate": 1.916668595609438e-06, + "loss": -0.3802, + "step": 2042 + }, + { + "epoch": 0.2180441101954823, + "grad_norm": 4.590541070024735, + "learning_rate": 1.9163706809135305e-06, + "loss": -0.4504, + "step": 2044 + }, + { + "epoch": 0.21825746059684775, + "grad_norm": 7.9137038139570155, + "learning_rate": 1.916072257873555e-06, + "loss": -0.522, + "step": 2046 + }, + { + "epoch": 0.2184708109982132, + "grad_norm": 13.423712642720652, + "learning_rate": 1.915773326655057e-06, + "loss": -0.591, + "step": 2048 + }, + { + "epoch": 0.21868416139957864, + "grad_norm": 15.129478098463679, + "learning_rate": 1.915473887423866e-06, + "loss": -0.1386, + "step": 2050 + }, + { + "epoch": 0.21889751180094408, + "grad_norm": 16.08402082469581, + "learning_rate": 1.91517394034609e-06, + "loss": -1.3189, + "step": 2052 + }, + { + "epoch": 0.21911086220230952, + "grad_norm": 14.673361229198562, + "learning_rate": 1.9148734855881216e-06, + "loss": -0.4009, + "step": 2054 + }, + { + "epoch": 0.21932421260367496, + "grad_norm": 11.660044385661074, + "learning_rate": 1.9145725233166343e-06, + "loss": -1.1933, + "step": 2056 + }, + { + "epoch": 0.2195375630050404, + "grad_norm": 17.923848622734944, + "learning_rate": 1.9142710536985815e-06, + "loss": 0.2626, + "step": 2058 + }, + { + "epoch": 0.21975091340640585, + "grad_norm": 3.4973266521741135, + "learning_rate": 1.9139690769012e-06, + "loss": -0.172, + "step": 2060 + }, + { + "epoch": 0.2199642638077713, + "grad_norm": 7.976576276743693, + "learning_rate": 1.9136665930920075e-06, + "loss": 0.6982, + "step": 2062 + }, + { + "epoch": 0.22017761420913673, + "grad_norm": 7.634128727862831, + "learning_rate": 1.9133636024388025e-06, + "loss": 0.4352, + "step": 2064 + }, + { + "epoch": 0.22039096461050217, + "grad_norm": 7.972896156127331, + "learning_rate": 1.913060105109665e-06, + "loss": -0.1603, + "step": 2066 + }, + { + "epoch": 0.2206043150118676, + "grad_norm": 12.60188340929588, + "learning_rate": 1.912756101272956e-06, + "loss": 0.184, + "step": 2068 + }, + { + "epoch": 0.22081766541323306, + "grad_norm": 5.530674604822452, + "learning_rate": 1.9124515910973175e-06, + "loss": 0.2249, + "step": 2070 + }, + { + "epoch": 0.2210310158145985, + "grad_norm": 8.673190380791217, + "learning_rate": 1.9121465747516723e-06, + "loss": 0.2098, + "step": 2072 + }, + { + "epoch": 0.22124436621596394, + "grad_norm": 11.938116236594583, + "learning_rate": 1.911841052405224e-06, + "loss": -0.0012, + "step": 2074 + }, + { + "epoch": 0.22145771661732938, + "grad_norm": 15.000894923275073, + "learning_rate": 1.9115350242274565e-06, + "loss": -0.5997, + "step": 2076 + }, + { + "epoch": 0.22167106701869482, + "grad_norm": 13.084239961748656, + "learning_rate": 1.9112284903881357e-06, + "loss": -0.2505, + "step": 2078 + }, + { + "epoch": 0.22188441742006026, + "grad_norm": 25.251876912066013, + "learning_rate": 1.910921451057306e-06, + "loss": 0.5584, + "step": 2080 + }, + { + "epoch": 0.2220977678214257, + "grad_norm": 4.428008465032094, + "learning_rate": 1.9106139064052945e-06, + "loss": -0.5053, + "step": 2082 + }, + { + "epoch": 0.22231111822279115, + "grad_norm": 14.230541196268089, + "learning_rate": 1.910305856602706e-06, + "loss": -0.8245, + "step": 2084 + }, + { + "epoch": 0.2225244686241566, + "grad_norm": 7.0912436584168885, + "learning_rate": 1.909997301820428e-06, + "loss": -0.0826, + "step": 2086 + }, + { + "epoch": 0.22273781902552203, + "grad_norm": 8.156236127060636, + "learning_rate": 1.909688242229626e-06, + "loss": -1.6469, + "step": 2088 + }, + { + "epoch": 0.22295116942688747, + "grad_norm": 7.462748690551923, + "learning_rate": 1.9093786780017473e-06, + "loss": -0.441, + "step": 2090 + }, + { + "epoch": 0.22316451982825292, + "grad_norm": 7.556875177907825, + "learning_rate": 1.9090686093085186e-06, + "loss": -0.1868, + "step": 2092 + }, + { + "epoch": 0.22337787022961836, + "grad_norm": 8.074190404531432, + "learning_rate": 1.908758036321946e-06, + "loss": -0.6482, + "step": 2094 + }, + { + "epoch": 0.2235912206309838, + "grad_norm": 7.352300765570886, + "learning_rate": 1.908446959214315e-06, + "loss": 0.0087, + "step": 2096 + }, + { + "epoch": 0.22380457103234924, + "grad_norm": 19.643712400767132, + "learning_rate": 1.908135378158192e-06, + "loss": -0.1984, + "step": 2098 + }, + { + "epoch": 0.22401792143371468, + "grad_norm": 30.880473528270596, + "learning_rate": 1.9078232933264226e-06, + "loss": -0.4678, + "step": 2100 + }, + { + "epoch": 0.22423127183508013, + "grad_norm": 16.206460542957338, + "learning_rate": 1.907510704892131e-06, + "loss": -0.9372, + "step": 2102 + }, + { + "epoch": 0.2244446222364456, + "grad_norm": 11.207394954968997, + "learning_rate": 1.907197613028721e-06, + "loss": -0.3638, + "step": 2104 + }, + { + "epoch": 0.22465797263781104, + "grad_norm": 19.59690555861532, + "learning_rate": 1.9068840179098773e-06, + "loss": -0.2888, + "step": 2106 + }, + { + "epoch": 0.22487132303917648, + "grad_norm": 5.0064925129941376, + "learning_rate": 1.9065699197095615e-06, + "loss": 0.7401, + "step": 2108 + }, + { + "epoch": 0.22508467344054192, + "grad_norm": 9.163762043830785, + "learning_rate": 1.906255318602015e-06, + "loss": -0.5969, + "step": 2110 + }, + { + "epoch": 0.22529802384190736, + "grad_norm": 28.58997459140813, + "learning_rate": 1.9059402147617596e-06, + "loss": -0.7027, + "step": 2112 + }, + { + "epoch": 0.2255113742432728, + "grad_norm": 8.007982963485334, + "learning_rate": 1.9056246083635941e-06, + "loss": -0.9328, + "step": 2114 + }, + { + "epoch": 0.22572472464463825, + "grad_norm": 11.576637630010811, + "learning_rate": 1.9053084995825967e-06, + "loss": -0.1907, + "step": 2116 + }, + { + "epoch": 0.2259380750460037, + "grad_norm": 6.310277942518293, + "learning_rate": 1.9049918885941246e-06, + "loss": -1.1929, + "step": 2118 + }, + { + "epoch": 0.22615142544736913, + "grad_norm": 4.709824078091413, + "learning_rate": 1.9046747755738136e-06, + "loss": 0.2748, + "step": 2120 + }, + { + "epoch": 0.22636477584873457, + "grad_norm": 10.164719659031523, + "learning_rate": 1.9043571606975775e-06, + "loss": -0.373, + "step": 2122 + }, + { + "epoch": 0.22657812625010001, + "grad_norm": 14.655873994066797, + "learning_rate": 1.904039044141609e-06, + "loss": -0.6651, + "step": 2124 + }, + { + "epoch": 0.22679147665146546, + "grad_norm": 6.4111231342896495, + "learning_rate": 1.9037204260823785e-06, + "loss": -0.3842, + "step": 2126 + }, + { + "epoch": 0.2270048270528309, + "grad_norm": 12.295730157536532, + "learning_rate": 1.9034013066966356e-06, + "loss": -0.691, + "step": 2128 + }, + { + "epoch": 0.22721817745419634, + "grad_norm": 22.951864429058734, + "learning_rate": 1.903081686161407e-06, + "loss": -1.0715, + "step": 2130 + }, + { + "epoch": 0.22743152785556178, + "grad_norm": 5.816477108146452, + "learning_rate": 1.902761564653998e-06, + "loss": 0.2179, + "step": 2132 + }, + { + "epoch": 0.22764487825692722, + "grad_norm": 7.195074237627301, + "learning_rate": 1.9024409423519918e-06, + "loss": -0.0609, + "step": 2134 + }, + { + "epoch": 0.22785822865829267, + "grad_norm": 4.115971600041351, + "learning_rate": 1.9021198194332486e-06, + "loss": 0.1863, + "step": 2136 + }, + { + "epoch": 0.2280715790596581, + "grad_norm": 7.8396642857766485, + "learning_rate": 1.9017981960759072e-06, + "loss": -0.035, + "step": 2138 + }, + { + "epoch": 0.22828492946102355, + "grad_norm": 8.093340509337287, + "learning_rate": 1.9014760724583843e-06, + "loss": -0.032, + "step": 2140 + }, + { + "epoch": 0.228498279862389, + "grad_norm": 5.739007014210951, + "learning_rate": 1.901153448759373e-06, + "loss": -1.3343, + "step": 2142 + }, + { + "epoch": 0.22871163026375443, + "grad_norm": 6.451272014147544, + "learning_rate": 1.9008303251578442e-06, + "loss": 0.271, + "step": 2144 + }, + { + "epoch": 0.22892498066511988, + "grad_norm": 14.93163679272063, + "learning_rate": 1.9005067018330466e-06, + "loss": -0.6068, + "step": 2146 + }, + { + "epoch": 0.22913833106648532, + "grad_norm": 6.146740059828187, + "learning_rate": 1.900182578964506e-06, + "loss": -0.5449, + "step": 2148 + }, + { + "epoch": 0.22935168146785076, + "grad_norm": 4.058446027597233, + "learning_rate": 1.899857956732025e-06, + "loss": 0.4319, + "step": 2150 + }, + { + "epoch": 0.2295650318692162, + "grad_norm": 8.03450930760207, + "learning_rate": 1.899532835315683e-06, + "loss": -2.0862, + "step": 2152 + }, + { + "epoch": 0.22977838227058164, + "grad_norm": 10.29416339638645, + "learning_rate": 1.8992072148958367e-06, + "loss": -0.5918, + "step": 2154 + }, + { + "epoch": 0.22999173267194709, + "grad_norm": 6.839940264702009, + "learning_rate": 1.8988810956531199e-06, + "loss": 0.802, + "step": 2156 + }, + { + "epoch": 0.23020508307331253, + "grad_norm": 3.3657321385518157, + "learning_rate": 1.8985544777684425e-06, + "loss": -0.4858, + "step": 2158 + }, + { + "epoch": 0.23041843347467797, + "grad_norm": 4.831622255484031, + "learning_rate": 1.8982273614229915e-06, + "loss": -1.2581, + "step": 2160 + }, + { + "epoch": 0.2306317838760434, + "grad_norm": 10.448841520364184, + "learning_rate": 1.89789974679823e-06, + "loss": 0.6303, + "step": 2162 + }, + { + "epoch": 0.23084513427740885, + "grad_norm": 10.907611514407419, + "learning_rate": 1.897571634075898e-06, + "loss": 0.0288, + "step": 2164 + }, + { + "epoch": 0.2310584846787743, + "grad_norm": 14.545172431998253, + "learning_rate": 1.8972430234380112e-06, + "loss": -0.5095, + "step": 2166 + }, + { + "epoch": 0.23127183508013974, + "grad_norm": 6.533910079035624, + "learning_rate": 1.8969139150668622e-06, + "loss": -1.2148, + "step": 2168 + }, + { + "epoch": 0.23148518548150518, + "grad_norm": 15.995777148237812, + "learning_rate": 1.8965843091450192e-06, + "loss": 0.5337, + "step": 2170 + }, + { + "epoch": 0.23169853588287062, + "grad_norm": 13.54176406092635, + "learning_rate": 1.896254205855326e-06, + "loss": 0.515, + "step": 2172 + }, + { + "epoch": 0.23191188628423606, + "grad_norm": 7.719544270592234, + "learning_rate": 1.8959236053809038e-06, + "loss": 0.0471, + "step": 2174 + }, + { + "epoch": 0.2321252366856015, + "grad_norm": 10.598168740855463, + "learning_rate": 1.8955925079051482e-06, + "loss": -1.0524, + "step": 2176 + }, + { + "epoch": 0.23233858708696695, + "grad_norm": 36.37614463964094, + "learning_rate": 1.895260913611731e-06, + "loss": -1.3743, + "step": 2178 + }, + { + "epoch": 0.2325519374883324, + "grad_norm": 6.571035963967562, + "learning_rate": 1.8949288226845996e-06, + "loss": -0.1517, + "step": 2180 + }, + { + "epoch": 0.23276528788969783, + "grad_norm": 8.071430658220358, + "learning_rate": 1.8945962353079772e-06, + "loss": -0.7766, + "step": 2182 + }, + { + "epoch": 0.23297863829106327, + "grad_norm": 7.25892306887894, + "learning_rate": 1.8942631516663617e-06, + "loss": 0.4177, + "step": 2184 + }, + { + "epoch": 0.23319198869242871, + "grad_norm": 6.365349446271159, + "learning_rate": 1.8939295719445266e-06, + "loss": 0.2848, + "step": 2186 + }, + { + "epoch": 0.23340533909379416, + "grad_norm": 9.638039270016808, + "learning_rate": 1.893595496327521e-06, + "loss": -0.913, + "step": 2188 + }, + { + "epoch": 0.23361868949515963, + "grad_norm": 10.439771015875486, + "learning_rate": 1.8932609250006685e-06, + "loss": 0.5524, + "step": 2190 + }, + { + "epoch": 0.23383203989652507, + "grad_norm": 13.808518153837964, + "learning_rate": 1.8929258581495683e-06, + "loss": -0.1752, + "step": 2192 + }, + { + "epoch": 0.2340453902978905, + "grad_norm": 12.522051166061008, + "learning_rate": 1.892590295960094e-06, + "loss": -0.5719, + "step": 2194 + }, + { + "epoch": 0.23425874069925595, + "grad_norm": 11.841442410136267, + "learning_rate": 1.8922542386183939e-06, + "loss": -0.6225, + "step": 2196 + }, + { + "epoch": 0.2344720911006214, + "grad_norm": 3.815044461918325, + "learning_rate": 1.8919176863108914e-06, + "loss": -0.887, + "step": 2198 + }, + { + "epoch": 0.23468544150198684, + "grad_norm": 4.49105706199299, + "learning_rate": 1.8915806392242844e-06, + "loss": 0.1367, + "step": 2200 + }, + { + "epoch": 0.23489879190335228, + "grad_norm": 16.345355639549688, + "learning_rate": 1.8912430975455446e-06, + "loss": -0.4096, + "step": 2202 + }, + { + "epoch": 0.23511214230471772, + "grad_norm": 8.407716846145457, + "learning_rate": 1.8909050614619195e-06, + "loss": -0.0712, + "step": 2204 + }, + { + "epoch": 0.23532549270608316, + "grad_norm": 14.366397845363286, + "learning_rate": 1.890566531160929e-06, + "loss": 0.5799, + "step": 2206 + }, + { + "epoch": 0.2355388431074486, + "grad_norm": 14.191723389550512, + "learning_rate": 1.890227506830369e-06, + "loss": -0.5409, + "step": 2208 + }, + { + "epoch": 0.23575219350881405, + "grad_norm": 7.135790668212273, + "learning_rate": 1.8898879886583078e-06, + "loss": 0.085, + "step": 2210 + }, + { + "epoch": 0.2359655439101795, + "grad_norm": 7.391101446479228, + "learning_rate": 1.8895479768330893e-06, + "loss": -0.2057, + "step": 2212 + }, + { + "epoch": 0.23617889431154493, + "grad_norm": 14.674680488502363, + "learning_rate": 1.8892074715433299e-06, + "loss": 0.3252, + "step": 2214 + }, + { + "epoch": 0.23639224471291037, + "grad_norm": 17.01256516370184, + "learning_rate": 1.8888664729779202e-06, + "loss": 0.2447, + "step": 2216 + }, + { + "epoch": 0.2366055951142758, + "grad_norm": 36.16909610954152, + "learning_rate": 1.8885249813260248e-06, + "loss": 0.5304, + "step": 2218 + }, + { + "epoch": 0.23681894551564125, + "grad_norm": 14.119709954785915, + "learning_rate": 1.8881829967770809e-06, + "loss": -1.1017, + "step": 2220 + }, + { + "epoch": 0.2370322959170067, + "grad_norm": 6.982776203277627, + "learning_rate": 1.8878405195208004e-06, + "loss": -0.3025, + "step": 2222 + }, + { + "epoch": 0.23724564631837214, + "grad_norm": 12.717983826344323, + "learning_rate": 1.8874975497471676e-06, + "loss": -0.3265, + "step": 2224 + }, + { + "epoch": 0.23745899671973758, + "grad_norm": 7.295940732692786, + "learning_rate": 1.8871540876464402e-06, + "loss": -1.1326, + "step": 2226 + }, + { + "epoch": 0.23767234712110302, + "grad_norm": 16.019023665374803, + "learning_rate": 1.8868101334091492e-06, + "loss": -0.0291, + "step": 2228 + }, + { + "epoch": 0.23788569752246846, + "grad_norm": 11.942061426924056, + "learning_rate": 1.8864656872260985e-06, + "loss": -0.91, + "step": 2230 + }, + { + "epoch": 0.2380990479238339, + "grad_norm": 7.779109824391155, + "learning_rate": 1.8861207492883648e-06, + "loss": -0.2579, + "step": 2232 + }, + { + "epoch": 0.23831239832519935, + "grad_norm": 32.924364769086296, + "learning_rate": 1.8857753197872978e-06, + "loss": -0.5007, + "step": 2234 + }, + { + "epoch": 0.2385257487265648, + "grad_norm": 7.122888553196807, + "learning_rate": 1.8854293989145198e-06, + "loss": 0.3607, + "step": 2236 + }, + { + "epoch": 0.23873909912793023, + "grad_norm": 6.541430984020485, + "learning_rate": 1.8850829868619256e-06, + "loss": -0.4923, + "step": 2238 + }, + { + "epoch": 0.23895244952929567, + "grad_norm": 12.255213691954728, + "learning_rate": 1.8847360838216824e-06, + "loss": -0.0115, + "step": 2240 + }, + { + "epoch": 0.23916579993066112, + "grad_norm": 24.17513986308764, + "learning_rate": 1.8843886899862302e-06, + "loss": 0.1068, + "step": 2242 + }, + { + "epoch": 0.23937915033202656, + "grad_norm": 6.427296279370608, + "learning_rate": 1.8840408055482806e-06, + "loss": -1.4625, + "step": 2244 + }, + { + "epoch": 0.239592500733392, + "grad_norm": 4.057332435817227, + "learning_rate": 1.883692430700818e-06, + "loss": -1.0755, + "step": 2246 + }, + { + "epoch": 0.23980585113475744, + "grad_norm": 9.280279614228682, + "learning_rate": 1.8833435656370984e-06, + "loss": 0.4493, + "step": 2248 + }, + { + "epoch": 0.24001920153612288, + "grad_norm": 15.08946963378993, + "learning_rate": 1.8829942105506502e-06, + "loss": -0.5134, + "step": 2250 + }, + { + "epoch": 0.24023255193748833, + "grad_norm": 8.285147921021803, + "learning_rate": 1.8826443656352729e-06, + "loss": -0.2185, + "step": 2252 + }, + { + "epoch": 0.24044590233885377, + "grad_norm": 5.186081541990516, + "learning_rate": 1.8822940310850383e-06, + "loss": -0.841, + "step": 2254 + }, + { + "epoch": 0.2406592527402192, + "grad_norm": 12.892860884719964, + "learning_rate": 1.8819432070942903e-06, + "loss": -0.1838, + "step": 2256 + }, + { + "epoch": 0.24087260314158465, + "grad_norm": 9.786430202215282, + "learning_rate": 1.8815918938576427e-06, + "loss": 0.2237, + "step": 2258 + }, + { + "epoch": 0.2410859535429501, + "grad_norm": 11.474072273615928, + "learning_rate": 1.8812400915699826e-06, + "loss": -0.1261, + "step": 2260 + }, + { + "epoch": 0.24129930394431554, + "grad_norm": 11.833383877747751, + "learning_rate": 1.8808878004264668e-06, + "loss": 0.4263, + "step": 2262 + }, + { + "epoch": 0.24151265434568098, + "grad_norm": 22.976330366798624, + "learning_rate": 1.8805350206225246e-06, + "loss": -1.0103, + "step": 2264 + }, + { + "epoch": 0.24172600474704642, + "grad_norm": 10.5386166784528, + "learning_rate": 1.880181752353855e-06, + "loss": -0.2876, + "step": 2266 + }, + { + "epoch": 0.24193935514841186, + "grad_norm": 8.413708416616572, + "learning_rate": 1.8798279958164294e-06, + "loss": 0.1234, + "step": 2268 + }, + { + "epoch": 0.2421527055497773, + "grad_norm": 2.9593915851857346, + "learning_rate": 1.8794737512064888e-06, + "loss": -0.5359, + "step": 2270 + }, + { + "epoch": 0.24236605595114274, + "grad_norm": 9.484434089050737, + "learning_rate": 1.8791190187205463e-06, + "loss": -0.1062, + "step": 2272 + }, + { + "epoch": 0.24257940635250821, + "grad_norm": 11.888830573780263, + "learning_rate": 1.8787637985553843e-06, + "loss": 0.2508, + "step": 2274 + }, + { + "epoch": 0.24279275675387366, + "grad_norm": 3.1868895736649017, + "learning_rate": 1.8784080909080566e-06, + "loss": -0.0724, + "step": 2276 + }, + { + "epoch": 0.2430061071552391, + "grad_norm": 13.600473840973399, + "learning_rate": 1.878051895975887e-06, + "loss": -0.2573, + "step": 2278 + }, + { + "epoch": 0.24321945755660454, + "grad_norm": 26.27419949986061, + "learning_rate": 1.8776952139564695e-06, + "loss": 0.1723, + "step": 2280 + }, + { + "epoch": 0.24343280795796998, + "grad_norm": 4.746369885066603, + "learning_rate": 1.877338045047669e-06, + "loss": -0.052, + "step": 2282 + }, + { + "epoch": 0.24364615835933542, + "grad_norm": 19.86372771706047, + "learning_rate": 1.87698038944762e-06, + "loss": -0.293, + "step": 2284 + }, + { + "epoch": 0.24385950876070087, + "grad_norm": 5.177844555605487, + "learning_rate": 1.8766222473547269e-06, + "loss": -0.1477, + "step": 2286 + }, + { + "epoch": 0.2440728591620663, + "grad_norm": 3.4158091451466683, + "learning_rate": 1.8762636189676639e-06, + "loss": -0.6331, + "step": 2288 + }, + { + "epoch": 0.24428620956343175, + "grad_norm": 15.453945464573792, + "learning_rate": 1.8759045044853756e-06, + "loss": -0.1638, + "step": 2290 + }, + { + "epoch": 0.2444995599647972, + "grad_norm": 10.596600020719738, + "learning_rate": 1.8755449041070757e-06, + "loss": -1.1852, + "step": 2292 + }, + { + "epoch": 0.24471291036616263, + "grad_norm": 14.990629186643753, + "learning_rate": 1.8751848180322474e-06, + "loss": -0.338, + "step": 2294 + }, + { + "epoch": 0.24492626076752808, + "grad_norm": 11.30656679550947, + "learning_rate": 1.8748242464606437e-06, + "loss": 0.8451, + "step": 2296 + }, + { + "epoch": 0.24513961116889352, + "grad_norm": 5.928347319065744, + "learning_rate": 1.8744631895922867e-06, + "loss": -0.713, + "step": 2298 + }, + { + "epoch": 0.24535296157025896, + "grad_norm": 6.444139189078738, + "learning_rate": 1.8741016476274675e-06, + "loss": -1.0358, + "step": 2300 + }, + { + "epoch": 0.2455663119716244, + "grad_norm": 7.9684465119036485, + "learning_rate": 1.873739620766747e-06, + "loss": -0.6313, + "step": 2302 + }, + { + "epoch": 0.24577966237298984, + "grad_norm": 5.1428053879279565, + "learning_rate": 1.8733771092109544e-06, + "loss": -0.3503, + "step": 2304 + }, + { + "epoch": 0.24599301277435529, + "grad_norm": 7.8338959307743945, + "learning_rate": 1.873014113161188e-06, + "loss": -0.8449, + "step": 2306 + }, + { + "epoch": 0.24620636317572073, + "grad_norm": 8.329030605753001, + "learning_rate": 1.872650632818815e-06, + "loss": -0.8559, + "step": 2308 + }, + { + "epoch": 0.24641971357708617, + "grad_norm": 6.004484154776549, + "learning_rate": 1.8722866683854707e-06, + "loss": -0.1221, + "step": 2310 + }, + { + "epoch": 0.2466330639784516, + "grad_norm": 18.041740979343107, + "learning_rate": 1.8719222200630603e-06, + "loss": -0.8495, + "step": 2312 + }, + { + "epoch": 0.24684641437981705, + "grad_norm": 13.8993422113462, + "learning_rate": 1.8715572880537553e-06, + "loss": -0.584, + "step": 2314 + }, + { + "epoch": 0.2470597647811825, + "grad_norm": 7.603286049786936, + "learning_rate": 1.8711918725599975e-06, + "loss": 0.409, + "step": 2316 + }, + { + "epoch": 0.24727311518254794, + "grad_norm": 12.383128947530018, + "learning_rate": 1.8708259737844963e-06, + "loss": -0.1898, + "step": 2318 + }, + { + "epoch": 0.24748646558391338, + "grad_norm": 14.117241969233739, + "learning_rate": 1.8704595919302283e-06, + "loss": 0.5188, + "step": 2320 + }, + { + "epoch": 0.24769981598527882, + "grad_norm": 5.893210917326283, + "learning_rate": 1.8700927272004394e-06, + "loss": -0.5056, + "step": 2322 + }, + { + "epoch": 0.24791316638664426, + "grad_norm": 18.31287282961217, + "learning_rate": 1.8697253797986427e-06, + "loss": 0.2288, + "step": 2324 + }, + { + "epoch": 0.2481265167880097, + "grad_norm": 11.08866098282751, + "learning_rate": 1.8693575499286189e-06, + "loss": -0.2383, + "step": 2326 + }, + { + "epoch": 0.24833986718937515, + "grad_norm": 11.779301185879273, + "learning_rate": 1.8689892377944168e-06, + "loss": 0.2183, + "step": 2328 + }, + { + "epoch": 0.2485532175907406, + "grad_norm": 10.166191617998152, + "learning_rate": 1.8686204436003523e-06, + "loss": -0.3263, + "step": 2330 + }, + { + "epoch": 0.24876656799210603, + "grad_norm": 4.784977748146536, + "learning_rate": 1.8682511675510089e-06, + "loss": -0.1976, + "step": 2332 + }, + { + "epoch": 0.24897991839347147, + "grad_norm": 3.0284113912137456, + "learning_rate": 1.8678814098512378e-06, + "loss": 0.0196, + "step": 2334 + }, + { + "epoch": 0.24919326879483691, + "grad_norm": 7.914949027770823, + "learning_rate": 1.8675111707061567e-06, + "loss": 0.669, + "step": 2336 + }, + { + "epoch": 0.24940661919620236, + "grad_norm": 19.649999873606166, + "learning_rate": 1.867140450321151e-06, + "loss": 0.4264, + "step": 2338 + }, + { + "epoch": 0.2496199695975678, + "grad_norm": 8.968509283269004, + "learning_rate": 1.866769248901872e-06, + "loss": -0.437, + "step": 2340 + }, + { + "epoch": 0.24983331999893324, + "grad_norm": 9.853570349845704, + "learning_rate": 1.8663975666542395e-06, + "loss": -0.3988, + "step": 2342 + }, + { + "epoch": 0.2500466704002987, + "grad_norm": 7.207985884633396, + "learning_rate": 1.8660254037844386e-06, + "loss": -0.0546, + "step": 2344 + }, + { + "epoch": 0.25026002080166415, + "grad_norm": 7.15372288874833, + "learning_rate": 1.8656527604989216e-06, + "loss": -0.319, + "step": 2346 + }, + { + "epoch": 0.2504733712030296, + "grad_norm": 5.494342553996051, + "learning_rate": 1.8652796370044074e-06, + "loss": -0.3355, + "step": 2348 + }, + { + "epoch": 0.25068672160439504, + "grad_norm": 25.58370443408704, + "learning_rate": 1.8649060335078813e-06, + "loss": -0.1819, + "step": 2350 + }, + { + "epoch": 0.2509000720057605, + "grad_norm": 3.5730622573588953, + "learning_rate": 1.8645319502165941e-06, + "loss": 0.9055, + "step": 2352 + }, + { + "epoch": 0.2511134224071259, + "grad_norm": 13.020477400763882, + "learning_rate": 1.8641573873380637e-06, + "loss": -0.6351, + "step": 2354 + }, + { + "epoch": 0.25132677280849136, + "grad_norm": 12.680924597312218, + "learning_rate": 1.8637823450800743e-06, + "loss": -1.0723, + "step": 2356 + }, + { + "epoch": 0.2515401232098568, + "grad_norm": 14.018213573049106, + "learning_rate": 1.8634068236506745e-06, + "loss": -0.6361, + "step": 2358 + }, + { + "epoch": 0.25175347361122224, + "grad_norm": 10.987083846791398, + "learning_rate": 1.8630308232581804e-06, + "loss": -1.6505, + "step": 2360 + }, + { + "epoch": 0.2519668240125877, + "grad_norm": 10.33719425881593, + "learning_rate": 1.8626543441111728e-06, + "loss": 0.0228, + "step": 2362 + }, + { + "epoch": 0.25218017441395313, + "grad_norm": 12.832494385336979, + "learning_rate": 1.862277386418498e-06, + "loss": 0.1092, + "step": 2364 + }, + { + "epoch": 0.25239352481531857, + "grad_norm": 11.929711951297579, + "learning_rate": 1.8618999503892688e-06, + "loss": 0.2937, + "step": 2366 + }, + { + "epoch": 0.252606875216684, + "grad_norm": 13.862139471610936, + "learning_rate": 1.8615220362328618e-06, + "loss": -0.9202, + "step": 2368 + }, + { + "epoch": 0.25282022561804945, + "grad_norm": 9.682976773402793, + "learning_rate": 1.8611436441589205e-06, + "loss": -0.617, + "step": 2370 + }, + { + "epoch": 0.2530335760194149, + "grad_norm": 5.9430895533585435, + "learning_rate": 1.8607647743773523e-06, + "loss": -0.4079, + "step": 2372 + }, + { + "epoch": 0.25324692642078034, + "grad_norm": 8.283939675455104, + "learning_rate": 1.86038542709833e-06, + "loss": -0.3549, + "step": 2374 + }, + { + "epoch": 0.2534602768221458, + "grad_norm": 4.86946504359841, + "learning_rate": 1.8600056025322914e-06, + "loss": 0.2803, + "step": 2376 + }, + { + "epoch": 0.2536736272235112, + "grad_norm": 5.169292281313504, + "learning_rate": 1.8596253008899392e-06, + "loss": -0.3912, + "step": 2378 + }, + { + "epoch": 0.25388697762487666, + "grad_norm": 11.973300956867748, + "learning_rate": 1.85924452238224e-06, + "loss": -0.7652, + "step": 2380 + }, + { + "epoch": 0.2541003280262421, + "grad_norm": 7.745942596898536, + "learning_rate": 1.858863267220426e-06, + "loss": -1.0226, + "step": 2382 + }, + { + "epoch": 0.25431367842760755, + "grad_norm": 5.509556435115394, + "learning_rate": 1.8584815356159932e-06, + "loss": 1.0754, + "step": 2384 + }, + { + "epoch": 0.254527028828973, + "grad_norm": 15.406258747843344, + "learning_rate": 1.8580993277807014e-06, + "loss": -0.0632, + "step": 2386 + }, + { + "epoch": 0.25474037923033843, + "grad_norm": 14.010785290828032, + "learning_rate": 1.8577166439265754e-06, + "loss": -0.4037, + "step": 2388 + }, + { + "epoch": 0.2549537296317039, + "grad_norm": 5.69097663391526, + "learning_rate": 1.8573334842659043e-06, + "loss": -0.3808, + "step": 2390 + }, + { + "epoch": 0.2551670800330693, + "grad_norm": 5.524540431896903, + "learning_rate": 1.8569498490112402e-06, + "loss": -0.7019, + "step": 2392 + }, + { + "epoch": 0.25538043043443476, + "grad_norm": 16.252506033736484, + "learning_rate": 1.8565657383753997e-06, + "loss": -0.203, + "step": 2394 + }, + { + "epoch": 0.2555937808358002, + "grad_norm": 16.390311743520662, + "learning_rate": 1.8561811525714628e-06, + "loss": -0.7718, + "step": 2396 + }, + { + "epoch": 0.25580713123716564, + "grad_norm": 9.509455098271982, + "learning_rate": 1.8557960918127732e-06, + "loss": -1.2194, + "step": 2398 + }, + { + "epoch": 0.2560204816385311, + "grad_norm": 10.362931031584816, + "learning_rate": 1.8554105563129382e-06, + "loss": -0.5091, + "step": 2400 + }, + { + "epoch": 0.2562338320398965, + "grad_norm": 5.326740917454088, + "learning_rate": 1.8550245462858282e-06, + "loss": -0.5786, + "step": 2402 + }, + { + "epoch": 0.25644718244126197, + "grad_norm": 9.92303362337753, + "learning_rate": 1.854638061945577e-06, + "loss": -0.5637, + "step": 2404 + }, + { + "epoch": 0.2566605328426274, + "grad_norm": 20.64341362241303, + "learning_rate": 1.8542511035065816e-06, + "loss": -0.8239, + "step": 2406 + }, + { + "epoch": 0.25687388324399285, + "grad_norm": 18.527825723400422, + "learning_rate": 1.8538636711835016e-06, + "loss": -0.3911, + "step": 2408 + }, + { + "epoch": 0.2570872336453583, + "grad_norm": 9.854426517968191, + "learning_rate": 1.85347576519126e-06, + "loss": 0.044, + "step": 2410 + }, + { + "epoch": 0.25730058404672373, + "grad_norm": 18.527333803050997, + "learning_rate": 1.8530873857450423e-06, + "loss": -0.0423, + "step": 2412 + }, + { + "epoch": 0.2575139344480892, + "grad_norm": 16.540776664160777, + "learning_rate": 1.8526985330602968e-06, + "loss": 0.3152, + "step": 2414 + }, + { + "epoch": 0.2577272848494546, + "grad_norm": 16.133122153196123, + "learning_rate": 1.852309207352734e-06, + "loss": -0.8271, + "step": 2416 + }, + { + "epoch": 0.25794063525082006, + "grad_norm": 11.764404789500542, + "learning_rate": 1.851919408838327e-06, + "loss": 0.3818, + "step": 2418 + }, + { + "epoch": 0.2581539856521855, + "grad_norm": 15.646202125713115, + "learning_rate": 1.851529137733311e-06, + "loss": 0.0704, + "step": 2420 + }, + { + "epoch": 0.25836733605355094, + "grad_norm": 8.459610898269183, + "learning_rate": 1.8511383942541843e-06, + "loss": -0.6179, + "step": 2422 + }, + { + "epoch": 0.2585806864549164, + "grad_norm": 6.519953711225802, + "learning_rate": 1.8507471786177055e-06, + "loss": -0.512, + "step": 2424 + }, + { + "epoch": 0.25879403685628183, + "grad_norm": 3.5281760859495566, + "learning_rate": 1.8503554910408968e-06, + "loss": 0.0613, + "step": 2426 + }, + { + "epoch": 0.25900738725764727, + "grad_norm": 8.681546138750042, + "learning_rate": 1.8499633317410411e-06, + "loss": 0.3849, + "step": 2428 + }, + { + "epoch": 0.2592207376590127, + "grad_norm": 15.139915570216488, + "learning_rate": 1.8495707009356837e-06, + "loss": -1.2365, + "step": 2430 + }, + { + "epoch": 0.25943408806037815, + "grad_norm": 4.968344194456267, + "learning_rate": 1.849177598842631e-06, + "loss": -0.6141, + "step": 2432 + }, + { + "epoch": 0.2596474384617436, + "grad_norm": 4.051182514579996, + "learning_rate": 1.8487840256799512e-06, + "loss": -0.0551, + "step": 2434 + }, + { + "epoch": 0.25986078886310904, + "grad_norm": 8.604650501124002, + "learning_rate": 1.8483899816659732e-06, + "loss": -0.8036, + "step": 2436 + }, + { + "epoch": 0.2600741392644745, + "grad_norm": 4.3512634473064145, + "learning_rate": 1.8479954670192877e-06, + "loss": -0.5158, + "step": 2438 + }, + { + "epoch": 0.2602874896658399, + "grad_norm": 10.650761914416353, + "learning_rate": 1.8476004819587465e-06, + "loss": 0.3953, + "step": 2440 + }, + { + "epoch": 0.26050084006720536, + "grad_norm": 8.03235489049282, + "learning_rate": 1.8472050267034618e-06, + "loss": -0.374, + "step": 2442 + }, + { + "epoch": 0.2607141904685708, + "grad_norm": 10.712706708962088, + "learning_rate": 1.8468091014728074e-06, + "loss": 0.1436, + "step": 2444 + }, + { + "epoch": 0.26092754086993625, + "grad_norm": 9.84410354807994, + "learning_rate": 1.846412706486417e-06, + "loss": -0.4266, + "step": 2446 + }, + { + "epoch": 0.2611408912713017, + "grad_norm": 9.63948385943898, + "learning_rate": 1.8460158419641854e-06, + "loss": -1.2869, + "step": 2448 + }, + { + "epoch": 0.26135424167266713, + "grad_norm": 13.191559958327717, + "learning_rate": 1.845618508126268e-06, + "loss": -0.6911, + "step": 2450 + }, + { + "epoch": 0.2615675920740326, + "grad_norm": 13.443723086422374, + "learning_rate": 1.8452207051930799e-06, + "loss": -0.0246, + "step": 2452 + }, + { + "epoch": 0.261780942475398, + "grad_norm": 8.966521342435126, + "learning_rate": 1.8448224333852971e-06, + "loss": 0.8678, + "step": 2454 + }, + { + "epoch": 0.26199429287676346, + "grad_norm": 10.803004561160542, + "learning_rate": 1.8444236929238553e-06, + "loss": -0.5297, + "step": 2456 + }, + { + "epoch": 0.2622076432781289, + "grad_norm": 5.664680249517186, + "learning_rate": 1.8440244840299504e-06, + "loss": -0.2487, + "step": 2458 + }, + { + "epoch": 0.26242099367949434, + "grad_norm": 6.476889421129532, + "learning_rate": 1.8436248069250377e-06, + "loss": -0.3167, + "step": 2460 + }, + { + "epoch": 0.2626343440808598, + "grad_norm": 60.621958887425755, + "learning_rate": 1.8432246618308334e-06, + "loss": 1.3885, + "step": 2462 + }, + { + "epoch": 0.2628476944822252, + "grad_norm": 5.048892451024854, + "learning_rate": 1.8428240489693114e-06, + "loss": -0.7003, + "step": 2464 + }, + { + "epoch": 0.26306104488359067, + "grad_norm": 12.145511558233862, + "learning_rate": 1.8424229685627073e-06, + "loss": -0.2115, + "step": 2466 + }, + { + "epoch": 0.2632743952849561, + "grad_norm": 9.823397448062972, + "learning_rate": 1.8420214208335142e-06, + "loss": 0.1169, + "step": 2468 + }, + { + "epoch": 0.26348774568632155, + "grad_norm": 8.928946855746572, + "learning_rate": 1.8416194060044852e-06, + "loss": -0.6803, + "step": 2470 + }, + { + "epoch": 0.263701096087687, + "grad_norm": 8.329235687672043, + "learning_rate": 1.8412169242986334e-06, + "loss": -0.1897, + "step": 2472 + }, + { + "epoch": 0.26391444648905243, + "grad_norm": 6.669798699897304, + "learning_rate": 1.8408139759392287e-06, + "loss": 0.0079, + "step": 2474 + }, + { + "epoch": 0.2641277968904179, + "grad_norm": 9.80509245504525, + "learning_rate": 1.8404105611498019e-06, + "loss": -0.4576, + "step": 2476 + }, + { + "epoch": 0.2643411472917833, + "grad_norm": 7.492487465918569, + "learning_rate": 1.840006680154142e-06, + "loss": -0.4397, + "step": 2478 + }, + { + "epoch": 0.26455449769314876, + "grad_norm": 1.5039927465580198, + "learning_rate": 1.839602333176296e-06, + "loss": -0.0611, + "step": 2480 + }, + { + "epoch": 0.2647678480945142, + "grad_norm": 5.1532880795631115, + "learning_rate": 1.8391975204405702e-06, + "loss": -1.0668, + "step": 2482 + }, + { + "epoch": 0.26498119849587964, + "grad_norm": 5.688869225043968, + "learning_rate": 1.8387922421715287e-06, + "loss": 0.201, + "step": 2484 + }, + { + "epoch": 0.2651945488972451, + "grad_norm": 9.523530894903491, + "learning_rate": 1.8383864985939941e-06, + "loss": -0.928, + "step": 2486 + }, + { + "epoch": 0.26540789929861053, + "grad_norm": 68.90389467031252, + "learning_rate": 1.8379802899330472e-06, + "loss": 0.3866, + "step": 2488 + }, + { + "epoch": 0.265621249699976, + "grad_norm": 36.87785082526099, + "learning_rate": 1.8375736164140263e-06, + "loss": -0.9569, + "step": 2490 + }, + { + "epoch": 0.26583460010134147, + "grad_norm": 3.0632647743772052, + "learning_rate": 1.8371664782625285e-06, + "loss": -1.1399, + "step": 2492 + }, + { + "epoch": 0.2660479505027069, + "grad_norm": 10.537091217260052, + "learning_rate": 1.8367588757044078e-06, + "loss": 0.0511, + "step": 2494 + }, + { + "epoch": 0.26626130090407235, + "grad_norm": 7.915103584027036, + "learning_rate": 1.836350808965776e-06, + "loss": -1.125, + "step": 2496 + }, + { + "epoch": 0.2664746513054378, + "grad_norm": 10.951278352638651, + "learning_rate": 1.8359422782730029e-06, + "loss": -0.7042, + "step": 2498 + }, + { + "epoch": 0.26668800170680323, + "grad_norm": 40.783052789155896, + "learning_rate": 1.8355332838527148e-06, + "loss": -0.1665, + "step": 2500 + }, + { + "epoch": 0.2669013521081687, + "grad_norm": 6.568601998114741, + "learning_rate": 1.8351238259317965e-06, + "loss": -1.2587, + "step": 2502 + }, + { + "epoch": 0.2671147025095341, + "grad_norm": 7.832794691165237, + "learning_rate": 1.8347139047373884e-06, + "loss": -0.2094, + "step": 2504 + }, + { + "epoch": 0.26732805291089956, + "grad_norm": 8.855819639851914, + "learning_rate": 1.834303520496889e-06, + "loss": -0.2047, + "step": 2506 + }, + { + "epoch": 0.267541403312265, + "grad_norm": 16.321302791071346, + "learning_rate": 1.8338926734379534e-06, + "loss": -0.7656, + "step": 2508 + }, + { + "epoch": 0.26775475371363044, + "grad_norm": 16.187083261564556, + "learning_rate": 1.8334813637884932e-06, + "loss": -0.7535, + "step": 2510 + }, + { + "epoch": 0.2679681041149959, + "grad_norm": 11.06662208086572, + "learning_rate": 1.833069591776677e-06, + "loss": -0.5981, + "step": 2512 + }, + { + "epoch": 0.26818145451636133, + "grad_norm": 11.481556279635466, + "learning_rate": 1.8326573576309293e-06, + "loss": -0.7452, + "step": 2514 + }, + { + "epoch": 0.26839480491772677, + "grad_norm": 4.061235201425208, + "learning_rate": 1.8322446615799316e-06, + "loss": 0.0828, + "step": 2516 + }, + { + "epoch": 0.2686081553190922, + "grad_norm": 87.78174667980181, + "learning_rate": 1.8318315038526215e-06, + "loss": -1.331, + "step": 2518 + }, + { + "epoch": 0.26882150572045765, + "grad_norm": 9.821629130828537, + "learning_rate": 1.8314178846781922e-06, + "loss": -0.4002, + "step": 2520 + }, + { + "epoch": 0.2690348561218231, + "grad_norm": 8.121196716858476, + "learning_rate": 1.8310038042860936e-06, + "loss": 0.1667, + "step": 2522 + }, + { + "epoch": 0.26924820652318854, + "grad_norm": 10.654610870823232, + "learning_rate": 1.8305892629060312e-06, + "loss": -0.3266, + "step": 2524 + }, + { + "epoch": 0.269461556924554, + "grad_norm": 10.367715170660912, + "learning_rate": 1.8301742607679657e-06, + "loss": -0.5792, + "step": 2526 + }, + { + "epoch": 0.2696749073259194, + "grad_norm": 12.9858009274359, + "learning_rate": 1.8297587981021143e-06, + "loss": -0.1316, + "step": 2528 + }, + { + "epoch": 0.26988825772728486, + "grad_norm": 8.271263760108752, + "learning_rate": 1.8293428751389488e-06, + "loss": 0.3458, + "step": 2530 + }, + { + "epoch": 0.2701016081286503, + "grad_norm": 12.906054049788713, + "learning_rate": 1.8289264921091973e-06, + "loss": -0.5006, + "step": 2532 + }, + { + "epoch": 0.27031495853001575, + "grad_norm": 7.06302816854579, + "learning_rate": 1.828509649243842e-06, + "loss": -0.64, + "step": 2534 + }, + { + "epoch": 0.2705283089313812, + "grad_norm": 2.9732041951111188, + "learning_rate": 1.828092346774121e-06, + "loss": 0.1808, + "step": 2536 + }, + { + "epoch": 0.27074165933274663, + "grad_norm": 17.41936591956641, + "learning_rate": 1.8276745849315275e-06, + "loss": -1.5929, + "step": 2538 + }, + { + "epoch": 0.2709550097341121, + "grad_norm": 3.295527455883477, + "learning_rate": 1.8272563639478085e-06, + "loss": 0.6698, + "step": 2540 + }, + { + "epoch": 0.2711683601354775, + "grad_norm": 5.178355621955981, + "learning_rate": 1.8268376840549666e-06, + "loss": -0.1764, + "step": 2542 + }, + { + "epoch": 0.27138171053684296, + "grad_norm": 16.84263048981545, + "learning_rate": 1.826418545485259e-06, + "loss": -0.8027, + "step": 2544 + }, + { + "epoch": 0.2715950609382084, + "grad_norm": 17.17430793080092, + "learning_rate": 1.8259989484711966e-06, + "loss": 0.4206, + "step": 2546 + }, + { + "epoch": 0.27180841133957384, + "grad_norm": 16.566192813090407, + "learning_rate": 1.8255788932455457e-06, + "loss": -0.2747, + "step": 2548 + }, + { + "epoch": 0.2720217617409393, + "grad_norm": 11.692453805030112, + "learning_rate": 1.8251583800413252e-06, + "loss": 0.1833, + "step": 2550 + }, + { + "epoch": 0.2722351121423047, + "grad_norm": 19.985947095637556, + "learning_rate": 1.8247374090918103e-06, + "loss": -0.8139, + "step": 2552 + }, + { + "epoch": 0.27244846254367017, + "grad_norm": 4.173871522790509, + "learning_rate": 1.8243159806305277e-06, + "loss": -0.6897, + "step": 2554 + }, + { + "epoch": 0.2726618129450356, + "grad_norm": 3.420896855606482, + "learning_rate": 1.8238940948912597e-06, + "loss": -0.0617, + "step": 2556 + }, + { + "epoch": 0.27287516334640105, + "grad_norm": 14.251377776265553, + "learning_rate": 1.8234717521080418e-06, + "loss": -0.1553, + "step": 2558 + }, + { + "epoch": 0.2730885137477665, + "grad_norm": 16.743464532908714, + "learning_rate": 1.8230489525151625e-06, + "loss": -0.3327, + "step": 2560 + }, + { + "epoch": 0.27330186414913193, + "grad_norm": 4.507149901882717, + "learning_rate": 1.8226256963471638e-06, + "loss": -0.1407, + "step": 2562 + }, + { + "epoch": 0.2735152145504974, + "grad_norm": 6.402152097612623, + "learning_rate": 1.822201983838842e-06, + "loss": 0.21, + "step": 2564 + }, + { + "epoch": 0.2737285649518628, + "grad_norm": 6.811412538626924, + "learning_rate": 1.821777815225245e-06, + "loss": -0.1084, + "step": 2566 + }, + { + "epoch": 0.27394191535322826, + "grad_norm": 18.648142281712236, + "learning_rate": 1.821353190741675e-06, + "loss": -0.1173, + "step": 2568 + }, + { + "epoch": 0.2741552657545937, + "grad_norm": 9.833151471120507, + "learning_rate": 1.820928110623687e-06, + "loss": -0.0365, + "step": 2570 + }, + { + "epoch": 0.27436861615595914, + "grad_norm": 19.07679569867876, + "learning_rate": 1.8205025751070875e-06, + "loss": -0.4961, + "step": 2572 + }, + { + "epoch": 0.2745819665573246, + "grad_norm": 8.958547557560038, + "learning_rate": 1.8200765844279368e-06, + "loss": -0.3358, + "step": 2574 + }, + { + "epoch": 0.27479531695869003, + "grad_norm": 9.826421359564362, + "learning_rate": 1.8196501388225479e-06, + "loss": -0.5162, + "step": 2576 + }, + { + "epoch": 0.27500866736005547, + "grad_norm": 5.32275087089654, + "learning_rate": 1.8192232385274848e-06, + "loss": -0.0326, + "step": 2578 + }, + { + "epoch": 0.2752220177614209, + "grad_norm": 7.766675803769117, + "learning_rate": 1.8187958837795651e-06, + "loss": -0.9823, + "step": 2580 + }, + { + "epoch": 0.27543536816278635, + "grad_norm": 22.893974195396556, + "learning_rate": 1.818368074815858e-06, + "loss": -0.4845, + "step": 2582 + }, + { + "epoch": 0.2756487185641518, + "grad_norm": 4.920758120884345, + "learning_rate": 1.8179398118736843e-06, + "loss": -0.0882, + "step": 2584 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 9.890175767027781, + "learning_rate": 1.8175110951906176e-06, + "loss": 0.6432, + "step": 2586 + }, + { + "epoch": 0.2760754193668827, + "grad_norm": 5.486788363043506, + "learning_rate": 1.817081925004482e-06, + "loss": -0.5219, + "step": 2588 + }, + { + "epoch": 0.2762887697682481, + "grad_norm": 14.231129008887725, + "learning_rate": 1.8166523015533541e-06, + "loss": 0.3312, + "step": 2590 + }, + { + "epoch": 0.27650212016961356, + "grad_norm": 17.970512425860175, + "learning_rate": 1.8162222250755614e-06, + "loss": -0.5459, + "step": 2592 + }, + { + "epoch": 0.276715470570979, + "grad_norm": 3.2997796865619375, + "learning_rate": 1.8157916958096835e-06, + "loss": 0.0362, + "step": 2594 + }, + { + "epoch": 0.27692882097234445, + "grad_norm": 14.031084642708048, + "learning_rate": 1.8153607139945498e-06, + "loss": 0.1325, + "step": 2596 + }, + { + "epoch": 0.2771421713737099, + "grad_norm": 11.494977389350202, + "learning_rate": 1.8149292798692421e-06, + "loss": -0.0861, + "step": 2598 + }, + { + "epoch": 0.27735552177507533, + "grad_norm": 19.735453988696122, + "learning_rate": 1.8144973936730921e-06, + "loss": -0.2073, + "step": 2600 + }, + { + "epoch": 0.2775688721764408, + "grad_norm": 9.750239989556706, + "learning_rate": 1.8140650556456834e-06, + "loss": 0.2016, + "step": 2602 + }, + { + "epoch": 0.2777822225778062, + "grad_norm": 4.2461151131177735, + "learning_rate": 1.813632266026849e-06, + "loss": -0.6845, + "step": 2604 + }, + { + "epoch": 0.27799557297917166, + "grad_norm": 11.302412825668707, + "learning_rate": 1.813199025056673e-06, + "loss": -1.0265, + "step": 2606 + }, + { + "epoch": 0.2782089233805371, + "grad_norm": 9.34767054874116, + "learning_rate": 1.8127653329754906e-06, + "loss": 0.1478, + "step": 2608 + }, + { + "epoch": 0.27842227378190254, + "grad_norm": 10.373257149332765, + "learning_rate": 1.8123311900238858e-06, + "loss": -0.2979, + "step": 2610 + }, + { + "epoch": 0.278635624183268, + "grad_norm": 15.884726509746132, + "learning_rate": 1.8118965964426938e-06, + "loss": -0.5932, + "step": 2612 + }, + { + "epoch": 0.2788489745846334, + "grad_norm": 5.1577287228952535, + "learning_rate": 1.8114615524729993e-06, + "loss": 0.575, + "step": 2614 + }, + { + "epoch": 0.27906232498599887, + "grad_norm": 12.77835467323993, + "learning_rate": 1.8110260583561367e-06, + "loss": -1.0547, + "step": 2616 + }, + { + "epoch": 0.2792756753873643, + "grad_norm": 8.805085849875123, + "learning_rate": 1.8105901143336912e-06, + "loss": -1.3148, + "step": 2618 + }, + { + "epoch": 0.27948902578872975, + "grad_norm": 6.35276215648025, + "learning_rate": 1.810153720647496e-06, + "loss": -1.8331, + "step": 2620 + }, + { + "epoch": 0.2797023761900952, + "grad_norm": 14.705394633389961, + "learning_rate": 1.8097168775396349e-06, + "loss": -0.5171, + "step": 2622 + }, + { + "epoch": 0.27991572659146063, + "grad_norm": 5.503429019775704, + "learning_rate": 1.8092795852524403e-06, + "loss": 0.1605, + "step": 2624 + }, + { + "epoch": 0.2801290769928261, + "grad_norm": 13.564057146401895, + "learning_rate": 1.8088418440284943e-06, + "loss": -0.645, + "step": 2626 + }, + { + "epoch": 0.2803424273941915, + "grad_norm": 4.993200318528139, + "learning_rate": 1.808403654110628e-06, + "loss": -0.5095, + "step": 2628 + }, + { + "epoch": 0.28055577779555696, + "grad_norm": 10.984988087132448, + "learning_rate": 1.8079650157419212e-06, + "loss": 0.8913, + "step": 2630 + }, + { + "epoch": 0.2807691281969224, + "grad_norm": 18.74377180780051, + "learning_rate": 1.807525929165702e-06, + "loss": 0.1649, + "step": 2632 + }, + { + "epoch": 0.28098247859828784, + "grad_norm": 4.657595145179148, + "learning_rate": 1.8070863946255482e-06, + "loss": -0.4679, + "step": 2634 + }, + { + "epoch": 0.2811958289996533, + "grad_norm": 6.261051359311306, + "learning_rate": 1.8066464123652855e-06, + "loss": -0.3555, + "step": 2636 + }, + { + "epoch": 0.2814091794010187, + "grad_norm": 51.013522190922025, + "learning_rate": 1.806205982628988e-06, + "loss": -1.669, + "step": 2638 + }, + { + "epoch": 0.28162252980238417, + "grad_norm": 4.470256778096593, + "learning_rate": 1.8057651056609782e-06, + "loss": -0.7639, + "step": 2640 + }, + { + "epoch": 0.2818358802037496, + "grad_norm": 8.758858836451818, + "learning_rate": 1.805323781705826e-06, + "loss": -0.1754, + "step": 2642 + }, + { + "epoch": 0.28204923060511505, + "grad_norm": 4.86911202838136, + "learning_rate": 1.8048820110083502e-06, + "loss": -0.5532, + "step": 2644 + }, + { + "epoch": 0.2822625810064805, + "grad_norm": 18.823649491664668, + "learning_rate": 1.8044397938136171e-06, + "loss": 0.29, + "step": 2646 + }, + { + "epoch": 0.28247593140784594, + "grad_norm": 9.430650317748489, + "learning_rate": 1.8039971303669406e-06, + "loss": -0.0296, + "step": 2648 + }, + { + "epoch": 0.2826892818092114, + "grad_norm": 10.165589151973604, + "learning_rate": 1.8035540209138817e-06, + "loss": -1.2569, + "step": 2650 + }, + { + "epoch": 0.2829026322105768, + "grad_norm": 9.411149891933695, + "learning_rate": 1.80311046570025e-06, + "loss": -0.0413, + "step": 2652 + }, + { + "epoch": 0.28311598261194226, + "grad_norm": 11.985757833040966, + "learning_rate": 1.8026664649721014e-06, + "loss": -0.4883, + "step": 2654 + }, + { + "epoch": 0.2833293330133077, + "grad_norm": 10.706984815853794, + "learning_rate": 1.802222018975739e-06, + "loss": -0.8616, + "step": 2656 + }, + { + "epoch": 0.28354268341467315, + "grad_norm": 10.666065588024301, + "learning_rate": 1.8017771279577137e-06, + "loss": 0.1746, + "step": 2658 + }, + { + "epoch": 0.28375603381603864, + "grad_norm": 14.51842860049976, + "learning_rate": 1.801331792164822e-06, + "loss": 0.7129, + "step": 2660 + }, + { + "epoch": 0.2839693842174041, + "grad_norm": 12.989206797295825, + "learning_rate": 1.800886011844108e-06, + "loss": 0.0133, + "step": 2662 + }, + { + "epoch": 0.28418273461876953, + "grad_norm": 5.294805781203281, + "learning_rate": 1.8004397872428625e-06, + "loss": 0.023, + "step": 2664 + }, + { + "epoch": 0.28439608502013497, + "grad_norm": 5.8458958722364835, + "learning_rate": 1.7999931186086221e-06, + "loss": -0.6007, + "step": 2666 + }, + { + "epoch": 0.2846094354215004, + "grad_norm": 7.66940620173898, + "learning_rate": 1.7995460061891707e-06, + "loss": -0.4881, + "step": 2668 + }, + { + "epoch": 0.28482278582286585, + "grad_norm": 8.199958083191563, + "learning_rate": 1.7990984502325372e-06, + "loss": -0.0656, + "step": 2670 + }, + { + "epoch": 0.2850361362242313, + "grad_norm": 3.3465245252532885, + "learning_rate": 1.7986504509869974e-06, + "loss": -0.8557, + "step": 2672 + }, + { + "epoch": 0.28524948662559674, + "grad_norm": 9.006299109629683, + "learning_rate": 1.7982020087010725e-06, + "loss": 0.2031, + "step": 2674 + }, + { + "epoch": 0.2854628370269622, + "grad_norm": 10.096558616293146, + "learning_rate": 1.7977531236235296e-06, + "loss": -0.4802, + "step": 2676 + }, + { + "epoch": 0.2856761874283276, + "grad_norm": 10.76728277437739, + "learning_rate": 1.7973037960033823e-06, + "loss": -0.3836, + "step": 2678 + }, + { + "epoch": 0.28588953782969306, + "grad_norm": 10.718588641939473, + "learning_rate": 1.7968540260898884e-06, + "loss": -0.7246, + "step": 2680 + }, + { + "epoch": 0.2861028882310585, + "grad_norm": 9.299732942828491, + "learning_rate": 1.7964038141325515e-06, + "loss": 0.5323, + "step": 2682 + }, + { + "epoch": 0.28631623863242395, + "grad_norm": 3.37327462557939, + "learning_rate": 1.7959531603811207e-06, + "loss": -0.5815, + "step": 2684 + }, + { + "epoch": 0.2865295890337894, + "grad_norm": 9.062964349787686, + "learning_rate": 1.79550206508559e-06, + "loss": -0.8566, + "step": 2686 + }, + { + "epoch": 0.28674293943515483, + "grad_norm": 21.747005582729148, + "learning_rate": 1.7950505284961981e-06, + "loss": -0.5522, + "step": 2688 + }, + { + "epoch": 0.2869562898365203, + "grad_norm": 13.804116674242035, + "learning_rate": 1.7945985508634287e-06, + "loss": -0.8123, + "step": 2690 + }, + { + "epoch": 0.2871696402378857, + "grad_norm": 11.36689447617446, + "learning_rate": 1.7941461324380106e-06, + "loss": -0.0186, + "step": 2692 + }, + { + "epoch": 0.28738299063925116, + "grad_norm": 11.958241755496314, + "learning_rate": 1.7936932734709168e-06, + "loss": 0.1659, + "step": 2694 + }, + { + "epoch": 0.2875963410406166, + "grad_norm": 12.748944057852567, + "learning_rate": 1.793239974213364e-06, + "loss": -0.7666, + "step": 2696 + }, + { + "epoch": 0.28780969144198204, + "grad_norm": 8.14683489850067, + "learning_rate": 1.7927862349168141e-06, + "loss": -0.4021, + "step": 2698 + }, + { + "epoch": 0.2880230418433475, + "grad_norm": 11.944364902156492, + "learning_rate": 1.792332055832973e-06, + "loss": -1.1424, + "step": 2700 + }, + { + "epoch": 0.2882363922447129, + "grad_norm": 5.672731300193287, + "learning_rate": 1.7918774372137896e-06, + "loss": -0.6347, + "step": 2702 + }, + { + "epoch": 0.28844974264607837, + "grad_norm": 14.366058561755963, + "learning_rate": 1.7914223793114577e-06, + "loss": -0.3728, + "step": 2704 + }, + { + "epoch": 0.2886630930474438, + "grad_norm": 10.567273754752735, + "learning_rate": 1.7909668823784147e-06, + "loss": -0.4319, + "step": 2706 + }, + { + "epoch": 0.28887644344880925, + "grad_norm": 5.834302256330665, + "learning_rate": 1.7905109466673412e-06, + "loss": -0.9572, + "step": 2708 + }, + { + "epoch": 0.2890897938501747, + "grad_norm": 15.92558361336889, + "learning_rate": 1.7900545724311611e-06, + "loss": -0.1671, + "step": 2710 + }, + { + "epoch": 0.28930314425154013, + "grad_norm": 9.660706647920104, + "learning_rate": 1.7895977599230417e-06, + "loss": -0.4654, + "step": 2712 + }, + { + "epoch": 0.2895164946529056, + "grad_norm": 7.377295032733003, + "learning_rate": 1.7891405093963937e-06, + "loss": -0.4804, + "step": 2714 + }, + { + "epoch": 0.289729845054271, + "grad_norm": 37.202120366304165, + "learning_rate": 1.78868282110487e-06, + "loss": -0.4606, + "step": 2716 + }, + { + "epoch": 0.28994319545563646, + "grad_norm": 23.357959605269212, + "learning_rate": 1.7882246953023679e-06, + "loss": -0.1408, + "step": 2718 + }, + { + "epoch": 0.2901565458570019, + "grad_norm": 10.563156178485283, + "learning_rate": 1.7877661322430258e-06, + "loss": -0.1044, + "step": 2720 + }, + { + "epoch": 0.29036989625836734, + "grad_norm": 9.829464940192288, + "learning_rate": 1.7873071321812249e-06, + "loss": -1.1245, + "step": 2722 + }, + { + "epoch": 0.2905832466597328, + "grad_norm": 6.523698947178823, + "learning_rate": 1.7868476953715899e-06, + "loss": -0.3949, + "step": 2724 + }, + { + "epoch": 0.2907965970610982, + "grad_norm": 5.315585011273102, + "learning_rate": 1.7863878220689868e-06, + "loss": -0.0797, + "step": 2726 + }, + { + "epoch": 0.29100994746246367, + "grad_norm": 7.9678573078610615, + "learning_rate": 1.7859275125285238e-06, + "loss": -0.5287, + "step": 2728 + }, + { + "epoch": 0.2912232978638291, + "grad_norm": 10.339825352648624, + "learning_rate": 1.7854667670055516e-06, + "loss": -0.4685, + "step": 2730 + }, + { + "epoch": 0.29143664826519455, + "grad_norm": 26.813161324781007, + "learning_rate": 1.7850055857556625e-06, + "loss": 0.0806, + "step": 2732 + }, + { + "epoch": 0.29164999866656, + "grad_norm": 6.19533238402814, + "learning_rate": 1.7845439690346902e-06, + "loss": -0.8934, + "step": 2734 + }, + { + "epoch": 0.29186334906792544, + "grad_norm": 11.935482260443736, + "learning_rate": 1.7840819170987102e-06, + "loss": -0.5398, + "step": 2736 + }, + { + "epoch": 0.2920766994692909, + "grad_norm": 15.208681373343325, + "learning_rate": 1.7836194302040398e-06, + "loss": -0.0419, + "step": 2738 + }, + { + "epoch": 0.2922900498706563, + "grad_norm": 19.377643083984783, + "learning_rate": 1.783156508607237e-06, + "loss": 0.5769, + "step": 2740 + }, + { + "epoch": 0.29250340027202176, + "grad_norm": 7.598054618897178, + "learning_rate": 1.7826931525651015e-06, + "loss": 0.4084, + "step": 2742 + }, + { + "epoch": 0.2927167506733872, + "grad_norm": 9.067483390785537, + "learning_rate": 1.7822293623346734e-06, + "loss": -0.0687, + "step": 2744 + }, + { + "epoch": 0.29293010107475265, + "grad_norm": 7.870027336512355, + "learning_rate": 1.7817651381732341e-06, + "loss": -0.0261, + "step": 2746 + }, + { + "epoch": 0.2931434514761181, + "grad_norm": 13.9173527568906, + "learning_rate": 1.7813004803383058e-06, + "loss": 0.1512, + "step": 2748 + }, + { + "epoch": 0.29335680187748353, + "grad_norm": 11.059540980454912, + "learning_rate": 1.7808353890876507e-06, + "loss": -0.2108, + "step": 2750 + }, + { + "epoch": 0.293570152278849, + "grad_norm": 9.53504786293431, + "learning_rate": 1.7803698646792718e-06, + "loss": -1.0388, + "step": 2752 + }, + { + "epoch": 0.2937835026802144, + "grad_norm": 5.687859737269777, + "learning_rate": 1.7799039073714127e-06, + "loss": 0.0883, + "step": 2754 + }, + { + "epoch": 0.29399685308157986, + "grad_norm": 11.075415172684645, + "learning_rate": 1.7794375174225565e-06, + "loss": 0.3117, + "step": 2756 + }, + { + "epoch": 0.2942102034829453, + "grad_norm": 10.36266916460832, + "learning_rate": 1.7789706950914268e-06, + "loss": -0.2675, + "step": 2758 + }, + { + "epoch": 0.29442355388431074, + "grad_norm": 3.1296807815127035, + "learning_rate": 1.7785034406369865e-06, + "loss": -0.5151, + "step": 2760 + }, + { + "epoch": 0.2946369042856762, + "grad_norm": 6.588878587224345, + "learning_rate": 1.7780357543184393e-06, + "loss": -0.8396, + "step": 2762 + }, + { + "epoch": 0.2948502546870416, + "grad_norm": 12.265817123436767, + "learning_rate": 1.7775676363952275e-06, + "loss": -0.0767, + "step": 2764 + }, + { + "epoch": 0.29506360508840707, + "grad_norm": 12.121176109768198, + "learning_rate": 1.777099087127033e-06, + "loss": -0.3531, + "step": 2766 + }, + { + "epoch": 0.2952769554897725, + "grad_norm": 5.4020371196605845, + "learning_rate": 1.7766301067737773e-06, + "loss": -0.2694, + "step": 2768 + }, + { + "epoch": 0.29549030589113795, + "grad_norm": 15.797755880753726, + "learning_rate": 1.77616069559562e-06, + "loss": 0.2502, + "step": 2770 + }, + { + "epoch": 0.2957036562925034, + "grad_norm": 23.07562871046939, + "learning_rate": 1.7756908538529615e-06, + "loss": 0.0013, + "step": 2772 + }, + { + "epoch": 0.29591700669386883, + "grad_norm": 11.984050959618228, + "learning_rate": 1.77522058180644e-06, + "loss": -0.8811, + "step": 2774 + }, + { + "epoch": 0.2961303570952343, + "grad_norm": 16.357199041219108, + "learning_rate": 1.7747498797169318e-06, + "loss": -1.491, + "step": 2776 + }, + { + "epoch": 0.2963437074965997, + "grad_norm": 4.30674861193802, + "learning_rate": 1.7742787478455527e-06, + "loss": 0.2769, + "step": 2778 + }, + { + "epoch": 0.29655705789796516, + "grad_norm": 10.117858674169552, + "learning_rate": 1.773807186453657e-06, + "loss": -1.2981, + "step": 2780 + }, + { + "epoch": 0.2967704082993306, + "grad_norm": 8.275792095536548, + "learning_rate": 1.7733351958028368e-06, + "loss": -0.4654, + "step": 2782 + }, + { + "epoch": 0.29698375870069604, + "grad_norm": 9.879106180048032, + "learning_rate": 1.772862776154922e-06, + "loss": -1.4983, + "step": 2784 + }, + { + "epoch": 0.2971971091020615, + "grad_norm": 9.259990158226586, + "learning_rate": 1.7723899277719813e-06, + "loss": -0.6973, + "step": 2786 + }, + { + "epoch": 0.2974104595034269, + "grad_norm": 10.94242036226749, + "learning_rate": 1.7719166509163208e-06, + "loss": -0.0928, + "step": 2788 + }, + { + "epoch": 0.29762380990479237, + "grad_norm": 15.192848863804105, + "learning_rate": 1.7714429458504843e-06, + "loss": -0.5346, + "step": 2790 + }, + { + "epoch": 0.2978371603061578, + "grad_norm": 6.420983020541523, + "learning_rate": 1.7709688128372533e-06, + "loss": 0.3499, + "step": 2792 + }, + { + "epoch": 0.29805051070752325, + "grad_norm": 6.910838026896336, + "learning_rate": 1.7704942521396464e-06, + "loss": -0.8162, + "step": 2794 + }, + { + "epoch": 0.2982638611088887, + "grad_norm": 7.972387086269549, + "learning_rate": 1.7700192640209198e-06, + "loss": -0.7928, + "step": 2796 + }, + { + "epoch": 0.29847721151025414, + "grad_norm": 5.895304208647447, + "learning_rate": 1.7695438487445667e-06, + "loss": 0.6118, + "step": 2798 + }, + { + "epoch": 0.2986905619116196, + "grad_norm": 16.971841564182863, + "learning_rate": 1.7690680065743166e-06, + "loss": -0.203, + "step": 2800 + }, + { + "epoch": 0.298903912312985, + "grad_norm": 9.997538057225917, + "learning_rate": 1.7685917377741374e-06, + "loss": 0.2321, + "step": 2802 + }, + { + "epoch": 0.29911726271435046, + "grad_norm": 9.727151862129713, + "learning_rate": 1.768115042608232e-06, + "loss": -1.061, + "step": 2804 + }, + { + "epoch": 0.2993306131157159, + "grad_norm": 9.132276188575766, + "learning_rate": 1.7676379213410407e-06, + "loss": -0.2183, + "step": 2806 + }, + { + "epoch": 0.29954396351708135, + "grad_norm": 12.67501067288393, + "learning_rate": 1.7671603742372398e-06, + "loss": -1.6843, + "step": 2808 + }, + { + "epoch": 0.2997573139184468, + "grad_norm": 10.177987953982509, + "learning_rate": 1.7666824015617425e-06, + "loss": -0.0483, + "step": 2810 + }, + { + "epoch": 0.29997066431981223, + "grad_norm": 7.060156372400542, + "learning_rate": 1.766204003579697e-06, + "loss": -0.4331, + "step": 2812 + }, + { + "epoch": 0.30018401472117767, + "grad_norm": 11.335971311807478, + "learning_rate": 1.7657251805564885e-06, + "loss": -0.7517, + "step": 2814 + }, + { + "epoch": 0.3003973651225431, + "grad_norm": 5.284520008504851, + "learning_rate": 1.7652459327577373e-06, + "loss": -0.8055, + "step": 2816 + }, + { + "epoch": 0.30061071552390856, + "grad_norm": 10.439845990531815, + "learning_rate": 1.7647662604492993e-06, + "loss": -0.0944, + "step": 2818 + }, + { + "epoch": 0.300824065925274, + "grad_norm": 3.922650075484994, + "learning_rate": 1.7642861638972665e-06, + "loss": -0.1679, + "step": 2820 + }, + { + "epoch": 0.30103741632663944, + "grad_norm": 11.350909450263366, + "learning_rate": 1.763805643367966e-06, + "loss": -0.326, + "step": 2822 + }, + { + "epoch": 0.3012507667280049, + "grad_norm": 7.4238523632800835, + "learning_rate": 1.7633246991279594e-06, + "loss": 0.3381, + "step": 2824 + }, + { + "epoch": 0.3014641171293703, + "grad_norm": 32.23675167083024, + "learning_rate": 1.7628433314440444e-06, + "loss": -0.2843, + "step": 2826 + }, + { + "epoch": 0.30167746753073577, + "grad_norm": 12.832381120294913, + "learning_rate": 1.762361540583253e-06, + "loss": -0.4265, + "step": 2828 + }, + { + "epoch": 0.3018908179321012, + "grad_norm": 10.924992511434162, + "learning_rate": 1.7618793268128518e-06, + "loss": -0.6991, + "step": 2830 + }, + { + "epoch": 0.3021041683334667, + "grad_norm": 16.745328435712025, + "learning_rate": 1.7613966904003425e-06, + "loss": -0.5289, + "step": 2832 + }, + { + "epoch": 0.30231751873483215, + "grad_norm": 20.337099351092906, + "learning_rate": 1.7609136316134614e-06, + "loss": -0.6279, + "step": 2834 + }, + { + "epoch": 0.3025308691361976, + "grad_norm": 9.126098630479362, + "learning_rate": 1.7604301507201781e-06, + "loss": -0.1357, + "step": 2836 + }, + { + "epoch": 0.30274421953756303, + "grad_norm": 9.559078970254996, + "learning_rate": 1.7599462479886973e-06, + "loss": -0.1126, + "step": 2838 + }, + { + "epoch": 0.3029575699389285, + "grad_norm": 9.296157012615843, + "learning_rate": 1.7594619236874575e-06, + "loss": 0.0803, + "step": 2840 + }, + { + "epoch": 0.3031709203402939, + "grad_norm": 10.55814750292512, + "learning_rate": 1.7589771780851309e-06, + "loss": -0.1387, + "step": 2842 + }, + { + "epoch": 0.30338427074165936, + "grad_norm": 10.319013610986776, + "learning_rate": 1.7584920114506233e-06, + "loss": -0.6003, + "step": 2844 + }, + { + "epoch": 0.3035976211430248, + "grad_norm": 6.106139905207281, + "learning_rate": 1.7580064240530744e-06, + "loss": -0.1054, + "step": 2846 + }, + { + "epoch": 0.30381097154439024, + "grad_norm": 6.703392798878849, + "learning_rate": 1.757520416161857e-06, + "loss": 0.8563, + "step": 2848 + }, + { + "epoch": 0.3040243219457557, + "grad_norm": 14.825477653821945, + "learning_rate": 1.7570339880465776e-06, + "loss": -1.1311, + "step": 2850 + }, + { + "epoch": 0.3042376723471211, + "grad_norm": 6.266679277486579, + "learning_rate": 1.7565471399770752e-06, + "loss": 0.3022, + "step": 2852 + }, + { + "epoch": 0.30445102274848657, + "grad_norm": 6.249314091321261, + "learning_rate": 1.7560598722234224e-06, + "loss": -1.3, + "step": 2854 + }, + { + "epoch": 0.304664373149852, + "grad_norm": 12.716491627386429, + "learning_rate": 1.755572185055924e-06, + "loss": -1.0244, + "step": 2856 + }, + { + "epoch": 0.30487772355121745, + "grad_norm": 9.90411282680557, + "learning_rate": 1.755084078745118e-06, + "loss": 0.1782, + "step": 2858 + }, + { + "epoch": 0.3050910739525829, + "grad_norm": 7.771238430103722, + "learning_rate": 1.7545955535617742e-06, + "loss": -0.3729, + "step": 2860 + }, + { + "epoch": 0.30530442435394833, + "grad_norm": 4.718467771085636, + "learning_rate": 1.754106609776896e-06, + "loss": -0.2853, + "step": 2862 + }, + { + "epoch": 0.3055177747553138, + "grad_norm": 9.705759418403023, + "learning_rate": 1.7536172476617178e-06, + "loss": -0.3391, + "step": 2864 + }, + { + "epoch": 0.3057311251566792, + "grad_norm": 9.911575437619387, + "learning_rate": 1.7531274674877067e-06, + "loss": -0.8393, + "step": 2866 + }, + { + "epoch": 0.30594447555804466, + "grad_norm": 8.461639774831735, + "learning_rate": 1.7526372695265616e-06, + "loss": -1.3139, + "step": 2868 + }, + { + "epoch": 0.3061578259594101, + "grad_norm": 13.328466601854974, + "learning_rate": 1.7521466540502127e-06, + "loss": -0.0534, + "step": 2870 + }, + { + "epoch": 0.30637117636077554, + "grad_norm": 7.063127903865726, + "learning_rate": 1.7516556213308226e-06, + "loss": -0.7124, + "step": 2872 + }, + { + "epoch": 0.306584526762141, + "grad_norm": 12.688427899475334, + "learning_rate": 1.7511641716407849e-06, + "loss": -0.7787, + "step": 2874 + }, + { + "epoch": 0.3067978771635064, + "grad_norm": 6.750082152092763, + "learning_rate": 1.7506723052527242e-06, + "loss": -0.0491, + "step": 2876 + }, + { + "epoch": 0.30701122756487187, + "grad_norm": 9.287368380996943, + "learning_rate": 1.7501800224394972e-06, + "loss": -0.3771, + "step": 2878 + }, + { + "epoch": 0.3072245779662373, + "grad_norm": 10.161814155595016, + "learning_rate": 1.7496873234741907e-06, + "loss": 0.3586, + "step": 2880 + }, + { + "epoch": 0.30743792836760275, + "grad_norm": 7.410928339783802, + "learning_rate": 1.7491942086301225e-06, + "loss": -0.405, + "step": 2882 + }, + { + "epoch": 0.3076512787689682, + "grad_norm": 15.260363202263452, + "learning_rate": 1.748700678180842e-06, + "loss": 0.2674, + "step": 2884 + }, + { + "epoch": 0.30786462917033364, + "grad_norm": 5.744444781485549, + "learning_rate": 1.7482067324001277e-06, + "loss": -0.1325, + "step": 2886 + }, + { + "epoch": 0.3080779795716991, + "grad_norm": 10.467980026087549, + "learning_rate": 1.7477123715619897e-06, + "loss": 0.2441, + "step": 2888 + }, + { + "epoch": 0.3082913299730645, + "grad_norm": 5.7283079759989946, + "learning_rate": 1.7472175959406677e-06, + "loss": -0.1382, + "step": 2890 + }, + { + "epoch": 0.30850468037442996, + "grad_norm": 9.41465115620282, + "learning_rate": 1.7467224058106317e-06, + "loss": -0.0935, + "step": 2892 + }, + { + "epoch": 0.3087180307757954, + "grad_norm": 15.649903160325573, + "learning_rate": 1.746226801446582e-06, + "loss": 0.1144, + "step": 2894 + }, + { + "epoch": 0.30893138117716085, + "grad_norm": 11.300775186136208, + "learning_rate": 1.7457307831234475e-06, + "loss": 0.8083, + "step": 2896 + }, + { + "epoch": 0.3091447315785263, + "grad_norm": 10.861551087357146, + "learning_rate": 1.7452343511163883e-06, + "loss": -0.5417, + "step": 2898 + }, + { + "epoch": 0.30935808197989173, + "grad_norm": 11.554703698801825, + "learning_rate": 1.744737505700793e-06, + "loss": -0.3645, + "step": 2900 + }, + { + "epoch": 0.30957143238125717, + "grad_norm": 16.26518600294038, + "learning_rate": 1.74424024715228e-06, + "loss": 0.0521, + "step": 2902 + }, + { + "epoch": 0.3097847827826226, + "grad_norm": 10.730179703460292, + "learning_rate": 1.743742575746696e-06, + "loss": -0.7332, + "step": 2904 + }, + { + "epoch": 0.30999813318398806, + "grad_norm": 4.79248186748041, + "learning_rate": 1.743244491760118e-06, + "loss": 0.1332, + "step": 2906 + }, + { + "epoch": 0.3102114835853535, + "grad_norm": 10.493873462890468, + "learning_rate": 1.7427459954688515e-06, + "loss": -0.0377, + "step": 2908 + }, + { + "epoch": 0.31042483398671894, + "grad_norm": 3.7802123937555874, + "learning_rate": 1.7422470871494298e-06, + "loss": -0.4143, + "step": 2910 + }, + { + "epoch": 0.3106381843880844, + "grad_norm": 6.058440256632867, + "learning_rate": 1.7417477670786156e-06, + "loss": -0.9159, + "step": 2912 + }, + { + "epoch": 0.3108515347894498, + "grad_norm": 13.218030730490323, + "learning_rate": 1.7412480355334002e-06, + "loss": -1.3396, + "step": 2914 + }, + { + "epoch": 0.31106488519081527, + "grad_norm": 5.935886247031169, + "learning_rate": 1.7407478927910023e-06, + "loss": -0.9298, + "step": 2916 + }, + { + "epoch": 0.3112782355921807, + "grad_norm": 7.006989263482837, + "learning_rate": 1.7402473391288699e-06, + "loss": 0.3035, + "step": 2918 + }, + { + "epoch": 0.31149158599354615, + "grad_norm": 8.191774890066174, + "learning_rate": 1.7397463748246778e-06, + "loss": -0.9406, + "step": 2920 + }, + { + "epoch": 0.3117049363949116, + "grad_norm": 5.363753912381776, + "learning_rate": 1.739245000156329e-06, + "loss": -0.3519, + "step": 2922 + }, + { + "epoch": 0.31191828679627703, + "grad_norm": 12.526973910552128, + "learning_rate": 1.7387432154019548e-06, + "loss": 0.0832, + "step": 2924 + }, + { + "epoch": 0.3121316371976425, + "grad_norm": 38.681759515200184, + "learning_rate": 1.7382410208399127e-06, + "loss": -0.3007, + "step": 2926 + }, + { + "epoch": 0.3123449875990079, + "grad_norm": 5.437476752704303, + "learning_rate": 1.737738416748789e-06, + "loss": -0.2634, + "step": 2928 + }, + { + "epoch": 0.31255833800037336, + "grad_norm": 5.666548423554775, + "learning_rate": 1.7372354034073956e-06, + "loss": -1.0394, + "step": 2930 + }, + { + "epoch": 0.3127716884017388, + "grad_norm": 8.411785231022325, + "learning_rate": 1.7367319810947727e-06, + "loss": -0.3104, + "step": 2932 + }, + { + "epoch": 0.31298503880310424, + "grad_norm": 12.092899913849372, + "learning_rate": 1.736228150090187e-06, + "loss": 0.3813, + "step": 2934 + }, + { + "epoch": 0.3131983892044697, + "grad_norm": 13.994107808942296, + "learning_rate": 1.7357239106731317e-06, + "loss": -0.7699, + "step": 2936 + }, + { + "epoch": 0.3134117396058351, + "grad_norm": 10.73474084696605, + "learning_rate": 1.7352192631233265e-06, + "loss": -1.6973, + "step": 2938 + }, + { + "epoch": 0.31362509000720057, + "grad_norm": 8.147344424934365, + "learning_rate": 1.7347142077207182e-06, + "loss": -0.0864, + "step": 2940 + }, + { + "epoch": 0.313838440408566, + "grad_norm": 9.544503195295048, + "learning_rate": 1.734208744745479e-06, + "loss": 0.463, + "step": 2942 + }, + { + "epoch": 0.31405179080993145, + "grad_norm": 19.64215598986802, + "learning_rate": 1.7337028744780077e-06, + "loss": 0.1109, + "step": 2944 + }, + { + "epoch": 0.3142651412112969, + "grad_norm": 9.966014481466955, + "learning_rate": 1.7331965971989289e-06, + "loss": -0.3279, + "step": 2946 + }, + { + "epoch": 0.31447849161266234, + "grad_norm": 11.93520274747805, + "learning_rate": 1.7326899131890924e-06, + "loss": 0.5031, + "step": 2948 + }, + { + "epoch": 0.3146918420140278, + "grad_norm": 13.108906815251888, + "learning_rate": 1.7321828227295752e-06, + "loss": -0.6378, + "step": 2950 + }, + { + "epoch": 0.3149051924153932, + "grad_norm": 10.8908721251518, + "learning_rate": 1.7316753261016782e-06, + "loss": 0.5964, + "step": 2952 + }, + { + "epoch": 0.31511854281675866, + "grad_norm": 8.5240740399807, + "learning_rate": 1.7311674235869283e-06, + "loss": -0.5387, + "step": 2954 + }, + { + "epoch": 0.3153318932181241, + "grad_norm": 11.602750987337258, + "learning_rate": 1.7306591154670775e-06, + "loss": -0.4253, + "step": 2956 + }, + { + "epoch": 0.31554524361948955, + "grad_norm": 16.704761353033927, + "learning_rate": 1.7301504020241026e-06, + "loss": -0.0403, + "step": 2958 + }, + { + "epoch": 0.315758594020855, + "grad_norm": 10.078151447346247, + "learning_rate": 1.7296412835402055e-06, + "loss": -0.4997, + "step": 2960 + }, + { + "epoch": 0.31597194442222043, + "grad_norm": 4.913264407339711, + "learning_rate": 1.7291317602978129e-06, + "loss": 0.002, + "step": 2962 + }, + { + "epoch": 0.31618529482358587, + "grad_norm": 19.58741422185664, + "learning_rate": 1.7286218325795754e-06, + "loss": -1.2726, + "step": 2964 + }, + { + "epoch": 0.3163986452249513, + "grad_norm": 10.98228012098515, + "learning_rate": 1.7281115006683687e-06, + "loss": 0.8124, + "step": 2966 + }, + { + "epoch": 0.31661199562631676, + "grad_norm": 16.450320792354887, + "learning_rate": 1.7276007648472926e-06, + "loss": -0.0522, + "step": 2968 + }, + { + "epoch": 0.3168253460276822, + "grad_norm": 3.200159272645382, + "learning_rate": 1.7270896253996703e-06, + "loss": -0.5279, + "step": 2970 + }, + { + "epoch": 0.31703869642904764, + "grad_norm": 5.463940504180102, + "learning_rate": 1.7265780826090497e-06, + "loss": 0.117, + "step": 2972 + }, + { + "epoch": 0.3172520468304131, + "grad_norm": 4.42207369617342, + "learning_rate": 1.726066136759202e-06, + "loss": -0.5697, + "step": 2974 + }, + { + "epoch": 0.3174653972317785, + "grad_norm": 7.430476701092665, + "learning_rate": 1.7255537881341225e-06, + "loss": -0.1709, + "step": 2976 + }, + { + "epoch": 0.31767874763314397, + "grad_norm": 4.901152303480771, + "learning_rate": 1.7250410370180292e-06, + "loss": 0.0231, + "step": 2978 + }, + { + "epoch": 0.3178920980345094, + "grad_norm": 12.64586858767348, + "learning_rate": 1.724527883695364e-06, + "loss": 0.1277, + "step": 2980 + }, + { + "epoch": 0.31810544843587485, + "grad_norm": 7.147242560376244, + "learning_rate": 1.7240143284507916e-06, + "loss": 0.1304, + "step": 2982 + }, + { + "epoch": 0.3183187988372403, + "grad_norm": 12.222789057303444, + "learning_rate": 1.7235003715691994e-06, + "loss": -1.3749, + "step": 2984 + }, + { + "epoch": 0.31853214923860573, + "grad_norm": 4.117811931706003, + "learning_rate": 1.7229860133356985e-06, + "loss": 0.1378, + "step": 2986 + }, + { + "epoch": 0.3187454996399712, + "grad_norm": 54.422041496710804, + "learning_rate": 1.7224712540356215e-06, + "loss": -0.3549, + "step": 2988 + }, + { + "epoch": 0.3189588500413366, + "grad_norm": 9.11752175587024, + "learning_rate": 1.7219560939545242e-06, + "loss": -0.129, + "step": 2990 + }, + { + "epoch": 0.31917220044270206, + "grad_norm": 7.967581604621134, + "learning_rate": 1.721440533378185e-06, + "loss": -0.8812, + "step": 2992 + }, + { + "epoch": 0.3193855508440675, + "grad_norm": 21.432871523933642, + "learning_rate": 1.720924572592604e-06, + "loss": -1.1379, + "step": 2994 + }, + { + "epoch": 0.31959890124543294, + "grad_norm": 6.955067111293734, + "learning_rate": 1.720408211884003e-06, + "loss": -0.3025, + "step": 2996 + }, + { + "epoch": 0.3198122516467984, + "grad_norm": 6.096470591642468, + "learning_rate": 1.719891451538826e-06, + "loss": -0.9891, + "step": 2998 + }, + { + "epoch": 0.3200256020481638, + "grad_norm": 6.531527814141025, + "learning_rate": 1.719374291843739e-06, + "loss": -0.9327, + "step": 3000 + }, + { + "epoch": 0.32023895244952927, + "grad_norm": 13.386255122599865, + "learning_rate": 1.7188567330856291e-06, + "loss": -1.2464, + "step": 3002 + }, + { + "epoch": 0.32045230285089477, + "grad_norm": 12.426470616235903, + "learning_rate": 1.7183387755516053e-06, + "loss": -0.6217, + "step": 3004 + }, + { + "epoch": 0.3206656532522602, + "grad_norm": 16.304558071825454, + "learning_rate": 1.717820419528997e-06, + "loss": -0.6603, + "step": 3006 + }, + { + "epoch": 0.32087900365362565, + "grad_norm": 4.264332470799236, + "learning_rate": 1.717301665305355e-06, + "loss": -1.1113, + "step": 3008 + }, + { + "epoch": 0.3210923540549911, + "grad_norm": 6.085407743169536, + "learning_rate": 1.7167825131684511e-06, + "loss": -0.1234, + "step": 3010 + }, + { + "epoch": 0.32130570445635653, + "grad_norm": 6.295542219401869, + "learning_rate": 1.7162629634062783e-06, + "loss": 0.4483, + "step": 3012 + }, + { + "epoch": 0.321519054857722, + "grad_norm": 4.0286633634210185, + "learning_rate": 1.7157430163070496e-06, + "loss": -0.5352, + "step": 3014 + }, + { + "epoch": 0.3217324052590874, + "grad_norm": 6.32486146002465, + "learning_rate": 1.7152226721591983e-06, + "loss": -0.6175, + "step": 3016 + }, + { + "epoch": 0.32194575566045286, + "grad_norm": 7.400820922346224, + "learning_rate": 1.714701931251378e-06, + "loss": -1.5838, + "step": 3018 + }, + { + "epoch": 0.3221591060618183, + "grad_norm": 8.099890893582849, + "learning_rate": 1.7141807938724627e-06, + "loss": 0.2871, + "step": 3020 + }, + { + "epoch": 0.32237245646318374, + "grad_norm": 9.803631301093835, + "learning_rate": 1.7136592603115466e-06, + "loss": -0.597, + "step": 3022 + }, + { + "epoch": 0.3225858068645492, + "grad_norm": 8.179724001097899, + "learning_rate": 1.7131373308579428e-06, + "loss": -0.0799, + "step": 3024 + }, + { + "epoch": 0.3227991572659146, + "grad_norm": 7.858218053119798, + "learning_rate": 1.7126150058011848e-06, + "loss": 1.1274, + "step": 3026 + }, + { + "epoch": 0.32301250766728007, + "grad_norm": 4.725656860251144, + "learning_rate": 1.7120922854310256e-06, + "loss": -1.4562, + "step": 3028 + }, + { + "epoch": 0.3232258580686455, + "grad_norm": 6.159953465853282, + "learning_rate": 1.7115691700374363e-06, + "loss": -1.3495, + "step": 3030 + }, + { + "epoch": 0.32343920847001095, + "grad_norm": 3.4620591869840656, + "learning_rate": 1.7110456599106084e-06, + "loss": 0.1381, + "step": 3032 + }, + { + "epoch": 0.3236525588713764, + "grad_norm": 5.003531400077255, + "learning_rate": 1.7105217553409522e-06, + "loss": 0.0701, + "step": 3034 + }, + { + "epoch": 0.32386590927274184, + "grad_norm": 11.693640569297688, + "learning_rate": 1.7099974566190962e-06, + "loss": -0.7734, + "step": 3036 + }, + { + "epoch": 0.3240792596741073, + "grad_norm": 39.088857397311756, + "learning_rate": 1.7094727640358881e-06, + "loss": -0.3107, + "step": 3038 + }, + { + "epoch": 0.3242926100754727, + "grad_norm": 5.443276841043003, + "learning_rate": 1.708947677882394e-06, + "loss": -0.0289, + "step": 3040 + }, + { + "epoch": 0.32450596047683816, + "grad_norm": 8.217257490657822, + "learning_rate": 1.7084221984498982e-06, + "loss": -0.0188, + "step": 3042 + }, + { + "epoch": 0.3247193108782036, + "grad_norm": 17.781485459450163, + "learning_rate": 1.7078963260299028e-06, + "loss": -0.0999, + "step": 3044 + }, + { + "epoch": 0.32493266127956905, + "grad_norm": 4.453734215905154, + "learning_rate": 1.7073700609141294e-06, + "loss": -1.0199, + "step": 3046 + }, + { + "epoch": 0.3251460116809345, + "grad_norm": 8.784767018086846, + "learning_rate": 1.706843403394515e-06, + "loss": 0.4665, + "step": 3048 + }, + { + "epoch": 0.32535936208229993, + "grad_norm": 14.527151035435475, + "learning_rate": 1.7063163537632165e-06, + "loss": -0.3844, + "step": 3050 + }, + { + "epoch": 0.32557271248366537, + "grad_norm": 9.315999314442346, + "learning_rate": 1.7057889123126073e-06, + "loss": -0.0373, + "step": 3052 + }, + { + "epoch": 0.3257860628850308, + "grad_norm": 5.5314193199282045, + "learning_rate": 1.705261079335278e-06, + "loss": 1.1543, + "step": 3054 + }, + { + "epoch": 0.32599941328639626, + "grad_norm": 24.13469506045593, + "learning_rate": 1.7047328551240368e-06, + "loss": 0.387, + "step": 3056 + }, + { + "epoch": 0.3262127636877617, + "grad_norm": 6.156294112263579, + "learning_rate": 1.7042042399719093e-06, + "loss": 0.2246, + "step": 3058 + }, + { + "epoch": 0.32642611408912714, + "grad_norm": 8.841168736939753, + "learning_rate": 1.7036752341721373e-06, + "loss": -0.3831, + "step": 3060 + }, + { + "epoch": 0.3266394644904926, + "grad_norm": 9.640386934892431, + "learning_rate": 1.703145838018179e-06, + "loss": -0.5793, + "step": 3062 + }, + { + "epoch": 0.326852814891858, + "grad_norm": 8.410223740218925, + "learning_rate": 1.7026160518037103e-06, + "loss": 0.2049, + "step": 3064 + }, + { + "epoch": 0.32706616529322347, + "grad_norm": 12.439400849952515, + "learning_rate": 1.7020858758226226e-06, + "loss": -0.5213, + "step": 3066 + }, + { + "epoch": 0.3272795156945889, + "grad_norm": 8.26514321433625, + "learning_rate": 1.7015553103690235e-06, + "loss": -1.1538, + "step": 3068 + }, + { + "epoch": 0.32749286609595435, + "grad_norm": 10.704114888232002, + "learning_rate": 1.7010243557372372e-06, + "loss": -0.7313, + "step": 3070 + }, + { + "epoch": 0.3277062164973198, + "grad_norm": 9.543709665128198, + "learning_rate": 1.7004930122218038e-06, + "loss": -0.829, + "step": 3072 + }, + { + "epoch": 0.32791956689868523, + "grad_norm": 10.209561525417685, + "learning_rate": 1.6999612801174781e-06, + "loss": 0.1054, + "step": 3074 + }, + { + "epoch": 0.3281329173000507, + "grad_norm": 11.842439957247946, + "learning_rate": 1.6994291597192318e-06, + "loss": 0.0487, + "step": 3076 + }, + { + "epoch": 0.3283462677014161, + "grad_norm": 7.530227211908117, + "learning_rate": 1.6988966513222511e-06, + "loss": -0.6392, + "step": 3078 + }, + { + "epoch": 0.32855961810278156, + "grad_norm": 9.39009088547636, + "learning_rate": 1.698363755221938e-06, + "loss": -1.8414, + "step": 3080 + }, + { + "epoch": 0.328772968504147, + "grad_norm": 4.590873881955238, + "learning_rate": 1.6978304717139086e-06, + "loss": 0.3225, + "step": 3082 + }, + { + "epoch": 0.32898631890551244, + "grad_norm": 9.541413046775592, + "learning_rate": 1.6972968010939952e-06, + "loss": -0.7564, + "step": 3084 + }, + { + "epoch": 0.3291996693068779, + "grad_norm": 10.451942737829206, + "learning_rate": 1.6967627436582444e-06, + "loss": -1.5702, + "step": 3086 + }, + { + "epoch": 0.3294130197082433, + "grad_norm": 4.304025094497326, + "learning_rate": 1.696228299702917e-06, + "loss": 0.4697, + "step": 3088 + }, + { + "epoch": 0.32962637010960877, + "grad_norm": 19.13994189431934, + "learning_rate": 1.6956934695244884e-06, + "loss": -0.4502, + "step": 3090 + }, + { + "epoch": 0.3298397205109742, + "grad_norm": 20.606001811708634, + "learning_rate": 1.6951582534196485e-06, + "loss": -0.4758, + "step": 3092 + }, + { + "epoch": 0.33005307091233965, + "grad_norm": 15.001606048337031, + "learning_rate": 1.694622651685301e-06, + "loss": 0.0416, + "step": 3094 + }, + { + "epoch": 0.3302664213137051, + "grad_norm": 5.061720578939279, + "learning_rate": 1.6940866646185633e-06, + "loss": 0.1878, + "step": 3096 + }, + { + "epoch": 0.33047977171507054, + "grad_norm": 2.9825654154910746, + "learning_rate": 1.6935502925167673e-06, + "loss": -0.5476, + "step": 3098 + }, + { + "epoch": 0.330693122116436, + "grad_norm": 4.4437366020666555, + "learning_rate": 1.6930135356774578e-06, + "loss": -0.3807, + "step": 3100 + }, + { + "epoch": 0.3309064725178014, + "grad_norm": 18.504456243702883, + "learning_rate": 1.6924763943983935e-06, + "loss": -0.9451, + "step": 3102 + }, + { + "epoch": 0.33111982291916686, + "grad_norm": 3.671361377254186, + "learning_rate": 1.6919388689775461e-06, + "loss": 0.0806, + "step": 3104 + }, + { + "epoch": 0.3313331733205323, + "grad_norm": 14.550844288867811, + "learning_rate": 1.6914009597131007e-06, + "loss": 0.3961, + "step": 3106 + }, + { + "epoch": 0.33154652372189775, + "grad_norm": 22.069924951701303, + "learning_rate": 1.6908626669034543e-06, + "loss": -0.5638, + "step": 3108 + }, + { + "epoch": 0.3317598741232632, + "grad_norm": 12.589023008755671, + "learning_rate": 1.6903239908472182e-06, + "loss": -0.3152, + "step": 3110 + }, + { + "epoch": 0.33197322452462863, + "grad_norm": 9.110466158707583, + "learning_rate": 1.6897849318432153e-06, + "loss": -0.3695, + "step": 3112 + }, + { + "epoch": 0.33218657492599407, + "grad_norm": 5.785915465359906, + "learning_rate": 1.6892454901904813e-06, + "loss": -0.6666, + "step": 3114 + }, + { + "epoch": 0.3323999253273595, + "grad_norm": 5.477117496845735, + "learning_rate": 1.6887056661882641e-06, + "loss": -1.2713, + "step": 3116 + }, + { + "epoch": 0.33261327572872496, + "grad_norm": 9.84926747892214, + "learning_rate": 1.6881654601360234e-06, + "loss": -0.0477, + "step": 3118 + }, + { + "epoch": 0.3328266261300904, + "grad_norm": 16.69992366831979, + "learning_rate": 1.6876248723334313e-06, + "loss": -0.3342, + "step": 3120 + }, + { + "epoch": 0.33303997653145584, + "grad_norm": 6.341684913031641, + "learning_rate": 1.6870839030803712e-06, + "loss": -0.6441, + "step": 3122 + }, + { + "epoch": 0.3332533269328213, + "grad_norm": 10.15617511635818, + "learning_rate": 1.6865425526769386e-06, + "loss": -0.5041, + "step": 3124 + }, + { + "epoch": 0.3334666773341867, + "grad_norm": 9.141269849161347, + "learning_rate": 1.6860008214234402e-06, + "loss": -0.3129, + "step": 3126 + }, + { + "epoch": 0.33368002773555216, + "grad_norm": 7.650238140142207, + "learning_rate": 1.685458709620394e-06, + "loss": -0.2322, + "step": 3128 + }, + { + "epoch": 0.3338933781369176, + "grad_norm": 11.737458965973284, + "learning_rate": 1.6849162175685294e-06, + "loss": -0.0456, + "step": 3130 + }, + { + "epoch": 0.33410672853828305, + "grad_norm": 12.680108128616446, + "learning_rate": 1.684373345568786e-06, + "loss": -0.3259, + "step": 3132 + }, + { + "epoch": 0.3343200789396485, + "grad_norm": 5.677739809426428, + "learning_rate": 1.6838300939223141e-06, + "loss": -0.2712, + "step": 3134 + }, + { + "epoch": 0.33453342934101393, + "grad_norm": 3.267972945195637, + "learning_rate": 1.6832864629304763e-06, + "loss": -1.0814, + "step": 3136 + }, + { + "epoch": 0.3347467797423794, + "grad_norm": 5.461825531004228, + "learning_rate": 1.6827424528948436e-06, + "loss": -0.5526, + "step": 3138 + }, + { + "epoch": 0.3349601301437448, + "grad_norm": 4.085213264779208, + "learning_rate": 1.6821980641171982e-06, + "loss": -1.2285, + "step": 3140 + }, + { + "epoch": 0.33517348054511026, + "grad_norm": 13.538985422878795, + "learning_rate": 1.6816532968995328e-06, + "loss": -0.0317, + "step": 3142 + }, + { + "epoch": 0.3353868309464757, + "grad_norm": 12.161389939811688, + "learning_rate": 1.681108151544049e-06, + "loss": -0.4708, + "step": 3144 + }, + { + "epoch": 0.33560018134784114, + "grad_norm": 12.473989679063589, + "learning_rate": 1.680562628353159e-06, + "loss": 0.0139, + "step": 3146 + }, + { + "epoch": 0.3358135317492066, + "grad_norm": 3.881838770986193, + "learning_rate": 1.6800167276294838e-06, + "loss": 0.1553, + "step": 3148 + }, + { + "epoch": 0.336026882150572, + "grad_norm": 3.5700319886575165, + "learning_rate": 1.6794704496758552e-06, + "loss": -0.3496, + "step": 3150 + }, + { + "epoch": 0.33624023255193747, + "grad_norm": 5.437524211916905, + "learning_rate": 1.6789237947953127e-06, + "loss": -0.045, + "step": 3152 + }, + { + "epoch": 0.3364535829533029, + "grad_norm": 6.869488758037986, + "learning_rate": 1.678376763291106e-06, + "loss": 0.2861, + "step": 3154 + }, + { + "epoch": 0.33666693335466835, + "grad_norm": 17.03259128960244, + "learning_rate": 1.677829355466693e-06, + "loss": -0.1362, + "step": 3156 + }, + { + "epoch": 0.3368802837560338, + "grad_norm": 7.462810224452117, + "learning_rate": 1.6772815716257411e-06, + "loss": 0.0214, + "step": 3158 + }, + { + "epoch": 0.33709363415739924, + "grad_norm": 6.449753709865442, + "learning_rate": 1.6767334120721253e-06, + "loss": -0.9799, + "step": 3160 + }, + { + "epoch": 0.3373069845587647, + "grad_norm": 21.181475486069445, + "learning_rate": 1.67618487710993e-06, + "loss": 0.4387, + "step": 3162 + }, + { + "epoch": 0.3375203349601301, + "grad_norm": 9.726531112397971, + "learning_rate": 1.6756359670434477e-06, + "loss": 0.4666, + "step": 3164 + }, + { + "epoch": 0.33773368536149556, + "grad_norm": 15.058426809242238, + "learning_rate": 1.6750866821771782e-06, + "loss": -0.2678, + "step": 3166 + }, + { + "epoch": 0.337947035762861, + "grad_norm": 15.489372020694978, + "learning_rate": 1.67453702281583e-06, + "loss": 0.3489, + "step": 3168 + }, + { + "epoch": 0.33816038616422645, + "grad_norm": 7.857511838340049, + "learning_rate": 1.6739869892643189e-06, + "loss": -0.0367, + "step": 3170 + }, + { + "epoch": 0.3383737365655919, + "grad_norm": 24.22553807035829, + "learning_rate": 1.6734365818277684e-06, + "loss": 0.2624, + "step": 3172 + }, + { + "epoch": 0.33858708696695733, + "grad_norm": 14.275876651480186, + "learning_rate": 1.6728858008115101e-06, + "loss": -0.0626, + "step": 3174 + }, + { + "epoch": 0.3388004373683228, + "grad_norm": 6.235409545013611, + "learning_rate": 1.6723346465210815e-06, + "loss": -0.1395, + "step": 3176 + }, + { + "epoch": 0.33901378776968827, + "grad_norm": 4.493638107958904, + "learning_rate": 1.6717831192622286e-06, + "loss": -0.5898, + "step": 3178 + }, + { + "epoch": 0.3392271381710537, + "grad_norm": 9.193460287308593, + "learning_rate": 1.6712312193409029e-06, + "loss": 0.0325, + "step": 3180 + }, + { + "epoch": 0.33944048857241915, + "grad_norm": 8.243090932693868, + "learning_rate": 1.6706789470632633e-06, + "loss": -0.5602, + "step": 3182 + }, + { + "epoch": 0.3396538389737846, + "grad_norm": 15.407441171168392, + "learning_rate": 1.6701263027356759e-06, + "loss": -0.9466, + "step": 3184 + }, + { + "epoch": 0.33986718937515004, + "grad_norm": 5.577563368416891, + "learning_rate": 1.669573286664712e-06, + "loss": -0.8347, + "step": 3186 + }, + { + "epoch": 0.3400805397765155, + "grad_norm": 9.192850979837617, + "learning_rate": 1.6690198991571503e-06, + "loss": -0.7142, + "step": 3188 + }, + { + "epoch": 0.3402938901778809, + "grad_norm": 13.952691672246821, + "learning_rate": 1.6684661405199742e-06, + "loss": -0.5281, + "step": 3190 + }, + { + "epoch": 0.34050724057924636, + "grad_norm": 10.227030494408895, + "learning_rate": 1.6679120110603742e-06, + "loss": -0.6764, + "step": 3192 + }, + { + "epoch": 0.3407205909806118, + "grad_norm": 10.184273969054853, + "learning_rate": 1.6673575110857456e-06, + "loss": -0.0888, + "step": 3194 + }, + { + "epoch": 0.34093394138197725, + "grad_norm": 8.508826103907639, + "learning_rate": 1.6668026409036903e-06, + "loss": -0.7058, + "step": 3196 + }, + { + "epoch": 0.3411472917833427, + "grad_norm": 3.6582692013912603, + "learning_rate": 1.666247400822014e-06, + "loss": 0.6785, + "step": 3198 + }, + { + "epoch": 0.34136064218470813, + "grad_norm": 50.07178983955498, + "learning_rate": 1.6656917911487294e-06, + "loss": 0.6981, + "step": 3200 + }, + { + "epoch": 0.34157399258607357, + "grad_norm": 9.317190431163114, + "learning_rate": 1.665135812192053e-06, + "loss": -0.8498, + "step": 3202 + }, + { + "epoch": 0.341787342987439, + "grad_norm": 9.105010718053359, + "learning_rate": 1.6645794642604063e-06, + "loss": -0.77, + "step": 3204 + }, + { + "epoch": 0.34200069338880446, + "grad_norm": 6.684264785470964, + "learning_rate": 1.6640227476624163e-06, + "loss": 0.1101, + "step": 3206 + }, + { + "epoch": 0.3422140437901699, + "grad_norm": 3.2699390960272203, + "learning_rate": 1.6634656627069131e-06, + "loss": -0.4581, + "step": 3208 + }, + { + "epoch": 0.34242739419153534, + "grad_norm": 8.29898158822913, + "learning_rate": 1.6629082097029325e-06, + "loss": -0.7177, + "step": 3210 + }, + { + "epoch": 0.3426407445929008, + "grad_norm": 15.834058549559645, + "learning_rate": 1.6623503889597137e-06, + "loss": 0.5422, + "step": 3212 + }, + { + "epoch": 0.3428540949942662, + "grad_norm": 10.6527723276521, + "learning_rate": 1.6617922007867003e-06, + "loss": -0.5952, + "step": 3214 + }, + { + "epoch": 0.34306744539563166, + "grad_norm": 5.199178008937951, + "learning_rate": 1.6612336454935393e-06, + "loss": -0.2615, + "step": 3216 + }, + { + "epoch": 0.3432807957969971, + "grad_norm": 11.953678830913363, + "learning_rate": 1.6606747233900813e-06, + "loss": -0.7082, + "step": 3218 + }, + { + "epoch": 0.34349414619836255, + "grad_norm": 9.972676475817549, + "learning_rate": 1.6601154347863812e-06, + "loss": -0.2687, + "step": 3220 + }, + { + "epoch": 0.343707496599728, + "grad_norm": 3.296744510537866, + "learning_rate": 1.6595557799926967e-06, + "loss": -1.1673, + "step": 3222 + }, + { + "epoch": 0.34392084700109343, + "grad_norm": 2.2481277086933944, + "learning_rate": 1.6589957593194886e-06, + "loss": -0.1472, + "step": 3224 + }, + { + "epoch": 0.3441341974024589, + "grad_norm": 7.691933803561385, + "learning_rate": 1.6584353730774203e-06, + "loss": 0.0742, + "step": 3226 + }, + { + "epoch": 0.3443475478038243, + "grad_norm": 7.919402129162591, + "learning_rate": 1.6578746215773586e-06, + "loss": 0.0566, + "step": 3228 + }, + { + "epoch": 0.34456089820518976, + "grad_norm": 6.07264887843916, + "learning_rate": 1.6573135051303727e-06, + "loss": 0.4135, + "step": 3230 + }, + { + "epoch": 0.3447742486065552, + "grad_norm": 8.616506113048915, + "learning_rate": 1.6567520240477343e-06, + "loss": -1.0616, + "step": 3232 + }, + { + "epoch": 0.34498759900792064, + "grad_norm": 6.659203825728373, + "learning_rate": 1.6561901786409173e-06, + "loss": -0.6624, + "step": 3234 + }, + { + "epoch": 0.3452009494092861, + "grad_norm": 3.559671460403945, + "learning_rate": 1.655627969221598e-06, + "loss": -0.3419, + "step": 3236 + }, + { + "epoch": 0.3454142998106515, + "grad_norm": 7.658360779453722, + "learning_rate": 1.6550653961016536e-06, + "loss": -0.1534, + "step": 3238 + }, + { + "epoch": 0.34562765021201697, + "grad_norm": 6.667011024583781, + "learning_rate": 1.6545024595931646e-06, + "loss": 0.3198, + "step": 3240 + }, + { + "epoch": 0.3458410006133824, + "grad_norm": 4.533305460840186, + "learning_rate": 1.653939160008412e-06, + "loss": -0.8003, + "step": 3242 + }, + { + "epoch": 0.34605435101474785, + "grad_norm": 14.1568867597931, + "learning_rate": 1.6533754976598786e-06, + "loss": -0.9882, + "step": 3244 + }, + { + "epoch": 0.3462677014161133, + "grad_norm": 6.573248422723091, + "learning_rate": 1.6528114728602485e-06, + "loss": -0.3088, + "step": 3246 + }, + { + "epoch": 0.34648105181747874, + "grad_norm": 6.70146625377648, + "learning_rate": 1.652247085922406e-06, + "loss": -0.3485, + "step": 3248 + }, + { + "epoch": 0.3466944022188442, + "grad_norm": 4.840094253869335, + "learning_rate": 1.6516823371594378e-06, + "loss": -0.2964, + "step": 3250 + }, + { + "epoch": 0.3469077526202096, + "grad_norm": 10.9588765442189, + "learning_rate": 1.6511172268846306e-06, + "loss": -0.2775, + "step": 3252 + }, + { + "epoch": 0.34712110302157506, + "grad_norm": 4.053428924572115, + "learning_rate": 1.650551755411471e-06, + "loss": -0.3772, + "step": 3254 + }, + { + "epoch": 0.3473344534229405, + "grad_norm": 7.094856156106697, + "learning_rate": 1.6499859230536466e-06, + "loss": -1.1553, + "step": 3256 + }, + { + "epoch": 0.34754780382430595, + "grad_norm": 3.43896478541135, + "learning_rate": 1.6494197301250456e-06, + "loss": -0.0546, + "step": 3258 + }, + { + "epoch": 0.3477611542256714, + "grad_norm": 11.504417289332123, + "learning_rate": 1.648853176939755e-06, + "loss": -0.1689, + "step": 3260 + }, + { + "epoch": 0.34797450462703683, + "grad_norm": 25.22406223239814, + "learning_rate": 1.6482862638120636e-06, + "loss": -1.1216, + "step": 3262 + }, + { + "epoch": 0.34818785502840227, + "grad_norm": 9.450253928884585, + "learning_rate": 1.6477189910564578e-06, + "loss": -0.063, + "step": 3264 + }, + { + "epoch": 0.3484012054297677, + "grad_norm": 5.814005397017966, + "learning_rate": 1.6471513589876246e-06, + "loss": -0.7116, + "step": 3266 + }, + { + "epoch": 0.34861455583113315, + "grad_norm": 8.392174198135338, + "learning_rate": 1.64658336792045e-06, + "loss": -0.5563, + "step": 3268 + }, + { + "epoch": 0.3488279062324986, + "grad_norm": 21.901568207034494, + "learning_rate": 1.646015018170019e-06, + "loss": 0.1632, + "step": 3270 + }, + { + "epoch": 0.34904125663386404, + "grad_norm": 12.837464898303143, + "learning_rate": 1.6454463100516165e-06, + "loss": -0.3298, + "step": 3272 + }, + { + "epoch": 0.3492546070352295, + "grad_norm": 9.60302117457049, + "learning_rate": 1.644877243880725e-06, + "loss": 0.1415, + "step": 3274 + }, + { + "epoch": 0.3494679574365949, + "grad_norm": 3.105706601626303, + "learning_rate": 1.6443078199730267e-06, + "loss": -0.9318, + "step": 3276 + }, + { + "epoch": 0.34968130783796036, + "grad_norm": 5.433537516046655, + "learning_rate": 1.6437380386444012e-06, + "loss": -0.1564, + "step": 3278 + }, + { + "epoch": 0.3498946582393258, + "grad_norm": 3.0772488527471125, + "learning_rate": 1.6431679002109271e-06, + "loss": -0.7156, + "step": 3280 + }, + { + "epoch": 0.35010800864069125, + "grad_norm": 11.720944197843869, + "learning_rate": 1.6425974049888805e-06, + "loss": -0.9968, + "step": 3282 + }, + { + "epoch": 0.3503213590420567, + "grad_norm": 6.353758107657624, + "learning_rate": 1.6420265532947361e-06, + "loss": -0.3491, + "step": 3284 + }, + { + "epoch": 0.35053470944342213, + "grad_norm": 5.107652273276747, + "learning_rate": 1.6414553454451662e-06, + "loss": -0.9469, + "step": 3286 + }, + { + "epoch": 0.3507480598447876, + "grad_norm": 12.053008812395598, + "learning_rate": 1.6408837817570407e-06, + "loss": -0.7691, + "step": 3288 + }, + { + "epoch": 0.350961410246153, + "grad_norm": 18.287951932301283, + "learning_rate": 1.6403118625474262e-06, + "loss": -0.5909, + "step": 3290 + }, + { + "epoch": 0.35117476064751846, + "grad_norm": 12.274851709637097, + "learning_rate": 1.6397395881335872e-06, + "loss": -0.9908, + "step": 3292 + }, + { + "epoch": 0.3513881110488839, + "grad_norm": 5.481314287135517, + "learning_rate": 1.6391669588329848e-06, + "loss": -0.4183, + "step": 3294 + }, + { + "epoch": 0.35160146145024934, + "grad_norm": 14.283839503281236, + "learning_rate": 1.6385939749632779e-06, + "loss": -0.1061, + "step": 3296 + }, + { + "epoch": 0.3518148118516148, + "grad_norm": 10.114179175546134, + "learning_rate": 1.638020636842321e-06, + "loss": -0.311, + "step": 3298 + }, + { + "epoch": 0.3520281622529802, + "grad_norm": 6.83958221344994, + "learning_rate": 1.6374469447881657e-06, + "loss": -0.482, + "step": 3300 + }, + { + "epoch": 0.35224151265434567, + "grad_norm": 9.148134252160519, + "learning_rate": 1.6368728991190598e-06, + "loss": -0.4335, + "step": 3302 + }, + { + "epoch": 0.3524548630557111, + "grad_norm": 5.747265126580517, + "learning_rate": 1.6362985001534472e-06, + "loss": 0.1102, + "step": 3304 + }, + { + "epoch": 0.35266821345707655, + "grad_norm": 7.651276190447906, + "learning_rate": 1.6357237482099683e-06, + "loss": -0.3333, + "step": 3306 + }, + { + "epoch": 0.352881563858442, + "grad_norm": 10.001990918446518, + "learning_rate": 1.6351486436074577e-06, + "loss": -0.2229, + "step": 3308 + }, + { + "epoch": 0.35309491425980744, + "grad_norm": 2.955700646406898, + "learning_rate": 1.6345731866649479e-06, + "loss": -1.2758, + "step": 3310 + }, + { + "epoch": 0.3533082646611729, + "grad_norm": 12.879838559987562, + "learning_rate": 1.6339973777016654e-06, + "loss": -0.4236, + "step": 3312 + }, + { + "epoch": 0.3535216150625383, + "grad_norm": 8.88090044689332, + "learning_rate": 1.6334212170370322e-06, + "loss": 0.5514, + "step": 3314 + }, + { + "epoch": 0.35373496546390376, + "grad_norm": 7.4614760049213515, + "learning_rate": 1.6328447049906658e-06, + "loss": 0.0489, + "step": 3316 + }, + { + "epoch": 0.3539483158652692, + "grad_norm": 15.482415320656317, + "learning_rate": 1.6322678418823779e-06, + "loss": -0.1407, + "step": 3318 + }, + { + "epoch": 0.35416166626663464, + "grad_norm": 10.922671938745687, + "learning_rate": 1.6316906280321759e-06, + "loss": 0.3056, + "step": 3320 + }, + { + "epoch": 0.3543750166680001, + "grad_norm": 12.66048668550555, + "learning_rate": 1.631113063760261e-06, + "loss": -0.0076, + "step": 3322 + }, + { + "epoch": 0.35458836706936553, + "grad_norm": 14.778403143371358, + "learning_rate": 1.630535149387029e-06, + "loss": -1.0884, + "step": 3324 + }, + { + "epoch": 0.35480171747073097, + "grad_norm": 6.060286297100181, + "learning_rate": 1.62995688523307e-06, + "loss": 0.6083, + "step": 3326 + }, + { + "epoch": 0.3550150678720964, + "grad_norm": 2.9873060655036667, + "learning_rate": 1.6293782716191685e-06, + "loss": -0.3643, + "step": 3328 + }, + { + "epoch": 0.35522841827346185, + "grad_norm": 7.904394666540456, + "learning_rate": 1.628799308866302e-06, + "loss": -0.7756, + "step": 3330 + }, + { + "epoch": 0.3554417686748273, + "grad_norm": 32.22355480070881, + "learning_rate": 1.6282199972956424e-06, + "loss": -1.119, + "step": 3332 + }, + { + "epoch": 0.35565511907619274, + "grad_norm": 8.398125294491482, + "learning_rate": 1.6276403372285545e-06, + "loss": 0.0212, + "step": 3334 + }, + { + "epoch": 0.3558684694775582, + "grad_norm": 2.224857084294324, + "learning_rate": 1.6270603289865972e-06, + "loss": -0.0091, + "step": 3336 + }, + { + "epoch": 0.3560818198789236, + "grad_norm": 8.895828273807302, + "learning_rate": 1.6264799728915217e-06, + "loss": -0.5476, + "step": 3338 + }, + { + "epoch": 0.35629517028028906, + "grad_norm": 10.79631783716361, + "learning_rate": 1.6258992692652733e-06, + "loss": -0.0367, + "step": 3340 + }, + { + "epoch": 0.3565085206816545, + "grad_norm": 12.501126415213596, + "learning_rate": 1.6253182184299888e-06, + "loss": -0.6547, + "step": 3342 + }, + { + "epoch": 0.35672187108301995, + "grad_norm": 14.260599484860318, + "learning_rate": 1.6247368207079978e-06, + "loss": -0.836, + "step": 3344 + }, + { + "epoch": 0.3569352214843854, + "grad_norm": 5.513527804410655, + "learning_rate": 1.6241550764218236e-06, + "loss": -0.765, + "step": 3346 + }, + { + "epoch": 0.3571485718857509, + "grad_norm": 19.174380310031356, + "learning_rate": 1.6235729858941807e-06, + "loss": -0.4087, + "step": 3348 + }, + { + "epoch": 0.35736192228711633, + "grad_norm": 3.8157904515151397, + "learning_rate": 1.622990549447975e-06, + "loss": 0.1276, + "step": 3350 + }, + { + "epoch": 0.35757527268848177, + "grad_norm": 5.205093424943594, + "learning_rate": 1.622407767406306e-06, + "loss": 0.0283, + "step": 3352 + }, + { + "epoch": 0.3577886230898472, + "grad_norm": 5.999425278986083, + "learning_rate": 1.621824640092464e-06, + "loss": 0.6823, + "step": 3354 + }, + { + "epoch": 0.35800197349121265, + "grad_norm": 26.847681866408717, + "learning_rate": 1.6212411678299303e-06, + "loss": 0.5556, + "step": 3356 + }, + { + "epoch": 0.3582153238925781, + "grad_norm": 6.5764404925116, + "learning_rate": 1.6206573509423787e-06, + "loss": -0.6195, + "step": 3358 + }, + { + "epoch": 0.35842867429394354, + "grad_norm": 5.796268572403085, + "learning_rate": 1.6200731897536732e-06, + "loss": -0.4747, + "step": 3360 + }, + { + "epoch": 0.358642024695309, + "grad_norm": 20.25277587232032, + "learning_rate": 1.6194886845878698e-06, + "loss": 0.2142, + "step": 3362 + }, + { + "epoch": 0.3588553750966744, + "grad_norm": 4.922303897477237, + "learning_rate": 1.6189038357692138e-06, + "loss": -0.4065, + "step": 3364 + }, + { + "epoch": 0.35906872549803986, + "grad_norm": 12.102687658469536, + "learning_rate": 1.618318643622143e-06, + "loss": 0.0467, + "step": 3366 + }, + { + "epoch": 0.3592820758994053, + "grad_norm": 5.634496017022963, + "learning_rate": 1.6177331084712843e-06, + "loss": -0.8244, + "step": 3368 + }, + { + "epoch": 0.35949542630077075, + "grad_norm": 6.531509936175245, + "learning_rate": 1.6171472306414553e-06, + "loss": 0.3863, + "step": 3370 + }, + { + "epoch": 0.3597087767021362, + "grad_norm": 7.749608930081726, + "learning_rate": 1.6165610104576635e-06, + "loss": -0.4161, + "step": 3372 + }, + { + "epoch": 0.35992212710350163, + "grad_norm": 9.978199358319136, + "learning_rate": 1.6159744482451067e-06, + "loss": 0.225, + "step": 3374 + }, + { + "epoch": 0.3601354775048671, + "grad_norm": 2.8180065713964315, + "learning_rate": 1.6153875443291724e-06, + "loss": -0.5235, + "step": 3376 + }, + { + "epoch": 0.3603488279062325, + "grad_norm": 3.2536766750339687, + "learning_rate": 1.6148002990354372e-06, + "loss": -0.8102, + "step": 3378 + }, + { + "epoch": 0.36056217830759796, + "grad_norm": 6.776642109066481, + "learning_rate": 1.6142127126896679e-06, + "loss": -0.6258, + "step": 3380 + }, + { + "epoch": 0.3607755287089634, + "grad_norm": 9.191311745649323, + "learning_rate": 1.6136247856178194e-06, + "loss": 0.0416, + "step": 3382 + }, + { + "epoch": 0.36098887911032884, + "grad_norm": 5.958214276863685, + "learning_rate": 1.6130365181460359e-06, + "loss": -0.3925, + "step": 3384 + }, + { + "epoch": 0.3612022295116943, + "grad_norm": 18.17735136986778, + "learning_rate": 1.6124479106006518e-06, + "loss": -0.3844, + "step": 3386 + }, + { + "epoch": 0.3614155799130597, + "grad_norm": 6.11819395273999, + "learning_rate": 1.6118589633081884e-06, + "loss": -0.2621, + "step": 3388 + }, + { + "epoch": 0.36162893031442517, + "grad_norm": 9.532671676915985, + "learning_rate": 1.6112696765953558e-06, + "loss": 0.2025, + "step": 3390 + }, + { + "epoch": 0.3618422807157906, + "grad_norm": 8.516902754099748, + "learning_rate": 1.6106800507890533e-06, + "loss": -0.4136, + "step": 3392 + }, + { + "epoch": 0.36205563111715605, + "grad_norm": 10.533268296057837, + "learning_rate": 1.6100900862163675e-06, + "loss": -0.2169, + "step": 3394 + }, + { + "epoch": 0.3622689815185215, + "grad_norm": 64.64111511125043, + "learning_rate": 1.6094997832045732e-06, + "loss": -0.0717, + "step": 3396 + }, + { + "epoch": 0.36248233191988694, + "grad_norm": 5.2985530690025415, + "learning_rate": 1.608909142081133e-06, + "loss": -0.2195, + "step": 3398 + }, + { + "epoch": 0.3626956823212524, + "grad_norm": 13.76013579379519, + "learning_rate": 1.6083181631736969e-06, + "loss": -0.1999, + "step": 3400 + }, + { + "epoch": 0.3629090327226178, + "grad_norm": 19.488232964816568, + "learning_rate": 1.607726846810102e-06, + "loss": -1.0721, + "step": 3402 + }, + { + "epoch": 0.36312238312398326, + "grad_norm": 7.976673902612208, + "learning_rate": 1.6071351933183731e-06, + "loss": -0.9085, + "step": 3404 + }, + { + "epoch": 0.3633357335253487, + "grad_norm": 18.051012964378337, + "learning_rate": 1.6065432030267227e-06, + "loss": -0.2483, + "step": 3406 + }, + { + "epoch": 0.36354908392671414, + "grad_norm": 7.984821072469964, + "learning_rate": 1.605950876263548e-06, + "loss": 0.2467, + "step": 3408 + }, + { + "epoch": 0.3637624343280796, + "grad_norm": 10.471399932609952, + "learning_rate": 1.605358213357435e-06, + "loss": -0.1, + "step": 3410 + }, + { + "epoch": 0.36397578472944503, + "grad_norm": 11.80819922706443, + "learning_rate": 1.604765214637155e-06, + "loss": -0.0843, + "step": 3412 + }, + { + "epoch": 0.36418913513081047, + "grad_norm": 12.99142554932335, + "learning_rate": 1.604171880431666e-06, + "loss": -0.7969, + "step": 3414 + }, + { + "epoch": 0.3644024855321759, + "grad_norm": 10.309151636626616, + "learning_rate": 1.6035782110701121e-06, + "loss": 0.5247, + "step": 3416 + }, + { + "epoch": 0.36461583593354135, + "grad_norm": 8.438129063702482, + "learning_rate": 1.602984206881823e-06, + "loss": -1.1092, + "step": 3418 + }, + { + "epoch": 0.3648291863349068, + "grad_norm": 7.858614548648805, + "learning_rate": 1.6023898681963145e-06, + "loss": 0.3757, + "step": 3420 + }, + { + "epoch": 0.36504253673627224, + "grad_norm": 9.120078858273622, + "learning_rate": 1.6017951953432877e-06, + "loss": 0.8939, + "step": 3422 + }, + { + "epoch": 0.3652558871376377, + "grad_norm": 12.772956727147955, + "learning_rate": 1.6012001886526294e-06, + "loss": 0.0471, + "step": 3424 + }, + { + "epoch": 0.3654692375390031, + "grad_norm": 12.189366055513569, + "learning_rate": 1.6006048484544116e-06, + "loss": -1.8708, + "step": 3426 + }, + { + "epoch": 0.36568258794036856, + "grad_norm": 7.407081580665367, + "learning_rate": 1.6000091750788904e-06, + "loss": -1.1229, + "step": 3428 + }, + { + "epoch": 0.365895938341734, + "grad_norm": 4.297411318315459, + "learning_rate": 1.599413168856508e-06, + "loss": -0.9688, + "step": 3430 + }, + { + "epoch": 0.36610928874309945, + "grad_norm": 9.307911661330278, + "learning_rate": 1.598816830117891e-06, + "loss": -0.6191, + "step": 3432 + }, + { + "epoch": 0.3663226391444649, + "grad_norm": 27.250922121454458, + "learning_rate": 1.5982201591938494e-06, + "loss": 0.2745, + "step": 3434 + }, + { + "epoch": 0.36653598954583033, + "grad_norm": 16.54538086629179, + "learning_rate": 1.5976231564153783e-06, + "loss": -1.2112, + "step": 3436 + }, + { + "epoch": 0.3667493399471958, + "grad_norm": 11.673647753424502, + "learning_rate": 1.5970258221136574e-06, + "loss": 0.6495, + "step": 3438 + }, + { + "epoch": 0.3669626903485612, + "grad_norm": 12.52237843700656, + "learning_rate": 1.5964281566200492e-06, + "loss": -0.5357, + "step": 3440 + }, + { + "epoch": 0.36717604074992666, + "grad_norm": 12.179727643506995, + "learning_rate": 1.5958301602661007e-06, + "loss": -0.5334, + "step": 3442 + }, + { + "epoch": 0.3673893911512921, + "grad_norm": 11.180900904415719, + "learning_rate": 1.5952318333835418e-06, + "loss": 0.2141, + "step": 3444 + }, + { + "epoch": 0.36760274155265754, + "grad_norm": 10.203469660283607, + "learning_rate": 1.5946331763042866e-06, + "loss": -1.3314, + "step": 3446 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 11.130946076542475, + "learning_rate": 1.5940341893604315e-06, + "loss": -0.3329, + "step": 3448 + }, + { + "epoch": 0.3680294423553884, + "grad_norm": 10.953595972003322, + "learning_rate": 1.5934348728842566e-06, + "loss": -0.2787, + "step": 3450 + }, + { + "epoch": 0.36824279275675387, + "grad_norm": 9.832463870177506, + "learning_rate": 1.5928352272082244e-06, + "loss": -0.4354, + "step": 3452 + }, + { + "epoch": 0.3684561431581193, + "grad_norm": 7.923232847512963, + "learning_rate": 1.5922352526649801e-06, + "loss": 0.7566, + "step": 3454 + }, + { + "epoch": 0.36866949355948475, + "grad_norm": 5.741520316878696, + "learning_rate": 1.591634949587351e-06, + "loss": -0.0912, + "step": 3456 + }, + { + "epoch": 0.3688828439608502, + "grad_norm": 7.020911708226061, + "learning_rate": 1.5910343183083476e-06, + "loss": 0.3246, + "step": 3458 + }, + { + "epoch": 0.36909619436221563, + "grad_norm": 6.267022787859831, + "learning_rate": 1.5904333591611616e-06, + "loss": -0.117, + "step": 3460 + }, + { + "epoch": 0.3693095447635811, + "grad_norm": 34.925353917565225, + "learning_rate": 1.589832072479167e-06, + "loss": -0.3285, + "step": 3462 + }, + { + "epoch": 0.3695228951649465, + "grad_norm": 16.73258333176238, + "learning_rate": 1.5892304585959191e-06, + "loss": -1.1832, + "step": 3464 + }, + { + "epoch": 0.36973624556631196, + "grad_norm": 5.801640303319302, + "learning_rate": 1.588628517845155e-06, + "loss": -0.6167, + "step": 3466 + }, + { + "epoch": 0.3699495959676774, + "grad_norm": 10.532760453083393, + "learning_rate": 1.588026250560793e-06, + "loss": 0.0305, + "step": 3468 + }, + { + "epoch": 0.37016294636904284, + "grad_norm": 2.63558295079751, + "learning_rate": 1.587423657076933e-06, + "loss": 0.6387, + "step": 3470 + }, + { + "epoch": 0.3703762967704083, + "grad_norm": 7.200152487931937, + "learning_rate": 1.5868207377278547e-06, + "loss": -0.2491, + "step": 3472 + }, + { + "epoch": 0.37058964717177373, + "grad_norm": 45.85795461021694, + "learning_rate": 1.5862174928480203e-06, + "loss": -1.0033, + "step": 3474 + }, + { + "epoch": 0.37080299757313917, + "grad_norm": 11.037217125814097, + "learning_rate": 1.585613922772071e-06, + "loss": -0.6305, + "step": 3476 + }, + { + "epoch": 0.3710163479745046, + "grad_norm": 10.585015869769725, + "learning_rate": 1.5850100278348294e-06, + "loss": -0.5404, + "step": 3478 + }, + { + "epoch": 0.37122969837587005, + "grad_norm": 6.680797046571585, + "learning_rate": 1.5844058083712977e-06, + "loss": -0.5089, + "step": 3480 + }, + { + "epoch": 0.3714430487772355, + "grad_norm": 17.99898717642397, + "learning_rate": 1.5838012647166582e-06, + "loss": -0.273, + "step": 3482 + }, + { + "epoch": 0.37165639917860094, + "grad_norm": 5.311135747205808, + "learning_rate": 1.5831963972062732e-06, + "loss": 0.3601, + "step": 3484 + }, + { + "epoch": 0.3718697495799664, + "grad_norm": 9.622669818998512, + "learning_rate": 1.5825912061756852e-06, + "loss": -0.3873, + "step": 3486 + }, + { + "epoch": 0.3720830999813318, + "grad_norm": 9.050721492536255, + "learning_rate": 1.5819856919606151e-06, + "loss": -0.602, + "step": 3488 + }, + { + "epoch": 0.37229645038269726, + "grad_norm": 11.402923468211684, + "learning_rate": 1.5813798548969638e-06, + "loss": -0.2176, + "step": 3490 + }, + { + "epoch": 0.3725098007840627, + "grad_norm": 5.5881954810188175, + "learning_rate": 1.5807736953208113e-06, + "loss": -1.464, + "step": 3492 + }, + { + "epoch": 0.37272315118542815, + "grad_norm": 14.204231676820294, + "learning_rate": 1.5801672135684157e-06, + "loss": 0.18, + "step": 3494 + }, + { + "epoch": 0.3729365015867936, + "grad_norm": 12.884869367269026, + "learning_rate": 1.5795604099762152e-06, + "loss": 0.0789, + "step": 3496 + }, + { + "epoch": 0.37314985198815903, + "grad_norm": 4.278821945309066, + "learning_rate": 1.5789532848808249e-06, + "loss": -0.8066, + "step": 3498 + }, + { + "epoch": 0.3733632023895245, + "grad_norm": 13.74044439763939, + "learning_rate": 1.57834583861904e-06, + "loss": -0.6557, + "step": 3500 + }, + { + "epoch": 0.3735765527908899, + "grad_norm": 3.377067553666443, + "learning_rate": 1.5777380715278325e-06, + "loss": -1.0924, + "step": 3502 + }, + { + "epoch": 0.37378990319225536, + "grad_norm": 12.752440625509362, + "learning_rate": 1.5771299839443534e-06, + "loss": 0.1376, + "step": 3504 + }, + { + "epoch": 0.3740032535936208, + "grad_norm": 6.298250318577893, + "learning_rate": 1.57652157620593e-06, + "loss": -0.1212, + "step": 3506 + }, + { + "epoch": 0.37421660399498624, + "grad_norm": 4.14395665135049, + "learning_rate": 1.5759128486500686e-06, + "loss": -0.3851, + "step": 3508 + }, + { + "epoch": 0.3744299543963517, + "grad_norm": 13.00778898154499, + "learning_rate": 1.5753038016144528e-06, + "loss": -0.9167, + "step": 3510 + }, + { + "epoch": 0.3746433047977171, + "grad_norm": 7.257613170103919, + "learning_rate": 1.5746944354369425e-06, + "loss": -0.1856, + "step": 3512 + }, + { + "epoch": 0.37485665519908257, + "grad_norm": 14.591500222590538, + "learning_rate": 1.5740847504555759e-06, + "loss": 0.0322, + "step": 3514 + }, + { + "epoch": 0.375070005600448, + "grad_norm": 8.693674155080066, + "learning_rate": 1.573474747008567e-06, + "loss": -0.0929, + "step": 3516 + }, + { + "epoch": 0.37528335600181345, + "grad_norm": 8.046570829106535, + "learning_rate": 1.5728644254343066e-06, + "loss": -0.4738, + "step": 3518 + }, + { + "epoch": 0.37549670640317895, + "grad_norm": 11.933628426727779, + "learning_rate": 1.5722537860713626e-06, + "loss": -0.6178, + "step": 3520 + }, + { + "epoch": 0.3757100568045444, + "grad_norm": 14.8034557688994, + "learning_rate": 1.5716428292584786e-06, + "loss": -0.6019, + "step": 3522 + }, + { + "epoch": 0.37592340720590983, + "grad_norm": 4.743831691169828, + "learning_rate": 1.5710315553345747e-06, + "loss": 0.4648, + "step": 3524 + }, + { + "epoch": 0.3761367576072753, + "grad_norm": 49.150973280612675, + "learning_rate": 1.5704199646387467e-06, + "loss": -0.5584, + "step": 3526 + }, + { + "epoch": 0.3763501080086407, + "grad_norm": 4.000421610824277, + "learning_rate": 1.569808057510266e-06, + "loss": -0.6094, + "step": 3528 + }, + { + "epoch": 0.37656345841000616, + "grad_norm": 7.849882081256847, + "learning_rate": 1.56919583428858e-06, + "loss": -0.8505, + "step": 3530 + }, + { + "epoch": 0.3767768088113716, + "grad_norm": 5.415205477680958, + "learning_rate": 1.5685832953133107e-06, + "loss": -0.2044, + "step": 3532 + }, + { + "epoch": 0.37699015921273704, + "grad_norm": 8.42177117657776, + "learning_rate": 1.567970440924256e-06, + "loss": 0.3354, + "step": 3534 + }, + { + "epoch": 0.3772035096141025, + "grad_norm": 7.278239670224589, + "learning_rate": 1.5673572714613885e-06, + "loss": 0.0115, + "step": 3536 + }, + { + "epoch": 0.3774168600154679, + "grad_norm": 6.71628273258775, + "learning_rate": 1.5667437872648552e-06, + "loss": -0.533, + "step": 3538 + }, + { + "epoch": 0.37763021041683337, + "grad_norm": 5.8085723743734965, + "learning_rate": 1.5661299886749784e-06, + "loss": -0.2473, + "step": 3540 + }, + { + "epoch": 0.3778435608181988, + "grad_norm": 4.148729159587339, + "learning_rate": 1.5655158760322542e-06, + "loss": -0.21, + "step": 3542 + }, + { + "epoch": 0.37805691121956425, + "grad_norm": 22.590137427651552, + "learning_rate": 1.5649014496773533e-06, + "loss": -0.0537, + "step": 3544 + }, + { + "epoch": 0.3782702616209297, + "grad_norm": 8.524955553431015, + "learning_rate": 1.5642867099511206e-06, + "loss": -1.2622, + "step": 3546 + }, + { + "epoch": 0.37848361202229514, + "grad_norm": 5.403875387316923, + "learning_rate": 1.563671657194574e-06, + "loss": 0.4265, + "step": 3548 + }, + { + "epoch": 0.3786969624236606, + "grad_norm": 5.182485383323739, + "learning_rate": 1.563056291748906e-06, + "loss": -0.2579, + "step": 3550 + }, + { + "epoch": 0.378910312825026, + "grad_norm": 8.877747655016213, + "learning_rate": 1.5624406139554814e-06, + "loss": 0.6247, + "step": 3552 + }, + { + "epoch": 0.37912366322639146, + "grad_norm": 5.866965769599405, + "learning_rate": 1.5618246241558399e-06, + "loss": -0.5452, + "step": 3554 + }, + { + "epoch": 0.3793370136277569, + "grad_norm": 5.371396413251093, + "learning_rate": 1.5612083226916931e-06, + "loss": -0.0421, + "step": 3556 + }, + { + "epoch": 0.37955036402912234, + "grad_norm": 7.19459581735369, + "learning_rate": 1.560591709904926e-06, + "loss": -0.386, + "step": 3558 + }, + { + "epoch": 0.3797637144304878, + "grad_norm": 20.528980488408273, + "learning_rate": 1.5599747861375953e-06, + "loss": -0.3766, + "step": 3560 + }, + { + "epoch": 0.37997706483185323, + "grad_norm": 10.336748668733522, + "learning_rate": 1.5593575517319319e-06, + "loss": -0.3529, + "step": 3562 + }, + { + "epoch": 0.38019041523321867, + "grad_norm": 4.5443685266820175, + "learning_rate": 1.5587400070303375e-06, + "loss": 0.3915, + "step": 3564 + }, + { + "epoch": 0.3804037656345841, + "grad_norm": 19.179097866668833, + "learning_rate": 1.5581221523753868e-06, + "loss": 0.8604, + "step": 3566 + }, + { + "epoch": 0.38061711603594955, + "grad_norm": 11.552700669225064, + "learning_rate": 1.5575039881098265e-06, + "loss": 0.3069, + "step": 3568 + }, + { + "epoch": 0.380830466437315, + "grad_norm": 3.9783674546263494, + "learning_rate": 1.556885514576574e-06, + "loss": -1.3345, + "step": 3570 + }, + { + "epoch": 0.38104381683868044, + "grad_norm": 7.633657705964115, + "learning_rate": 1.5562667321187198e-06, + "loss": -0.4604, + "step": 3572 + }, + { + "epoch": 0.3812571672400459, + "grad_norm": 5.769370759422934, + "learning_rate": 1.5556476410795244e-06, + "loss": 0.3274, + "step": 3574 + }, + { + "epoch": 0.3814705176414113, + "grad_norm": 7.391582133537034, + "learning_rate": 1.5550282418024206e-06, + "loss": -0.0919, + "step": 3576 + }, + { + "epoch": 0.38168386804277676, + "grad_norm": 12.389887728087002, + "learning_rate": 1.5544085346310109e-06, + "loss": 0.3406, + "step": 3578 + }, + { + "epoch": 0.3818972184441422, + "grad_norm": 5.277084032347067, + "learning_rate": 1.5537885199090703e-06, + "loss": -0.6772, + "step": 3580 + }, + { + "epoch": 0.38211056884550765, + "grad_norm": 7.562116521804289, + "learning_rate": 1.5531681979805427e-06, + "loss": -1.202, + "step": 3582 + }, + { + "epoch": 0.3823239192468731, + "grad_norm": 9.691369413117519, + "learning_rate": 1.5525475691895436e-06, + "loss": -0.1949, + "step": 3584 + }, + { + "epoch": 0.38253726964823853, + "grad_norm": 31.840374293070614, + "learning_rate": 1.5519266338803578e-06, + "loss": -0.6753, + "step": 3586 + }, + { + "epoch": 0.382750620049604, + "grad_norm": 7.356390177701568, + "learning_rate": 1.5513053923974416e-06, + "loss": -0.1972, + "step": 3588 + }, + { + "epoch": 0.3829639704509694, + "grad_norm": 6.626997164220199, + "learning_rate": 1.5506838450854192e-06, + "loss": -1.1384, + "step": 3590 + }, + { + "epoch": 0.38317732085233486, + "grad_norm": 9.800195055504844, + "learning_rate": 1.5500619922890858e-06, + "loss": -0.2405, + "step": 3592 + }, + { + "epoch": 0.3833906712537003, + "grad_norm": 4.871044229269088, + "learning_rate": 1.5494398343534059e-06, + "loss": -0.2559, + "step": 3594 + }, + { + "epoch": 0.38360402165506574, + "grad_norm": 13.889942664864302, + "learning_rate": 1.548817371623513e-06, + "loss": -0.0779, + "step": 3596 + }, + { + "epoch": 0.3838173720564312, + "grad_norm": 3.875472882140023, + "learning_rate": 1.5481946044447098e-06, + "loss": -0.1772, + "step": 3598 + }, + { + "epoch": 0.3840307224577966, + "grad_norm": 4.588414315068769, + "learning_rate": 1.5475715331624675e-06, + "loss": -0.3885, + "step": 3600 + }, + { + "epoch": 0.38424407285916207, + "grad_norm": 8.776223512204368, + "learning_rate": 1.5469481581224271e-06, + "loss": -0.9852, + "step": 3602 + }, + { + "epoch": 0.3844574232605275, + "grad_norm": 8.918997021245008, + "learning_rate": 1.5463244796703964e-06, + "loss": -0.104, + "step": 3604 + }, + { + "epoch": 0.38467077366189295, + "grad_norm": 4.821139814081166, + "learning_rate": 1.5457004981523526e-06, + "loss": -0.1127, + "step": 3606 + }, + { + "epoch": 0.3848841240632584, + "grad_norm": 6.848927030004704, + "learning_rate": 1.545076213914442e-06, + "loss": -0.1438, + "step": 3608 + }, + { + "epoch": 0.38509747446462383, + "grad_norm": 7.271133623491892, + "learning_rate": 1.5444516273029767e-06, + "loss": -0.7004, + "step": 3610 + }, + { + "epoch": 0.3853108248659893, + "grad_norm": 4.810511338969127, + "learning_rate": 1.5438267386644378e-06, + "loss": -0.2117, + "step": 3612 + }, + { + "epoch": 0.3855241752673547, + "grad_norm": 10.917959202064116, + "learning_rate": 1.5432015483454734e-06, + "loss": -0.1434, + "step": 3614 + }, + { + "epoch": 0.38573752566872016, + "grad_norm": 8.293317355645408, + "learning_rate": 1.5425760566929e-06, + "loss": -0.124, + "step": 3616 + }, + { + "epoch": 0.3859508760700856, + "grad_norm": 17.55127517912104, + "learning_rate": 1.5419502640536996e-06, + "loss": -0.7544, + "step": 3618 + }, + { + "epoch": 0.38616422647145104, + "grad_norm": 8.173118929424952, + "learning_rate": 1.5413241707750228e-06, + "loss": 0.3597, + "step": 3620 + }, + { + "epoch": 0.3863775768728165, + "grad_norm": 35.60294675727771, + "learning_rate": 1.540697777204186e-06, + "loss": -0.474, + "step": 3622 + }, + { + "epoch": 0.38659092727418193, + "grad_norm": 6.589839122065532, + "learning_rate": 1.5400710836886726e-06, + "loss": 0.2631, + "step": 3624 + }, + { + "epoch": 0.38680427767554737, + "grad_norm": 4.936850894975371, + "learning_rate": 1.539444090576132e-06, + "loss": -0.1817, + "step": 3626 + }, + { + "epoch": 0.3870176280769128, + "grad_norm": 14.697991955454372, + "learning_rate": 1.53881679821438e-06, + "loss": 0.8469, + "step": 3628 + }, + { + "epoch": 0.38723097847827825, + "grad_norm": 10.621792396410738, + "learning_rate": 1.5381892069513988e-06, + "loss": -1.0562, + "step": 3630 + }, + { + "epoch": 0.3874443288796437, + "grad_norm": 5.593816089439401, + "learning_rate": 1.5375613171353354e-06, + "loss": 0.5602, + "step": 3632 + }, + { + "epoch": 0.38765767928100914, + "grad_norm": 9.809925542721066, + "learning_rate": 1.5369331291145039e-06, + "loss": -0.6302, + "step": 3634 + }, + { + "epoch": 0.3878710296823746, + "grad_norm": 5.67511602320479, + "learning_rate": 1.5363046432373823e-06, + "loss": -0.4808, + "step": 3636 + }, + { + "epoch": 0.38808438008374, + "grad_norm": 21.254274256029046, + "learning_rate": 1.535675859852615e-06, + "loss": -0.6095, + "step": 3638 + }, + { + "epoch": 0.38829773048510546, + "grad_norm": 8.836941051384738, + "learning_rate": 1.5350467793090106e-06, + "loss": -0.5257, + "step": 3640 + }, + { + "epoch": 0.3885110808864709, + "grad_norm": 16.115766147179784, + "learning_rate": 1.5344174019555432e-06, + "loss": -0.3704, + "step": 3642 + }, + { + "epoch": 0.38872443128783635, + "grad_norm": 8.173292215333689, + "learning_rate": 1.533787728141351e-06, + "loss": -0.619, + "step": 3644 + }, + { + "epoch": 0.3889377816892018, + "grad_norm": 4.145918645434731, + "learning_rate": 1.5331577582157368e-06, + "loss": -0.3127, + "step": 3646 + }, + { + "epoch": 0.38915113209056723, + "grad_norm": 12.75611886468427, + "learning_rate": 1.5325274925281684e-06, + "loss": -0.4764, + "step": 3648 + }, + { + "epoch": 0.3893644824919327, + "grad_norm": 4.161936998346248, + "learning_rate": 1.5318969314282765e-06, + "loss": -1.1281, + "step": 3650 + }, + { + "epoch": 0.3895778328932981, + "grad_norm": 10.006237986567456, + "learning_rate": 1.5312660752658562e-06, + "loss": 0.0385, + "step": 3652 + }, + { + "epoch": 0.38979118329466356, + "grad_norm": 6.323036700283537, + "learning_rate": 1.5306349243908666e-06, + "loss": 0.1731, + "step": 3654 + }, + { + "epoch": 0.390004533696029, + "grad_norm": 9.44266012609835, + "learning_rate": 1.5300034791534296e-06, + "loss": -0.5307, + "step": 3656 + }, + { + "epoch": 0.39021788409739444, + "grad_norm": 8.403390075480871, + "learning_rate": 1.5293717399038316e-06, + "loss": 0.0937, + "step": 3658 + }, + { + "epoch": 0.3904312344987599, + "grad_norm": 11.344009329768848, + "learning_rate": 1.5287397069925199e-06, + "loss": 0.37, + "step": 3660 + }, + { + "epoch": 0.3906445849001253, + "grad_norm": 14.730368480326328, + "learning_rate": 1.5281073807701077e-06, + "loss": -0.5828, + "step": 3662 + }, + { + "epoch": 0.39085793530149077, + "grad_norm": 9.328031275602731, + "learning_rate": 1.527474761587368e-06, + "loss": -0.2643, + "step": 3664 + }, + { + "epoch": 0.3910712857028562, + "grad_norm": 18.286360085749408, + "learning_rate": 1.5268418497952384e-06, + "loss": -0.2819, + "step": 3666 + }, + { + "epoch": 0.39128463610422165, + "grad_norm": 37.07454150517397, + "learning_rate": 1.5262086457448181e-06, + "loss": -0.5526, + "step": 3668 + }, + { + "epoch": 0.3914979865055871, + "grad_norm": 17.536812264474047, + "learning_rate": 1.5255751497873673e-06, + "loss": 0.0052, + "step": 3670 + }, + { + "epoch": 0.39171133690695253, + "grad_norm": 8.506179114189134, + "learning_rate": 1.52494136227431e-06, + "loss": 0.0464, + "step": 3672 + }, + { + "epoch": 0.391924687308318, + "grad_norm": 37.66877069449353, + "learning_rate": 1.5243072835572316e-06, + "loss": 0.0412, + "step": 3674 + }, + { + "epoch": 0.3921380377096834, + "grad_norm": 10.320882226482288, + "learning_rate": 1.5236729139878778e-06, + "loss": -0.7596, + "step": 3676 + }, + { + "epoch": 0.39235138811104886, + "grad_norm": 10.493267744788415, + "learning_rate": 1.5230382539181568e-06, + "loss": 0.2893, + "step": 3678 + }, + { + "epoch": 0.3925647385124143, + "grad_norm": 6.89027641429402, + "learning_rate": 1.5224033037001371e-06, + "loss": -0.0023, + "step": 3680 + }, + { + "epoch": 0.39277808891377974, + "grad_norm": 10.278621986589588, + "learning_rate": 1.5217680636860491e-06, + "loss": -0.3683, + "step": 3682 + }, + { + "epoch": 0.3929914393151452, + "grad_norm": 13.027417616458742, + "learning_rate": 1.5211325342282834e-06, + "loss": -0.3161, + "step": 3684 + }, + { + "epoch": 0.3932047897165106, + "grad_norm": 8.270384897461405, + "learning_rate": 1.5204967156793909e-06, + "loss": -0.2265, + "step": 3686 + }, + { + "epoch": 0.39341814011787607, + "grad_norm": 5.17269067400758, + "learning_rate": 1.5198606083920838e-06, + "loss": -0.1584, + "step": 3688 + }, + { + "epoch": 0.39363149051924157, + "grad_norm": 6.158685297773266, + "learning_rate": 1.5192242127192334e-06, + "loss": -0.4424, + "step": 3690 + }, + { + "epoch": 0.393844840920607, + "grad_norm": 15.105097684401171, + "learning_rate": 1.5185875290138718e-06, + "loss": -0.9077, + "step": 3692 + }, + { + "epoch": 0.39405819132197245, + "grad_norm": 9.234219977875838, + "learning_rate": 1.5179505576291907e-06, + "loss": 0.2225, + "step": 3694 + }, + { + "epoch": 0.3942715417233379, + "grad_norm": 8.438402702275523, + "learning_rate": 1.5173132989185406e-06, + "loss": 0.7342, + "step": 3696 + }, + { + "epoch": 0.39448489212470333, + "grad_norm": 20.074302057850378, + "learning_rate": 1.5166757532354324e-06, + "loss": -0.1696, + "step": 3698 + }, + { + "epoch": 0.3946982425260688, + "grad_norm": 8.43205864465627, + "learning_rate": 1.5160379209335359e-06, + "loss": 0.6239, + "step": 3700 + }, + { + "epoch": 0.3949115929274342, + "grad_norm": 4.847117481685127, + "learning_rate": 1.5153998023666796e-06, + "loss": -0.0262, + "step": 3702 + }, + { + "epoch": 0.39512494332879966, + "grad_norm": 18.794978688243567, + "learning_rate": 1.514761397888851e-06, + "loss": -0.0887, + "step": 3704 + }, + { + "epoch": 0.3953382937301651, + "grad_norm": 13.059955641365022, + "learning_rate": 1.5141227078541969e-06, + "loss": -0.2858, + "step": 3706 + }, + { + "epoch": 0.39555164413153054, + "grad_norm": 5.915296662302059, + "learning_rate": 1.5134837326170212e-06, + "loss": 0.5433, + "step": 3708 + }, + { + "epoch": 0.395764994532896, + "grad_norm": 4.473512792288653, + "learning_rate": 1.5128444725317869e-06, + "loss": -0.3851, + "step": 3710 + }, + { + "epoch": 0.39597834493426143, + "grad_norm": 13.54069040456599, + "learning_rate": 1.5122049279531142e-06, + "loss": -0.0226, + "step": 3712 + }, + { + "epoch": 0.39619169533562687, + "grad_norm": 6.915342754753017, + "learning_rate": 1.5115650992357824e-06, + "loss": -0.0367, + "step": 3714 + }, + { + "epoch": 0.3964050457369923, + "grad_norm": 5.749806765083028, + "learning_rate": 1.5109249867347274e-06, + "loss": 0.5132, + "step": 3716 + }, + { + "epoch": 0.39661839613835775, + "grad_norm": 6.793310731511242, + "learning_rate": 1.510284590805043e-06, + "loss": -0.5096, + "step": 3718 + }, + { + "epoch": 0.3968317465397232, + "grad_norm": 8.354381670352554, + "learning_rate": 1.5096439118019798e-06, + "loss": -0.0896, + "step": 3720 + }, + { + "epoch": 0.39704509694108864, + "grad_norm": 28.348260743108963, + "learning_rate": 1.5090029500809458e-06, + "loss": 0.0119, + "step": 3722 + }, + { + "epoch": 0.3972584473424541, + "grad_norm": 38.71242056236036, + "learning_rate": 1.5083617059975055e-06, + "loss": -0.537, + "step": 3724 + }, + { + "epoch": 0.3974717977438195, + "grad_norm": 14.812596921638383, + "learning_rate": 1.5077201799073809e-06, + "loss": -0.1094, + "step": 3726 + }, + { + "epoch": 0.39768514814518496, + "grad_norm": 3.8786450221280804, + "learning_rate": 1.5070783721664492e-06, + "loss": 0.3986, + "step": 3728 + }, + { + "epoch": 0.3978984985465504, + "grad_norm": 8.813091782698688, + "learning_rate": 1.506436283130745e-06, + "loss": 0.457, + "step": 3730 + }, + { + "epoch": 0.39811184894791585, + "grad_norm": 9.321936249578352, + "learning_rate": 1.5057939131564577e-06, + "loss": 0.2946, + "step": 3732 + }, + { + "epoch": 0.3983251993492813, + "grad_norm": 12.720376552609476, + "learning_rate": 1.5051512625999338e-06, + "loss": -0.8586, + "step": 3734 + }, + { + "epoch": 0.39853854975064673, + "grad_norm": 12.253185977992182, + "learning_rate": 1.5045083318176744e-06, + "loss": -0.7481, + "step": 3736 + }, + { + "epoch": 0.3987519001520122, + "grad_norm": 9.65539300951454, + "learning_rate": 1.5038651211663366e-06, + "loss": 0.1597, + "step": 3738 + }, + { + "epoch": 0.3989652505533776, + "grad_norm": 8.288906758248611, + "learning_rate": 1.5032216310027332e-06, + "loss": 0.5925, + "step": 3740 + }, + { + "epoch": 0.39917860095474306, + "grad_norm": 11.613430773068316, + "learning_rate": 1.5025778616838312e-06, + "loss": -0.2488, + "step": 3742 + }, + { + "epoch": 0.3993919513561085, + "grad_norm": 4.3470597767447225, + "learning_rate": 1.5019338135667528e-06, + "loss": -0.6235, + "step": 3744 + }, + { + "epoch": 0.39960530175747394, + "grad_norm": 8.77142956174303, + "learning_rate": 1.5012894870087748e-06, + "loss": -0.5426, + "step": 3746 + }, + { + "epoch": 0.3998186521588394, + "grad_norm": 6.2140174042811465, + "learning_rate": 1.5006448823673288e-06, + "loss": -0.6785, + "step": 3748 + }, + { + "epoch": 0.4000320025602048, + "grad_norm": 10.278830581090943, + "learning_rate": 1.5e-06, + "loss": -0.6082, + "step": 3750 + }, + { + "epoch": 0.40024535296157027, + "grad_norm": 3.0716642853508156, + "learning_rate": 1.4993548402645283e-06, + "loss": 0.1627, + "step": 3752 + }, + { + "epoch": 0.4004587033629357, + "grad_norm": 8.929308545596351, + "learning_rate": 1.4987094035188071e-06, + "loss": 0.7348, + "step": 3754 + }, + { + "epoch": 0.40067205376430115, + "grad_norm": 9.760478018241823, + "learning_rate": 1.4980636901208836e-06, + "loss": -0.3719, + "step": 3756 + }, + { + "epoch": 0.4008854041656666, + "grad_norm": 12.965383373268924, + "learning_rate": 1.4974177004289585e-06, + "loss": -0.5636, + "step": 3758 + }, + { + "epoch": 0.40109875456703203, + "grad_norm": 7.023628672814459, + "learning_rate": 1.4967714348013859e-06, + "loss": -0.276, + "step": 3760 + }, + { + "epoch": 0.4013121049683975, + "grad_norm": 7.051871244043609, + "learning_rate": 1.4961248935966722e-06, + "loss": -0.2024, + "step": 3762 + }, + { + "epoch": 0.4015254553697629, + "grad_norm": 7.870900793456, + "learning_rate": 1.495478077173478e-06, + "loss": -0.2659, + "step": 3764 + }, + { + "epoch": 0.40173880577112836, + "grad_norm": 10.744294104933509, + "learning_rate": 1.4948309858906153e-06, + "loss": -0.0773, + "step": 3766 + }, + { + "epoch": 0.4019521561724938, + "grad_norm": 8.731722528130536, + "learning_rate": 1.4941836201070496e-06, + "loss": -0.5997, + "step": 3768 + }, + { + "epoch": 0.40216550657385924, + "grad_norm": 9.265364675895867, + "learning_rate": 1.4935359801818976e-06, + "loss": -0.932, + "step": 3770 + }, + { + "epoch": 0.4023788569752247, + "grad_norm": 7.6866034433335, + "learning_rate": 1.4928880664744293e-06, + "loss": -0.14, + "step": 3772 + }, + { + "epoch": 0.4025922073765901, + "grad_norm": 17.183856180408796, + "learning_rate": 1.4922398793440656e-06, + "loss": -1.1887, + "step": 3774 + }, + { + "epoch": 0.40280555777795557, + "grad_norm": 26.055286858529467, + "learning_rate": 1.4915914191503792e-06, + "loss": -0.3363, + "step": 3776 + }, + { + "epoch": 0.403018908179321, + "grad_norm": 24.324981532927723, + "learning_rate": 1.490942686253095e-06, + "loss": 0.0196, + "step": 3778 + }, + { + "epoch": 0.40323225858068645, + "grad_norm": 5.673600661053408, + "learning_rate": 1.4902936810120878e-06, + "loss": 0.2877, + "step": 3780 + }, + { + "epoch": 0.4034456089820519, + "grad_norm": 11.991667794320717, + "learning_rate": 1.4896444037873857e-06, + "loss": -0.5011, + "step": 3782 + }, + { + "epoch": 0.40365895938341734, + "grad_norm": 6.269309766176834, + "learning_rate": 1.4889948549391654e-06, + "loss": -0.3712, + "step": 3784 + }, + { + "epoch": 0.4038723097847828, + "grad_norm": 7.983399761703533, + "learning_rate": 1.4883450348277554e-06, + "loss": 0.2014, + "step": 3786 + }, + { + "epoch": 0.4040856601861482, + "grad_norm": 14.025832973128054, + "learning_rate": 1.4876949438136346e-06, + "loss": 0.4821, + "step": 3788 + }, + { + "epoch": 0.40429901058751366, + "grad_norm": 2.0871101406766797, + "learning_rate": 1.4870445822574318e-06, + "loss": 0.3806, + "step": 3790 + }, + { + "epoch": 0.4045123609888791, + "grad_norm": 15.234242210541096, + "learning_rate": 1.4863939505199268e-06, + "loss": -0.5138, + "step": 3792 + }, + { + "epoch": 0.40472571139024455, + "grad_norm": 3.9791045604321926, + "learning_rate": 1.4857430489620474e-06, + "loss": -0.1733, + "step": 3794 + }, + { + "epoch": 0.40493906179161, + "grad_norm": 14.49322021731263, + "learning_rate": 1.4850918779448738e-06, + "loss": 0.0031, + "step": 3796 + }, + { + "epoch": 0.40515241219297543, + "grad_norm": 6.236022553434043, + "learning_rate": 1.4844404378296332e-06, + "loss": -0.8858, + "step": 3798 + }, + { + "epoch": 0.4053657625943409, + "grad_norm": 10.153159209012589, + "learning_rate": 1.4837887289777033e-06, + "loss": -0.1881, + "step": 3800 + }, + { + "epoch": 0.4055791129957063, + "grad_norm": 22.536762116923256, + "learning_rate": 1.483136751750611e-06, + "loss": -1.0648, + "step": 3802 + }, + { + "epoch": 0.40579246339707176, + "grad_norm": 4.9982402908561285, + "learning_rate": 1.4824845065100313e-06, + "loss": -0.7726, + "step": 3804 + }, + { + "epoch": 0.4060058137984372, + "grad_norm": 15.397036335744595, + "learning_rate": 1.4818319936177884e-06, + "loss": 0.0598, + "step": 3806 + }, + { + "epoch": 0.40621916419980264, + "grad_norm": 12.251552110747216, + "learning_rate": 1.4811792134358548e-06, + "loss": -0.7963, + "step": 3808 + }, + { + "epoch": 0.4064325146011681, + "grad_norm": 8.44138232516171, + "learning_rate": 1.4805261663263517e-06, + "loss": 0.3401, + "step": 3810 + }, + { + "epoch": 0.4066458650025335, + "grad_norm": 13.671347622374185, + "learning_rate": 1.4798728526515477e-06, + "loss": -0.2303, + "step": 3812 + }, + { + "epoch": 0.40685921540389897, + "grad_norm": 9.353688683820609, + "learning_rate": 1.4792192727738595e-06, + "loss": -0.5559, + "step": 3814 + }, + { + "epoch": 0.4070725658052644, + "grad_norm": 4.617017069310191, + "learning_rate": 1.4785654270558523e-06, + "loss": 0.1225, + "step": 3816 + }, + { + "epoch": 0.40728591620662985, + "grad_norm": 15.311881303574632, + "learning_rate": 1.477911315860237e-06, + "loss": -0.1622, + "step": 3818 + }, + { + "epoch": 0.4074992666079953, + "grad_norm": 10.876928677454421, + "learning_rate": 1.4772569395498735e-06, + "loss": -0.2184, + "step": 3820 + }, + { + "epoch": 0.40771261700936073, + "grad_norm": 3.9911845859843993, + "learning_rate": 1.476602298487768e-06, + "loss": 0.9747, + "step": 3822 + }, + { + "epoch": 0.4079259674107262, + "grad_norm": 9.93701348580409, + "learning_rate": 1.4759473930370736e-06, + "loss": -0.4765, + "step": 3824 + }, + { + "epoch": 0.4081393178120916, + "grad_norm": 14.465674124148975, + "learning_rate": 1.4752922235610898e-06, + "loss": 0.0095, + "step": 3826 + }, + { + "epoch": 0.40835266821345706, + "grad_norm": 3.1426750828526617, + "learning_rate": 1.4746367904232635e-06, + "loss": 0.2866, + "step": 3828 + }, + { + "epoch": 0.4085660186148225, + "grad_norm": 3.574490640869594, + "learning_rate": 1.4739810939871867e-06, + "loss": -0.6683, + "step": 3830 + }, + { + "epoch": 0.40877936901618794, + "grad_norm": 11.836480940223721, + "learning_rate": 1.4733251346165985e-06, + "loss": 0.3133, + "step": 3832 + }, + { + "epoch": 0.4089927194175534, + "grad_norm": 8.769075135721405, + "learning_rate": 1.4726689126753828e-06, + "loss": -0.1171, + "step": 3834 + }, + { + "epoch": 0.4092060698189188, + "grad_norm": 13.053457259472992, + "learning_rate": 1.47201242852757e-06, + "loss": -0.4025, + "step": 3836 + }, + { + "epoch": 0.40941942022028427, + "grad_norm": 9.606955022704115, + "learning_rate": 1.4713556825373356e-06, + "loss": -1.0689, + "step": 3838 + }, + { + "epoch": 0.4096327706216497, + "grad_norm": 8.53723841373623, + "learning_rate": 1.470698675069001e-06, + "loss": -1.1057, + "step": 3840 + }, + { + "epoch": 0.40984612102301515, + "grad_norm": 15.854424551363909, + "learning_rate": 1.4700414064870312e-06, + "loss": -0.0192, + "step": 3842 + }, + { + "epoch": 0.4100594714243806, + "grad_norm": 14.110817139843025, + "learning_rate": 1.4693838771560376e-06, + "loss": -0.8178, + "step": 3844 + }, + { + "epoch": 0.41027282182574604, + "grad_norm": 7.841196902924903, + "learning_rate": 1.468726087440775e-06, + "loss": -0.3877, + "step": 3846 + }, + { + "epoch": 0.4104861722271115, + "grad_norm": 13.955756959499775, + "learning_rate": 1.4680680377061436e-06, + "loss": -0.0809, + "step": 3848 + }, + { + "epoch": 0.4106995226284769, + "grad_norm": 11.52339075634261, + "learning_rate": 1.4674097283171874e-06, + "loss": -0.7089, + "step": 3850 + }, + { + "epoch": 0.41091287302984236, + "grad_norm": 13.965589052175874, + "learning_rate": 1.4667511596390947e-06, + "loss": -0.3752, + "step": 3852 + }, + { + "epoch": 0.4111262234312078, + "grad_norm": 6.27228963475403, + "learning_rate": 1.4660923320371972e-06, + "loss": 0.0164, + "step": 3854 + }, + { + "epoch": 0.41133957383257325, + "grad_norm": 10.576482397726632, + "learning_rate": 1.4654332458769702e-06, + "loss": -0.3667, + "step": 3856 + }, + { + "epoch": 0.4115529242339387, + "grad_norm": 7.023585082591441, + "learning_rate": 1.4647739015240337e-06, + "loss": -1.0273, + "step": 3858 + }, + { + "epoch": 0.41176627463530413, + "grad_norm": 5.5920592677708285, + "learning_rate": 1.4641142993441484e-06, + "loss": -0.6201, + "step": 3860 + }, + { + "epoch": 0.4119796250366696, + "grad_norm": 6.062016990721737, + "learning_rate": 1.4634544397032212e-06, + "loss": -0.9741, + "step": 3862 + }, + { + "epoch": 0.41219297543803507, + "grad_norm": 7.07971214310567, + "learning_rate": 1.462794322967299e-06, + "loss": 0.0646, + "step": 3864 + }, + { + "epoch": 0.4124063258394005, + "grad_norm": 8.969742738630517, + "learning_rate": 1.4621339495025728e-06, + "loss": -0.1992, + "step": 3866 + }, + { + "epoch": 0.41261967624076595, + "grad_norm": 19.08417923071099, + "learning_rate": 1.4614733196753764e-06, + "loss": 0.4917, + "step": 3868 + }, + { + "epoch": 0.4128330266421314, + "grad_norm": 5.656505414414432, + "learning_rate": 1.4608124338521841e-06, + "loss": -0.288, + "step": 3870 + }, + { + "epoch": 0.41304637704349684, + "grad_norm": 13.420175280191685, + "learning_rate": 1.460151292399614e-06, + "loss": -0.2037, + "step": 3872 + }, + { + "epoch": 0.4132597274448623, + "grad_norm": 7.280245562681179, + "learning_rate": 1.459489895684425e-06, + "loss": 0.3146, + "step": 3874 + }, + { + "epoch": 0.4134730778462277, + "grad_norm": 6.304564968324813, + "learning_rate": 1.4588282440735172e-06, + "loss": -0.3996, + "step": 3876 + }, + { + "epoch": 0.41368642824759316, + "grad_norm": 12.98346923219324, + "learning_rate": 1.4581663379339343e-06, + "loss": -0.555, + "step": 3878 + }, + { + "epoch": 0.4138997786489586, + "grad_norm": 5.220776220317172, + "learning_rate": 1.4575041776328584e-06, + "loss": -0.2861, + "step": 3880 + }, + { + "epoch": 0.41411312905032405, + "grad_norm": 8.735284959091189, + "learning_rate": 1.4568417635376143e-06, + "loss": -0.7867, + "step": 3882 + }, + { + "epoch": 0.4143264794516895, + "grad_norm": 11.296750650032552, + "learning_rate": 1.4561790960156668e-06, + "loss": -0.1305, + "step": 3884 + }, + { + "epoch": 0.41453982985305493, + "grad_norm": 3.3149405239500584, + "learning_rate": 1.455516175434622e-06, + "loss": 0.2072, + "step": 3886 + }, + { + "epoch": 0.4147531802544204, + "grad_norm": 3.4451619430840887, + "learning_rate": 1.4548530021622259e-06, + "loss": -0.9674, + "step": 3888 + }, + { + "epoch": 0.4149665306557858, + "grad_norm": 11.13636668286441, + "learning_rate": 1.4541895765663644e-06, + "loss": -1.083, + "step": 3890 + }, + { + "epoch": 0.41517988105715126, + "grad_norm": 9.882330861502682, + "learning_rate": 1.4535258990150644e-06, + "loss": -0.4966, + "step": 3892 + }, + { + "epoch": 0.4153932314585167, + "grad_norm": 23.108544773171474, + "learning_rate": 1.4528619698764916e-06, + "loss": -1.1224, + "step": 3894 + }, + { + "epoch": 0.41560658185988214, + "grad_norm": 8.065084393992292, + "learning_rate": 1.4521977895189516e-06, + "loss": -0.6081, + "step": 3896 + }, + { + "epoch": 0.4158199322612476, + "grad_norm": 10.35144748957568, + "learning_rate": 1.4515333583108893e-06, + "loss": -0.0065, + "step": 3898 + }, + { + "epoch": 0.416033282662613, + "grad_norm": 5.739977822467415, + "learning_rate": 1.450868676620889e-06, + "loss": 0.5308, + "step": 3900 + }, + { + "epoch": 0.41624663306397847, + "grad_norm": 18.423635795582772, + "learning_rate": 1.4502037448176732e-06, + "loss": -0.2492, + "step": 3902 + }, + { + "epoch": 0.4164599834653439, + "grad_norm": 11.407788644958819, + "learning_rate": 1.4495385632701043e-06, + "loss": -0.5145, + "step": 3904 + }, + { + "epoch": 0.41667333386670935, + "grad_norm": 17.250492223407633, + "learning_rate": 1.4488731323471825e-06, + "loss": 0.1914, + "step": 3906 + }, + { + "epoch": 0.4168866842680748, + "grad_norm": 13.210799516900469, + "learning_rate": 1.4482074524180462e-06, + "loss": -1.1653, + "step": 3908 + }, + { + "epoch": 0.41710003466944023, + "grad_norm": 6.604339047629001, + "learning_rate": 1.4475415238519727e-06, + "loss": 0.1379, + "step": 3910 + }, + { + "epoch": 0.4173133850708057, + "grad_norm": 4.9881731848655955, + "learning_rate": 1.4468753470183763e-06, + "loss": 0.1574, + "step": 3912 + }, + { + "epoch": 0.4175267354721711, + "grad_norm": 14.778786914861252, + "learning_rate": 1.4462089222868098e-06, + "loss": -0.7164, + "step": 3914 + }, + { + "epoch": 0.41774008587353656, + "grad_norm": 6.738250381353129, + "learning_rate": 1.4455422500269627e-06, + "loss": -0.3154, + "step": 3916 + }, + { + "epoch": 0.417953436274902, + "grad_norm": 13.170570631250948, + "learning_rate": 1.4448753306086626e-06, + "loss": -0.5433, + "step": 3918 + }, + { + "epoch": 0.41816678667626744, + "grad_norm": 5.1028857348257, + "learning_rate": 1.4442081644018742e-06, + "loss": 1.2319, + "step": 3920 + }, + { + "epoch": 0.4183801370776329, + "grad_norm": 8.04817282049669, + "learning_rate": 1.4435407517766982e-06, + "loss": -0.5833, + "step": 3922 + }, + { + "epoch": 0.4185934874789983, + "grad_norm": 5.543333760338335, + "learning_rate": 1.442873093103373e-06, + "loss": -0.8108, + "step": 3924 + }, + { + "epoch": 0.41880683788036377, + "grad_norm": 6.465230235469865, + "learning_rate": 1.4422051887522732e-06, + "loss": -0.1626, + "step": 3926 + }, + { + "epoch": 0.4190201882817292, + "grad_norm": 6.1147153690396, + "learning_rate": 1.4415370390939087e-06, + "loss": 0.5269, + "step": 3928 + }, + { + "epoch": 0.41923353868309465, + "grad_norm": 5.080110523821032, + "learning_rate": 1.440868644498928e-06, + "loss": -0.7297, + "step": 3930 + }, + { + "epoch": 0.4194468890844601, + "grad_norm": 5.3021043709141065, + "learning_rate": 1.4402000053381122e-06, + "loss": -0.3416, + "step": 3932 + }, + { + "epoch": 0.41966023948582554, + "grad_norm": 6.103858134501873, + "learning_rate": 1.4395311219823806e-06, + "loss": -0.0671, + "step": 3934 + }, + { + "epoch": 0.419873589887191, + "grad_norm": 5.9868590674704, + "learning_rate": 1.438861994802787e-06, + "loss": -1.131, + "step": 3936 + }, + { + "epoch": 0.4200869402885564, + "grad_norm": 10.516876187290785, + "learning_rate": 1.43819262417052e-06, + "loss": -0.7947, + "step": 3938 + }, + { + "epoch": 0.42030029068992186, + "grad_norm": 12.928120322055655, + "learning_rate": 1.4375230104569042e-06, + "loss": -0.5527, + "step": 3940 + }, + { + "epoch": 0.4205136410912873, + "grad_norm": 12.481368675339098, + "learning_rate": 1.4368531540333986e-06, + "loss": -1.007, + "step": 3942 + }, + { + "epoch": 0.42072699149265275, + "grad_norm": 4.554880095350058, + "learning_rate": 1.4361830552715973e-06, + "loss": -0.5925, + "step": 3944 + }, + { + "epoch": 0.4209403418940182, + "grad_norm": 11.009251448238798, + "learning_rate": 1.4355127145432272e-06, + "loss": -0.8023, + "step": 3946 + }, + { + "epoch": 0.42115369229538363, + "grad_norm": 9.66609143661165, + "learning_rate": 1.4348421322201519e-06, + "loss": 0.1195, + "step": 3948 + }, + { + "epoch": 0.4213670426967491, + "grad_norm": 10.051317253173782, + "learning_rate": 1.434171308674367e-06, + "loss": -0.3243, + "step": 3950 + }, + { + "epoch": 0.4215803930981145, + "grad_norm": 17.024595047553298, + "learning_rate": 1.433500244278003e-06, + "loss": 0.081, + "step": 3952 + }, + { + "epoch": 0.42179374349947996, + "grad_norm": 4.0862712268656995, + "learning_rate": 1.4328289394033233e-06, + "loss": 0.7866, + "step": 3954 + }, + { + "epoch": 0.4220070939008454, + "grad_norm": 19.2957789055391, + "learning_rate": 1.4321573944227252e-06, + "loss": -0.4549, + "step": 3956 + }, + { + "epoch": 0.42222044430221084, + "grad_norm": 15.053834501948307, + "learning_rate": 1.4314856097087395e-06, + "loss": -0.0044, + "step": 3958 + }, + { + "epoch": 0.4224337947035763, + "grad_norm": 9.495430415987746, + "learning_rate": 1.4308135856340291e-06, + "loss": -0.7449, + "step": 3960 + }, + { + "epoch": 0.4226471451049417, + "grad_norm": 8.501717503166493, + "learning_rate": 1.4301413225713903e-06, + "loss": 0.128, + "step": 3962 + }, + { + "epoch": 0.42286049550630717, + "grad_norm": 8.176279313921565, + "learning_rate": 1.429468820893752e-06, + "loss": -0.9805, + "step": 3964 + }, + { + "epoch": 0.4230738459076726, + "grad_norm": 23.910406400655198, + "learning_rate": 1.4287960809741749e-06, + "loss": -0.5621, + "step": 3966 + }, + { + "epoch": 0.42328719630903805, + "grad_norm": 4.01379412569471, + "learning_rate": 1.4281231031858524e-06, + "loss": -0.7727, + "step": 3968 + }, + { + "epoch": 0.4235005467104035, + "grad_norm": 11.992127392257016, + "learning_rate": 1.4274498879021095e-06, + "loss": -0.3956, + "step": 3970 + }, + { + "epoch": 0.42371389711176893, + "grad_norm": 6.785607830551377, + "learning_rate": 1.4267764354964037e-06, + "loss": -1.2169, + "step": 3972 + }, + { + "epoch": 0.4239272475131344, + "grad_norm": 6.077305027293753, + "learning_rate": 1.426102746342323e-06, + "loss": 0.1862, + "step": 3974 + }, + { + "epoch": 0.4241405979144998, + "grad_norm": 5.495524884415415, + "learning_rate": 1.4254288208135873e-06, + "loss": -0.6692, + "step": 3976 + }, + { + "epoch": 0.42435394831586526, + "grad_norm": 5.140112354930525, + "learning_rate": 1.424754659284048e-06, + "loss": 0.513, + "step": 3978 + }, + { + "epoch": 0.4245672987172307, + "grad_norm": 4.488664422234236, + "learning_rate": 1.4240802621276862e-06, + "loss": -1.3173, + "step": 3980 + }, + { + "epoch": 0.42478064911859614, + "grad_norm": 7.580279365063635, + "learning_rate": 1.4234056297186147e-06, + "loss": -1.3735, + "step": 3982 + }, + { + "epoch": 0.4249939995199616, + "grad_norm": 10.844089956475731, + "learning_rate": 1.422730762431077e-06, + "loss": -0.8356, + "step": 3984 + }, + { + "epoch": 0.425207349921327, + "grad_norm": 4.206407035290552, + "learning_rate": 1.4220556606394463e-06, + "loss": -0.005, + "step": 3986 + }, + { + "epoch": 0.42542070032269247, + "grad_norm": 5.484335400987816, + "learning_rate": 1.4213803247182258e-06, + "loss": 0.2021, + "step": 3988 + }, + { + "epoch": 0.4256340507240579, + "grad_norm": 21.10047662380732, + "learning_rate": 1.420704755042049e-06, + "loss": -1.0204, + "step": 3990 + }, + { + "epoch": 0.42584740112542335, + "grad_norm": 11.541032476494708, + "learning_rate": 1.420028951985679e-06, + "loss": -1.0231, + "step": 3992 + }, + { + "epoch": 0.4260607515267888, + "grad_norm": 16.042813291021844, + "learning_rate": 1.419352915924008e-06, + "loss": -1.3542, + "step": 3994 + }, + { + "epoch": 0.42627410192815424, + "grad_norm": 2.2683628020710516, + "learning_rate": 1.4186766472320582e-06, + "loss": 0.1584, + "step": 3996 + }, + { + "epoch": 0.4264874523295197, + "grad_norm": 5.841967117423284, + "learning_rate": 1.4180001462849797e-06, + "loss": -0.0674, + "step": 3998 + }, + { + "epoch": 0.4267008027308851, + "grad_norm": 3.025176683859653, + "learning_rate": 1.4173234134580526e-06, + "loss": -0.7912, + "step": 4000 + }, + { + "epoch": 0.42691415313225056, + "grad_norm": 2.9680620303618195, + "learning_rate": 1.4166464491266856e-06, + "loss": -0.3055, + "step": 4002 + }, + { + "epoch": 0.427127503533616, + "grad_norm": 3.8931449298436958, + "learning_rate": 1.4159692536664144e-06, + "loss": -1.2022, + "step": 4004 + }, + { + "epoch": 0.42734085393498145, + "grad_norm": 4.56985797885865, + "learning_rate": 1.4152918274529046e-06, + "loss": 0.3853, + "step": 4006 + }, + { + "epoch": 0.4275542043363469, + "grad_norm": 11.82705029534229, + "learning_rate": 1.4146141708619487e-06, + "loss": 0.4052, + "step": 4008 + }, + { + "epoch": 0.42776755473771233, + "grad_norm": 7.752805438305697, + "learning_rate": 1.4139362842694677e-06, + "loss": -0.6557, + "step": 4010 + }, + { + "epoch": 0.42798090513907777, + "grad_norm": 4.476500979392576, + "learning_rate": 1.4132581680515097e-06, + "loss": 0.2613, + "step": 4012 + }, + { + "epoch": 0.4281942555404432, + "grad_norm": 12.297788104097421, + "learning_rate": 1.4125798225842502e-06, + "loss": -1.2171, + "step": 4014 + }, + { + "epoch": 0.42840760594180866, + "grad_norm": 11.955052810312644, + "learning_rate": 1.4119012482439928e-06, + "loss": -0.4417, + "step": 4016 + }, + { + "epoch": 0.4286209563431741, + "grad_norm": 9.633215839005958, + "learning_rate": 1.4112224454071661e-06, + "loss": -0.6075, + "step": 4018 + }, + { + "epoch": 0.42883430674453954, + "grad_norm": 11.158225785192515, + "learning_rate": 1.410543414450328e-06, + "loss": 0.4019, + "step": 4020 + }, + { + "epoch": 0.429047657145905, + "grad_norm": 10.441371467491765, + "learning_rate": 1.4098641557501605e-06, + "loss": -0.8726, + "step": 4022 + }, + { + "epoch": 0.4292610075472704, + "grad_norm": 3.0540093704298155, + "learning_rate": 1.4091846696834738e-06, + "loss": 0.5318, + "step": 4024 + }, + { + "epoch": 0.42947435794863587, + "grad_norm": 4.115340878896579, + "learning_rate": 1.4085049566272028e-06, + "loss": 0.7905, + "step": 4026 + }, + { + "epoch": 0.4296877083500013, + "grad_norm": 7.832721889700597, + "learning_rate": 1.4078250169584095e-06, + "loss": -0.3018, + "step": 4028 + }, + { + "epoch": 0.42990105875136675, + "grad_norm": 9.338502527929311, + "learning_rate": 1.4071448510542815e-06, + "loss": -0.7675, + "step": 4030 + }, + { + "epoch": 0.4301144091527322, + "grad_norm": 21.579437583175366, + "learning_rate": 1.4064644592921304e-06, + "loss": -1.1669, + "step": 4032 + }, + { + "epoch": 0.4303277595540977, + "grad_norm": 10.109927357754493, + "learning_rate": 1.4057838420493947e-06, + "loss": -1.0436, + "step": 4034 + }, + { + "epoch": 0.43054110995546313, + "grad_norm": 16.516450539845547, + "learning_rate": 1.405102999703638e-06, + "loss": -0.3367, + "step": 4036 + }, + { + "epoch": 0.4307544603568286, + "grad_norm": 6.201196211880813, + "learning_rate": 1.4044219326325475e-06, + "loss": 0.0267, + "step": 4038 + }, + { + "epoch": 0.430967810758194, + "grad_norm": 7.192446647216055, + "learning_rate": 1.4037406412139365e-06, + "loss": -0.157, + "step": 4040 + }, + { + "epoch": 0.43118116115955946, + "grad_norm": 8.900003557647027, + "learning_rate": 1.4030591258257413e-06, + "loss": -0.8874, + "step": 4042 + }, + { + "epoch": 0.4313945115609249, + "grad_norm": 15.380208899112041, + "learning_rate": 1.402377386846024e-06, + "loss": -0.0878, + "step": 4044 + }, + { + "epoch": 0.43160786196229034, + "grad_norm": 8.168380931074411, + "learning_rate": 1.4016954246529694e-06, + "loss": 0.1239, + "step": 4046 + }, + { + "epoch": 0.4318212123636558, + "grad_norm": 9.874931174485805, + "learning_rate": 1.4010132396248868e-06, + "loss": -0.5172, + "step": 4048 + }, + { + "epoch": 0.4320345627650212, + "grad_norm": 6.2386281240094625, + "learning_rate": 1.4003308321402091e-06, + "loss": -0.9584, + "step": 4050 + }, + { + "epoch": 0.43224791316638667, + "grad_norm": 7.5913964187706195, + "learning_rate": 1.3996482025774925e-06, + "loss": -1.0215, + "step": 4052 + }, + { + "epoch": 0.4324612635677521, + "grad_norm": 12.427880949233717, + "learning_rate": 1.3989653513154163e-06, + "loss": -0.7783, + "step": 4054 + }, + { + "epoch": 0.43267461396911755, + "grad_norm": 12.1671073716645, + "learning_rate": 1.3982822787327827e-06, + "loss": -0.1999, + "step": 4056 + }, + { + "epoch": 0.432887964370483, + "grad_norm": 7.4267674511925925, + "learning_rate": 1.3975989852085175e-06, + "loss": 0.6351, + "step": 4058 + }, + { + "epoch": 0.43310131477184843, + "grad_norm": 32.133525536609575, + "learning_rate": 1.3969154711216678e-06, + "loss": 0.1598, + "step": 4060 + }, + { + "epoch": 0.4333146651732139, + "grad_norm": 16.72782082277909, + "learning_rate": 1.3962317368514034e-06, + "loss": -0.5512, + "step": 4062 + }, + { + "epoch": 0.4335280155745793, + "grad_norm": 13.069314466885357, + "learning_rate": 1.3955477827770173e-06, + "loss": -1.4918, + "step": 4064 + }, + { + "epoch": 0.43374136597594476, + "grad_norm": 65.59728267906638, + "learning_rate": 1.3948636092779232e-06, + "loss": 0.177, + "step": 4066 + }, + { + "epoch": 0.4339547163773102, + "grad_norm": 13.103752811044153, + "learning_rate": 1.394179216733657e-06, + "loss": 0.0618, + "step": 4068 + }, + { + "epoch": 0.43416806677867564, + "grad_norm": 5.88666812114249, + "learning_rate": 1.393494605523876e-06, + "loss": -0.7158, + "step": 4070 + }, + { + "epoch": 0.4343814171800411, + "grad_norm": 7.578926063453547, + "learning_rate": 1.392809776028359e-06, + "loss": -0.9872, + "step": 4072 + }, + { + "epoch": 0.4345947675814065, + "grad_norm": 12.969725053969256, + "learning_rate": 1.392124728627006e-06, + "loss": -0.3054, + "step": 4074 + }, + { + "epoch": 0.43480811798277197, + "grad_norm": 20.12123906294212, + "learning_rate": 1.3914394636998373e-06, + "loss": -1.0916, + "step": 4076 + }, + { + "epoch": 0.4350214683841374, + "grad_norm": 3.495966891664299, + "learning_rate": 1.3907539816269945e-06, + "loss": -0.0307, + "step": 4078 + }, + { + "epoch": 0.43523481878550285, + "grad_norm": 4.237485833933858, + "learning_rate": 1.3900682827887388e-06, + "loss": 0.1478, + "step": 4080 + }, + { + "epoch": 0.4354481691868683, + "grad_norm": 11.694741143157062, + "learning_rate": 1.3893823675654532e-06, + "loss": -0.3642, + "step": 4082 + }, + { + "epoch": 0.43566151958823374, + "grad_norm": 11.635767620697962, + "learning_rate": 1.388696236337639e-06, + "loss": -1.3747, + "step": 4084 + }, + { + "epoch": 0.4358748699895992, + "grad_norm": 8.087269702944788, + "learning_rate": 1.388009889485918e-06, + "loss": -0.2237, + "step": 4086 + }, + { + "epoch": 0.4360882203909646, + "grad_norm": 5.764035885479102, + "learning_rate": 1.3873233273910326e-06, + "loss": -0.5253, + "step": 4088 + }, + { + "epoch": 0.43630157079233006, + "grad_norm": 6.740627302562899, + "learning_rate": 1.3866365504338425e-06, + "loss": 0.6706, + "step": 4090 + }, + { + "epoch": 0.4365149211936955, + "grad_norm": 9.049840506020912, + "learning_rate": 1.3859495589953286e-06, + "loss": -0.9122, + "step": 4092 + }, + { + "epoch": 0.43672827159506095, + "grad_norm": 13.682776061623818, + "learning_rate": 1.38526235345659e-06, + "loss": -0.4638, + "step": 4094 + }, + { + "epoch": 0.4369416219964264, + "grad_norm": 21.5243152862706, + "learning_rate": 1.3845749341988441e-06, + "loss": -1.037, + "step": 4096 + }, + { + "epoch": 0.43715497239779183, + "grad_norm": 10.926226790653176, + "learning_rate": 1.3838873016034275e-06, + "loss": -0.0643, + "step": 4098 + }, + { + "epoch": 0.43736832279915727, + "grad_norm": 6.061883461619144, + "learning_rate": 1.3831994560517953e-06, + "loss": -1.2447, + "step": 4100 + }, + { + "epoch": 0.4375816732005227, + "grad_norm": 8.487033636336555, + "learning_rate": 1.3825113979255197e-06, + "loss": -0.1818, + "step": 4102 + }, + { + "epoch": 0.43779502360188816, + "grad_norm": 10.76777536080801, + "learning_rate": 1.381823127606292e-06, + "loss": 0.1261, + "step": 4104 + }, + { + "epoch": 0.4380083740032536, + "grad_norm": 7.573498678259308, + "learning_rate": 1.381134645475921e-06, + "loss": -0.931, + "step": 4106 + }, + { + "epoch": 0.43822172440461904, + "grad_norm": 5.415563869925728, + "learning_rate": 1.3804459519163318e-06, + "loss": -0.4808, + "step": 4108 + }, + { + "epoch": 0.4384350748059845, + "grad_norm": 5.4567482768790505, + "learning_rate": 1.3797570473095688e-06, + "loss": -0.0735, + "step": 4110 + }, + { + "epoch": 0.4386484252073499, + "grad_norm": 5.895080647014312, + "learning_rate": 1.3790679320377914e-06, + "loss": 0.4635, + "step": 4112 + }, + { + "epoch": 0.43886177560871537, + "grad_norm": 3.0446845952138717, + "learning_rate": 1.378378606483277e-06, + "loss": 0.1309, + "step": 4114 + }, + { + "epoch": 0.4390751260100808, + "grad_norm": 9.481123929995697, + "learning_rate": 1.37768907102842e-06, + "loss": 0.4556, + "step": 4116 + }, + { + "epoch": 0.43928847641144625, + "grad_norm": 7.747528424344573, + "learning_rate": 1.3769993260557307e-06, + "loss": -0.372, + "step": 4118 + }, + { + "epoch": 0.4395018268128117, + "grad_norm": 9.475071389067807, + "learning_rate": 1.3763093719478357e-06, + "loss": -0.1492, + "step": 4120 + }, + { + "epoch": 0.43971517721417713, + "grad_norm": 5.938704429384989, + "learning_rate": 1.3756192090874768e-06, + "loss": 0.2839, + "step": 4122 + }, + { + "epoch": 0.4399285276155426, + "grad_norm": 8.292685576479947, + "learning_rate": 1.374928837857513e-06, + "loss": 0.6997, + "step": 4124 + }, + { + "epoch": 0.440141878016908, + "grad_norm": 12.255258299598877, + "learning_rate": 1.3742382586409185e-06, + "loss": -0.1741, + "step": 4126 + }, + { + "epoch": 0.44035522841827346, + "grad_norm": 6.5670686115655705, + "learning_rate": 1.373547471820782e-06, + "loss": -0.484, + "step": 4128 + }, + { + "epoch": 0.4405685788196389, + "grad_norm": 10.53130196445708, + "learning_rate": 1.3728564777803086e-06, + "loss": -1.1797, + "step": 4130 + }, + { + "epoch": 0.44078192922100434, + "grad_norm": 8.768805789873078, + "learning_rate": 1.3721652769028173e-06, + "loss": 0.822, + "step": 4132 + }, + { + "epoch": 0.4409952796223698, + "grad_norm": 3.671062653210065, + "learning_rate": 1.3714738695717429e-06, + "loss": 0.3302, + "step": 4134 + }, + { + "epoch": 0.4412086300237352, + "grad_norm": 7.560183361804276, + "learning_rate": 1.3707822561706334e-06, + "loss": -0.224, + "step": 4136 + }, + { + "epoch": 0.44142198042510067, + "grad_norm": 17.368442285848577, + "learning_rate": 1.3700904370831525e-06, + "loss": -1.1103, + "step": 4138 + }, + { + "epoch": 0.4416353308264661, + "grad_norm": 5.751304356315676, + "learning_rate": 1.369398412693077e-06, + "loss": 0.4971, + "step": 4140 + }, + { + "epoch": 0.44184868122783155, + "grad_norm": 13.580964092933243, + "learning_rate": 1.3687061833842978e-06, + "loss": -0.1831, + "step": 4142 + }, + { + "epoch": 0.442062031629197, + "grad_norm": 11.689281943873425, + "learning_rate": 1.3680137495408202e-06, + "loss": -0.9796, + "step": 4144 + }, + { + "epoch": 0.44227538203056244, + "grad_norm": 9.926161571307272, + "learning_rate": 1.3673211115467617e-06, + "loss": -0.1472, + "step": 4146 + }, + { + "epoch": 0.4424887324319279, + "grad_norm": 5.628222121048884, + "learning_rate": 1.3666282697863544e-06, + "loss": -0.5658, + "step": 4148 + }, + { + "epoch": 0.4427020828332933, + "grad_norm": 21.55251703961161, + "learning_rate": 1.365935224643942e-06, + "loss": -0.9759, + "step": 4150 + }, + { + "epoch": 0.44291543323465876, + "grad_norm": 5.464145367999125, + "learning_rate": 1.3652419765039824e-06, + "loss": -0.4121, + "step": 4152 + }, + { + "epoch": 0.4431287836360242, + "grad_norm": 21.86721381833333, + "learning_rate": 1.3645485257510454e-06, + "loss": -0.4838, + "step": 4154 + }, + { + "epoch": 0.44334213403738965, + "grad_norm": 10.36206754781976, + "learning_rate": 1.3638548727698127e-06, + "loss": -0.5343, + "step": 4156 + }, + { + "epoch": 0.4435554844387551, + "grad_norm": 15.207114162530383, + "learning_rate": 1.3631610179450796e-06, + "loss": -0.7877, + "step": 4158 + }, + { + "epoch": 0.44376883484012053, + "grad_norm": 8.39899916465029, + "learning_rate": 1.3624669616617523e-06, + "loss": -0.1684, + "step": 4160 + }, + { + "epoch": 0.44398218524148597, + "grad_norm": 19.60794729919366, + "learning_rate": 1.3617727043048488e-06, + "loss": 0.6558, + "step": 4162 + }, + { + "epoch": 0.4441955356428514, + "grad_norm": 19.46760116364577, + "learning_rate": 1.361078246259499e-06, + "loss": -0.4379, + "step": 4164 + }, + { + "epoch": 0.44440888604421686, + "grad_norm": 12.999054443914337, + "learning_rate": 1.360383587910944e-06, + "loss": 0.0614, + "step": 4166 + }, + { + "epoch": 0.4446222364455823, + "grad_norm": 5.815917645486424, + "learning_rate": 1.359688729644536e-06, + "loss": -0.6593, + "step": 4168 + }, + { + "epoch": 0.44483558684694774, + "grad_norm": 10.496310197881899, + "learning_rate": 1.3589936718457375e-06, + "loss": -0.6503, + "step": 4170 + }, + { + "epoch": 0.4450489372483132, + "grad_norm": 14.310410211876418, + "learning_rate": 1.3582984149001232e-06, + "loss": -0.2569, + "step": 4172 + }, + { + "epoch": 0.4452622876496786, + "grad_norm": 15.581433391292151, + "learning_rate": 1.3576029591933772e-06, + "loss": -0.0161, + "step": 4174 + }, + { + "epoch": 0.44547563805104406, + "grad_norm": 15.116973960321218, + "learning_rate": 1.3569073051112932e-06, + "loss": 1.0956, + "step": 4176 + }, + { + "epoch": 0.4456889884524095, + "grad_norm": 4.606977061438454, + "learning_rate": 1.3562114530397768e-06, + "loss": -1.3273, + "step": 4178 + }, + { + "epoch": 0.44590233885377495, + "grad_norm": 6.856563711797961, + "learning_rate": 1.3555154033648417e-06, + "loss": 0.4318, + "step": 4180 + }, + { + "epoch": 0.4461156892551404, + "grad_norm": 14.629236857194718, + "learning_rate": 1.3548191564726124e-06, + "loss": -0.5614, + "step": 4182 + }, + { + "epoch": 0.44632903965650583, + "grad_norm": 4.502652079812518, + "learning_rate": 1.3541227127493214e-06, + "loss": 0.2268, + "step": 4184 + }, + { + "epoch": 0.4465423900578713, + "grad_norm": 13.688836889048439, + "learning_rate": 1.353426072581313e-06, + "loss": -0.8713, + "step": 4186 + }, + { + "epoch": 0.4467557404592367, + "grad_norm": 23.57063062226622, + "learning_rate": 1.352729236355037e-06, + "loss": -1.1336, + "step": 4188 + }, + { + "epoch": 0.44696909086060216, + "grad_norm": 7.779413904052088, + "learning_rate": 1.3520322044570546e-06, + "loss": 0.133, + "step": 4190 + }, + { + "epoch": 0.4471824412619676, + "grad_norm": 1.719567578240573, + "learning_rate": 1.3513349772740349e-06, + "loss": -0.751, + "step": 4192 + }, + { + "epoch": 0.44739579166333304, + "grad_norm": 4.348998198028288, + "learning_rate": 1.3506375551927544e-06, + "loss": -0.4438, + "step": 4194 + }, + { + "epoch": 0.4476091420646985, + "grad_norm": 15.291671426107172, + "learning_rate": 1.349939938600099e-06, + "loss": -1.3087, + "step": 4196 + }, + { + "epoch": 0.4478224924660639, + "grad_norm": 17.66713476172116, + "learning_rate": 1.3492421278830617e-06, + "loss": 0.0154, + "step": 4198 + }, + { + "epoch": 0.44803584286742937, + "grad_norm": 10.406805200043019, + "learning_rate": 1.3485441234287433e-06, + "loss": -0.7413, + "step": 4200 + }, + { + "epoch": 0.4482491932687948, + "grad_norm": 8.031903248751929, + "learning_rate": 1.347845925624353e-06, + "loss": -0.7575, + "step": 4202 + }, + { + "epoch": 0.44846254367016025, + "grad_norm": 52.394367836369, + "learning_rate": 1.3471475348572052e-06, + "loss": 0.1623, + "step": 4204 + }, + { + "epoch": 0.44867589407152575, + "grad_norm": 16.439058717999487, + "learning_rate": 1.3464489515147237e-06, + "loss": -2.1801, + "step": 4206 + }, + { + "epoch": 0.4488892444728912, + "grad_norm": 12.59674978637548, + "learning_rate": 1.3457501759844372e-06, + "loss": -0.4182, + "step": 4208 + }, + { + "epoch": 0.44910259487425663, + "grad_norm": 6.068940700771966, + "learning_rate": 1.3450512086539822e-06, + "loss": -1.2939, + "step": 4210 + }, + { + "epoch": 0.4493159452756221, + "grad_norm": 7.738706275322668, + "learning_rate": 1.3443520499111017e-06, + "loss": 0.3859, + "step": 4212 + }, + { + "epoch": 0.4495292956769875, + "grad_norm": 12.696984268855182, + "learning_rate": 1.3436527001436435e-06, + "loss": 0.1126, + "step": 4214 + }, + { + "epoch": 0.44974264607835296, + "grad_norm": 14.066417818009079, + "learning_rate": 1.3429531597395632e-06, + "loss": -0.1684, + "step": 4216 + }, + { + "epoch": 0.4499559964797184, + "grad_norm": 9.35385455234468, + "learning_rate": 1.3422534290869206e-06, + "loss": 0.3365, + "step": 4218 + }, + { + "epoch": 0.45016934688108384, + "grad_norm": 15.926813951454921, + "learning_rate": 1.3415535085738819e-06, + "loss": -0.4131, + "step": 4220 + }, + { + "epoch": 0.4503826972824493, + "grad_norm": 14.032092283454821, + "learning_rate": 1.3408533985887183e-06, + "loss": -0.5715, + "step": 4222 + }, + { + "epoch": 0.4505960476838147, + "grad_norm": 7.207428484971019, + "learning_rate": 1.3401530995198064e-06, + "loss": -0.0725, + "step": 4224 + }, + { + "epoch": 0.45080939808518017, + "grad_norm": 1.646477605950923, + "learning_rate": 1.3394526117556275e-06, + "loss": -0.1308, + "step": 4226 + }, + { + "epoch": 0.4510227484865456, + "grad_norm": 9.844939098165417, + "learning_rate": 1.338751935684767e-06, + "loss": -0.1188, + "step": 4228 + }, + { + "epoch": 0.45123609888791105, + "grad_norm": 32.74222278509314, + "learning_rate": 1.3380510716959168e-06, + "loss": -0.5154, + "step": 4230 + }, + { + "epoch": 0.4514494492892765, + "grad_norm": 16.135746066128124, + "learning_rate": 1.33735002017787e-06, + "loss": -1.0133, + "step": 4232 + }, + { + "epoch": 0.45166279969064194, + "grad_norm": 7.705043243899408, + "learning_rate": 1.336648781519526e-06, + "loss": -0.5154, + "step": 4234 + }, + { + "epoch": 0.4518761500920074, + "grad_norm": 19.30589606829004, + "learning_rate": 1.3359473561098873e-06, + "loss": -1.476, + "step": 4236 + }, + { + "epoch": 0.4520895004933728, + "grad_norm": 7.944067040134553, + "learning_rate": 1.33524574433806e-06, + "loss": 0.2563, + "step": 4238 + }, + { + "epoch": 0.45230285089473826, + "grad_norm": 23.327604775369522, + "learning_rate": 1.3345439465932537e-06, + "loss": -0.6428, + "step": 4240 + }, + { + "epoch": 0.4525162012961037, + "grad_norm": 2.7593121581299096, + "learning_rate": 1.3338419632647811e-06, + "loss": -0.0952, + "step": 4242 + }, + { + "epoch": 0.45272955169746915, + "grad_norm": 12.703953316350042, + "learning_rate": 1.3331397947420575e-06, + "loss": 0.1548, + "step": 4244 + }, + { + "epoch": 0.4529429020988346, + "grad_norm": 7.231114671726472, + "learning_rate": 1.3324374414146016e-06, + "loss": -0.8308, + "step": 4246 + }, + { + "epoch": 0.45315625250020003, + "grad_norm": 8.663999394076267, + "learning_rate": 1.331734903672034e-06, + "loss": -0.4839, + "step": 4248 + }, + { + "epoch": 0.45336960290156547, + "grad_norm": 9.558845908032373, + "learning_rate": 1.3310321819040779e-06, + "loss": -0.1037, + "step": 4250 + }, + { + "epoch": 0.4535829533029309, + "grad_norm": 19.014509875003515, + "learning_rate": 1.3303292765005589e-06, + "loss": -0.8341, + "step": 4252 + }, + { + "epoch": 0.45379630370429636, + "grad_norm": 15.859284379002853, + "learning_rate": 1.329626187851404e-06, + "loss": 0.0748, + "step": 4254 + }, + { + "epoch": 0.4540096541056618, + "grad_norm": 10.773250818438424, + "learning_rate": 1.328922916346642e-06, + "loss": -1.0728, + "step": 4256 + }, + { + "epoch": 0.45422300450702724, + "grad_norm": 7.504152446162364, + "learning_rate": 1.328219462376403e-06, + "loss": 0.0275, + "step": 4258 + }, + { + "epoch": 0.4544363549083927, + "grad_norm": 16.969181055104944, + "learning_rate": 1.3275158263309182e-06, + "loss": 0.4795, + "step": 4260 + }, + { + "epoch": 0.4546497053097581, + "grad_norm": 3.1390119493191877, + "learning_rate": 1.3268120086005204e-06, + "loss": -0.2793, + "step": 4262 + }, + { + "epoch": 0.45486305571112357, + "grad_norm": 4.9063290518375196, + "learning_rate": 1.3261080095756428e-06, + "loss": -0.9313, + "step": 4264 + }, + { + "epoch": 0.455076406112489, + "grad_norm": 7.993120456208586, + "learning_rate": 1.3254038296468188e-06, + "loss": 0.4007, + "step": 4266 + }, + { + "epoch": 0.45528975651385445, + "grad_norm": 11.615733741827826, + "learning_rate": 1.3246994692046835e-06, + "loss": -0.5834, + "step": 4268 + }, + { + "epoch": 0.4555031069152199, + "grad_norm": 4.811338355048313, + "learning_rate": 1.3239949286399702e-06, + "loss": -0.5466, + "step": 4270 + }, + { + "epoch": 0.45571645731658533, + "grad_norm": 7.227819610336676, + "learning_rate": 1.3232902083435137e-06, + "loss": -1.2736, + "step": 4272 + }, + { + "epoch": 0.4559298077179508, + "grad_norm": 14.728087851955863, + "learning_rate": 1.322585308706248e-06, + "loss": -0.0124, + "step": 4274 + }, + { + "epoch": 0.4561431581193162, + "grad_norm": 18.60464527244425, + "learning_rate": 1.3218802301192058e-06, + "loss": -0.5882, + "step": 4276 + }, + { + "epoch": 0.45635650852068166, + "grad_norm": 7.992587419900394, + "learning_rate": 1.3211749729735204e-06, + "loss": 0.5317, + "step": 4278 + }, + { + "epoch": 0.4565698589220471, + "grad_norm": 2.6133916782058177, + "learning_rate": 1.3204695376604234e-06, + "loss": -0.6653, + "step": 4280 + }, + { + "epoch": 0.45678320932341254, + "grad_norm": 10.460284137453474, + "learning_rate": 1.3197639245712452e-06, + "loss": -0.8603, + "step": 4282 + }, + { + "epoch": 0.456996559724778, + "grad_norm": 10.489187750897747, + "learning_rate": 1.3190581340974153e-06, + "loss": -0.3257, + "step": 4284 + }, + { + "epoch": 0.4572099101261434, + "grad_norm": 8.85822382779699, + "learning_rate": 1.3183521666304609e-06, + "loss": -0.5889, + "step": 4286 + }, + { + "epoch": 0.45742326052750887, + "grad_norm": 64.75912541288587, + "learning_rate": 1.317646022562008e-06, + "loss": -1.2003, + "step": 4288 + }, + { + "epoch": 0.4576366109288743, + "grad_norm": 5.003061202234531, + "learning_rate": 1.3169397022837802e-06, + "loss": -0.3862, + "step": 4290 + }, + { + "epoch": 0.45784996133023975, + "grad_norm": 13.961865433285054, + "learning_rate": 1.3162332061875993e-06, + "loss": -0.1786, + "step": 4292 + }, + { + "epoch": 0.4580633117316052, + "grad_norm": 8.256386741166033, + "learning_rate": 1.315526534665384e-06, + "loss": -0.0831, + "step": 4294 + }, + { + "epoch": 0.45827666213297064, + "grad_norm": 12.459491927890799, + "learning_rate": 1.3148196881091512e-06, + "loss": 0.1897, + "step": 4296 + }, + { + "epoch": 0.4584900125343361, + "grad_norm": 10.950577952145183, + "learning_rate": 1.3141126669110134e-06, + "loss": -0.6855, + "step": 4298 + }, + { + "epoch": 0.4587033629357015, + "grad_norm": 5.105086974275276, + "learning_rate": 1.3134054714631816e-06, + "loss": -0.3922, + "step": 4300 + }, + { + "epoch": 0.45891671333706696, + "grad_norm": 16.86561356541645, + "learning_rate": 1.3126981021579624e-06, + "loss": -0.191, + "step": 4302 + }, + { + "epoch": 0.4591300637384324, + "grad_norm": 17.443032522079413, + "learning_rate": 1.3119905593877592e-06, + "loss": -0.9906, + "step": 4304 + }, + { + "epoch": 0.45934341413979785, + "grad_norm": 17.551625725033443, + "learning_rate": 1.3112828435450722e-06, + "loss": -0.8583, + "step": 4306 + }, + { + "epoch": 0.4595567645411633, + "grad_norm": 7.1459032094902195, + "learning_rate": 1.3105749550224964e-06, + "loss": -0.0604, + "step": 4308 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 3.0945000387312955, + "learning_rate": 1.3098668942127238e-06, + "loss": -0.7886, + "step": 4310 + }, + { + "epoch": 0.45998346534389417, + "grad_norm": 14.590468140984948, + "learning_rate": 1.3091586615085412e-06, + "loss": -0.4518, + "step": 4312 + }, + { + "epoch": 0.4601968157452596, + "grad_norm": 8.347775272555465, + "learning_rate": 1.308450257302831e-06, + "loss": -0.6866, + "step": 4314 + }, + { + "epoch": 0.46041016614662506, + "grad_norm": 4.763502547536105, + "learning_rate": 1.3077416819885706e-06, + "loss": 0.3771, + "step": 4316 + }, + { + "epoch": 0.4606235165479905, + "grad_norm": 9.230317729997811, + "learning_rate": 1.307032935958832e-06, + "loss": -0.8715, + "step": 4318 + }, + { + "epoch": 0.46083686694935594, + "grad_norm": 4.983748758021443, + "learning_rate": 1.3063240196067836e-06, + "loss": -0.4145, + "step": 4320 + }, + { + "epoch": 0.4610502173507214, + "grad_norm": 13.623443372404992, + "learning_rate": 1.3056149333256858e-06, + "loss": 0.8636, + "step": 4322 + }, + { + "epoch": 0.4612635677520868, + "grad_norm": 11.906086576829582, + "learning_rate": 1.304905677508895e-06, + "loss": -0.3991, + "step": 4324 + }, + { + "epoch": 0.46147691815345226, + "grad_norm": 9.094700364280792, + "learning_rate": 1.3041962525498613e-06, + "loss": -1.0302, + "step": 4326 + }, + { + "epoch": 0.4616902685548177, + "grad_norm": 6.899346224668135, + "learning_rate": 1.3034866588421282e-06, + "loss": -0.7402, + "step": 4328 + }, + { + "epoch": 0.46190361895618315, + "grad_norm": 7.300980040758923, + "learning_rate": 1.302776896779333e-06, + "loss": -0.1963, + "step": 4330 + }, + { + "epoch": 0.4621169693575486, + "grad_norm": 12.923413839272758, + "learning_rate": 1.3020669667552068e-06, + "loss": 0.0475, + "step": 4332 + }, + { + "epoch": 0.46233031975891403, + "grad_norm": 16.71130795328477, + "learning_rate": 1.3013568691635732e-06, + "loss": -1.6059, + "step": 4334 + }, + { + "epoch": 0.4625436701602795, + "grad_norm": 7.05061087622635, + "learning_rate": 1.3006466043983496e-06, + "loss": 0.2508, + "step": 4336 + }, + { + "epoch": 0.4627570205616449, + "grad_norm": 5.013131826154311, + "learning_rate": 1.2999361728535447e-06, + "loss": -0.7492, + "step": 4338 + }, + { + "epoch": 0.46297037096301036, + "grad_norm": 12.01960809410301, + "learning_rate": 1.299225574923262e-06, + "loss": -0.0667, + "step": 4340 + }, + { + "epoch": 0.4631837213643758, + "grad_norm": 2.8823014249934835, + "learning_rate": 1.2985148110016947e-06, + "loss": -0.9636, + "step": 4342 + }, + { + "epoch": 0.46339707176574124, + "grad_norm": 10.449797518294671, + "learning_rate": 1.29780388148313e-06, + "loss": -0.958, + "step": 4344 + }, + { + "epoch": 0.4636104221671067, + "grad_norm": 8.853742926489918, + "learning_rate": 1.2970927867619459e-06, + "loss": -0.142, + "step": 4346 + }, + { + "epoch": 0.4638237725684721, + "grad_norm": 13.65141265251894, + "learning_rate": 1.2963815272326128e-06, + "loss": 0.6607, + "step": 4348 + }, + { + "epoch": 0.46403712296983757, + "grad_norm": 4.658990883099752, + "learning_rate": 1.2956701032896923e-06, + "loss": -0.216, + "step": 4350 + }, + { + "epoch": 0.464250473371203, + "grad_norm": 5.28671710136447, + "learning_rate": 1.2949585153278365e-06, + "loss": 0.3112, + "step": 4352 + }, + { + "epoch": 0.46446382377256845, + "grad_norm": 5.86497583092819, + "learning_rate": 1.2942467637417899e-06, + "loss": -0.7457, + "step": 4354 + }, + { + "epoch": 0.4646771741739339, + "grad_norm": 3.9562649090345436, + "learning_rate": 1.2935348489263861e-06, + "loss": -0.7077, + "step": 4356 + }, + { + "epoch": 0.46489052457529934, + "grad_norm": 12.663869090660674, + "learning_rate": 1.2928227712765502e-06, + "loss": -0.5329, + "step": 4358 + }, + { + "epoch": 0.4651038749766648, + "grad_norm": 10.648698817828215, + "learning_rate": 1.2921105311872981e-06, + "loss": -0.0253, + "step": 4360 + }, + { + "epoch": 0.4653172253780302, + "grad_norm": 6.372143270944333, + "learning_rate": 1.2913981290537348e-06, + "loss": -0.0653, + "step": 4362 + }, + { + "epoch": 0.46553057577939566, + "grad_norm": 6.439440232158279, + "learning_rate": 1.2906855652710554e-06, + "loss": -0.3021, + "step": 4364 + }, + { + "epoch": 0.4657439261807611, + "grad_norm": 10.74113820521704, + "learning_rate": 1.2899728402345456e-06, + "loss": -0.0159, + "step": 4366 + }, + { + "epoch": 0.46595727658212654, + "grad_norm": 5.283916802206519, + "learning_rate": 1.2892599543395792e-06, + "loss": -0.4462, + "step": 4368 + }, + { + "epoch": 0.466170626983492, + "grad_norm": 19.268692071023377, + "learning_rate": 1.2885469079816197e-06, + "loss": -0.6327, + "step": 4370 + }, + { + "epoch": 0.46638397738485743, + "grad_norm": 2.6225113141111422, + "learning_rate": 1.2878337015562206e-06, + "loss": -0.7265, + "step": 4372 + }, + { + "epoch": 0.46659732778622287, + "grad_norm": 6.156249514437024, + "learning_rate": 1.2871203354590227e-06, + "loss": -1.0016, + "step": 4374 + }, + { + "epoch": 0.4668106781875883, + "grad_norm": 6.941200462500259, + "learning_rate": 1.2864068100857564e-06, + "loss": -0.6483, + "step": 4376 + }, + { + "epoch": 0.4670240285889538, + "grad_norm": 7.836499048250883, + "learning_rate": 1.28569312583224e-06, + "loss": -0.7491, + "step": 4378 + }, + { + "epoch": 0.46723737899031925, + "grad_norm": 15.777497075561797, + "learning_rate": 1.28497928309438e-06, + "loss": -1.1427, + "step": 4380 + }, + { + "epoch": 0.4674507293916847, + "grad_norm": 15.386744007334778, + "learning_rate": 1.284265282268171e-06, + "loss": -0.0934, + "step": 4382 + }, + { + "epoch": 0.46766407979305014, + "grad_norm": 9.84695807797469, + "learning_rate": 1.283551123749695e-06, + "loss": -0.5291, + "step": 4384 + }, + { + "epoch": 0.4678774301944156, + "grad_norm": 27.990071085282693, + "learning_rate": 1.2828368079351216e-06, + "loss": -0.5687, + "step": 4386 + }, + { + "epoch": 0.468090780595781, + "grad_norm": 3.850379673369808, + "learning_rate": 1.2821223352207082e-06, + "loss": -0.4535, + "step": 4388 + }, + { + "epoch": 0.46830413099714646, + "grad_norm": 5.103955557035587, + "learning_rate": 1.281407706002798e-06, + "loss": 0.0035, + "step": 4390 + }, + { + "epoch": 0.4685174813985119, + "grad_norm": 8.774602099695729, + "learning_rate": 1.2806929206778225e-06, + "loss": -1.3779, + "step": 4392 + }, + { + "epoch": 0.46873083179987735, + "grad_norm": 20.98949345586214, + "learning_rate": 1.2799779796422986e-06, + "loss": -0.7346, + "step": 4394 + }, + { + "epoch": 0.4689441822012428, + "grad_norm": 13.421276813772895, + "learning_rate": 1.27926288329283e-06, + "loss": -0.5927, + "step": 4396 + }, + { + "epoch": 0.46915753260260823, + "grad_norm": 2.11756345670777, + "learning_rate": 1.2785476320261067e-06, + "loss": -0.7593, + "step": 4398 + }, + { + "epoch": 0.46937088300397367, + "grad_norm": 8.22705338478405, + "learning_rate": 1.2778322262389047e-06, + "loss": 0.0115, + "step": 4400 + }, + { + "epoch": 0.4695842334053391, + "grad_norm": 7.084002619422214, + "learning_rate": 1.2771166663280854e-06, + "loss": -0.7357, + "step": 4402 + }, + { + "epoch": 0.46979758380670456, + "grad_norm": 4.388575464805909, + "learning_rate": 1.276400952690596e-06, + "loss": -1.3197, + "step": 4404 + }, + { + "epoch": 0.47001093420807, + "grad_norm": 14.135717362131533, + "learning_rate": 1.2756850857234685e-06, + "loss": -0.3283, + "step": 4406 + }, + { + "epoch": 0.47022428460943544, + "grad_norm": 13.577868499953045, + "learning_rate": 1.2749690658238204e-06, + "loss": -0.4527, + "step": 4408 + }, + { + "epoch": 0.4704376350108009, + "grad_norm": 6.398065663167572, + "learning_rate": 1.274252893388854e-06, + "loss": -0.1612, + "step": 4410 + }, + { + "epoch": 0.4706509854121663, + "grad_norm": 9.838865309637782, + "learning_rate": 1.273536568815856e-06, + "loss": -0.7704, + "step": 4412 + }, + { + "epoch": 0.47086433581353176, + "grad_norm": 8.882666342490582, + "learning_rate": 1.272820092502198e-06, + "loss": -0.8816, + "step": 4414 + }, + { + "epoch": 0.4710776862148972, + "grad_norm": 12.42776431452187, + "learning_rate": 1.272103464845335e-06, + "loss": -0.6595, + "step": 4416 + }, + { + "epoch": 0.47129103661626265, + "grad_norm": 3.8384531025793818, + "learning_rate": 1.2713866862428067e-06, + "loss": 0.3371, + "step": 4418 + }, + { + "epoch": 0.4715043870176281, + "grad_norm": 4.78696905371181, + "learning_rate": 1.2706697570922358e-06, + "loss": -0.5369, + "step": 4420 + }, + { + "epoch": 0.47171773741899353, + "grad_norm": 14.510319133494733, + "learning_rate": 1.2699526777913292e-06, + "loss": 0.5357, + "step": 4422 + }, + { + "epoch": 0.471931087820359, + "grad_norm": 6.80407644788293, + "learning_rate": 1.2692354487378768e-06, + "loss": -0.6157, + "step": 4424 + }, + { + "epoch": 0.4721444382217244, + "grad_norm": 9.266471139244477, + "learning_rate": 1.2685180703297513e-06, + "loss": -0.4389, + "step": 4426 + }, + { + "epoch": 0.47235778862308986, + "grad_norm": 16.110849513425503, + "learning_rate": 1.267800542964909e-06, + "loss": -0.3114, + "step": 4428 + }, + { + "epoch": 0.4725711390244553, + "grad_norm": 9.22043482677727, + "learning_rate": 1.2670828670413883e-06, + "loss": -0.69, + "step": 4430 + }, + { + "epoch": 0.47278448942582074, + "grad_norm": 14.042813760069581, + "learning_rate": 1.2663650429573095e-06, + "loss": 0.1055, + "step": 4432 + }, + { + "epoch": 0.4729978398271862, + "grad_norm": 14.235566293593651, + "learning_rate": 1.2656470711108762e-06, + "loss": -1.594, + "step": 4434 + }, + { + "epoch": 0.4732111902285516, + "grad_norm": 9.076831046927458, + "learning_rate": 1.2649289519003736e-06, + "loss": -0.3433, + "step": 4436 + }, + { + "epoch": 0.47342454062991707, + "grad_norm": 11.69636412292387, + "learning_rate": 1.2642106857241677e-06, + "loss": -0.5291, + "step": 4438 + }, + { + "epoch": 0.4736378910312825, + "grad_norm": 4.416341532231857, + "learning_rate": 1.2634922729807076e-06, + "loss": -0.7143, + "step": 4440 + }, + { + "epoch": 0.47385124143264795, + "grad_norm": 15.495740475246544, + "learning_rate": 1.2627737140685226e-06, + "loss": 0.1331, + "step": 4442 + }, + { + "epoch": 0.4740645918340134, + "grad_norm": 13.177738985192402, + "learning_rate": 1.2620550093862234e-06, + "loss": -0.2522, + "step": 4444 + }, + { + "epoch": 0.47427794223537884, + "grad_norm": 8.777140608534667, + "learning_rate": 1.2613361593325016e-06, + "loss": 0.4565, + "step": 4446 + }, + { + "epoch": 0.4744912926367443, + "grad_norm": 17.675173683102255, + "learning_rate": 1.2606171643061292e-06, + "loss": -0.2671, + "step": 4448 + }, + { + "epoch": 0.4747046430381097, + "grad_norm": 9.670891231969337, + "learning_rate": 1.2598980247059592e-06, + "loss": 0.4706, + "step": 4450 + }, + { + "epoch": 0.47491799343947516, + "grad_norm": 7.096326141526543, + "learning_rate": 1.259178740930924e-06, + "loss": -0.6008, + "step": 4452 + }, + { + "epoch": 0.4751313438408406, + "grad_norm": 4.524877764271734, + "learning_rate": 1.2584593133800372e-06, + "loss": 0.3817, + "step": 4454 + }, + { + "epoch": 0.47534469424220605, + "grad_norm": 9.325317315264588, + "learning_rate": 1.2577397424523903e-06, + "loss": -0.1513, + "step": 4456 + }, + { + "epoch": 0.4755580446435715, + "grad_norm": 16.146045831876503, + "learning_rate": 1.2570200285471564e-06, + "loss": -0.4314, + "step": 4458 + }, + { + "epoch": 0.47577139504493693, + "grad_norm": 1.5206328859099232, + "learning_rate": 1.2563001720635863e-06, + "loss": -0.8729, + "step": 4460 + }, + { + "epoch": 0.47598474544630237, + "grad_norm": 7.240853947308043, + "learning_rate": 1.2555801734010102e-06, + "loss": 0.914, + "step": 4462 + }, + { + "epoch": 0.4761980958476678, + "grad_norm": 11.815653864397458, + "learning_rate": 1.2548600329588387e-06, + "loss": -0.0145, + "step": 4464 + }, + { + "epoch": 0.47641144624903325, + "grad_norm": 14.003308341109388, + "learning_rate": 1.2541397511365583e-06, + "loss": 0.2121, + "step": 4466 + }, + { + "epoch": 0.4766247966503987, + "grad_norm": 4.698987371123391, + "learning_rate": 1.2534193283337369e-06, + "loss": 0.1048, + "step": 4468 + }, + { + "epoch": 0.47683814705176414, + "grad_norm": 12.135282277523169, + "learning_rate": 1.252698764950018e-06, + "loss": -0.4957, + "step": 4470 + }, + { + "epoch": 0.4770514974531296, + "grad_norm": 15.414519034970269, + "learning_rate": 1.251978061385125e-06, + "loss": 0.2921, + "step": 4472 + }, + { + "epoch": 0.477264847854495, + "grad_norm": 7.473340481352623, + "learning_rate": 1.2512572180388583e-06, + "loss": -1.2826, + "step": 4474 + }, + { + "epoch": 0.47747819825586046, + "grad_norm": 8.559210012805837, + "learning_rate": 1.2505362353110955e-06, + "loss": -0.2939, + "step": 4476 + }, + { + "epoch": 0.4776915486572259, + "grad_norm": 13.915242602492558, + "learning_rate": 1.249815113601792e-06, + "loss": -0.8636, + "step": 4478 + }, + { + "epoch": 0.47790489905859135, + "grad_norm": 6.307136971788797, + "learning_rate": 1.24909385331098e-06, + "loss": 0.0403, + "step": 4480 + }, + { + "epoch": 0.4781182494599568, + "grad_norm": 9.055877678436463, + "learning_rate": 1.2483724548387695e-06, + "loss": -1.1216, + "step": 4482 + }, + { + "epoch": 0.47833159986132223, + "grad_norm": 8.750311242040427, + "learning_rate": 1.2476509185853455e-06, + "loss": -0.2075, + "step": 4484 + }, + { + "epoch": 0.4785449502626877, + "grad_norm": 11.059948261317459, + "learning_rate": 1.2469292449509707e-06, + "loss": -1.7432, + "step": 4486 + }, + { + "epoch": 0.4787583006640531, + "grad_norm": 8.774480140920161, + "learning_rate": 1.2462074343359838e-06, + "loss": -0.1438, + "step": 4488 + }, + { + "epoch": 0.47897165106541856, + "grad_norm": 5.2731449025575206, + "learning_rate": 1.245485487140799e-06, + "loss": -0.4711, + "step": 4490 + }, + { + "epoch": 0.479185001466784, + "grad_norm": 13.191758597280291, + "learning_rate": 1.2447634037659071e-06, + "loss": -0.216, + "step": 4492 + }, + { + "epoch": 0.47939835186814944, + "grad_norm": 10.4008495304413, + "learning_rate": 1.2440411846118735e-06, + "loss": -0.0535, + "step": 4494 + }, + { + "epoch": 0.4796117022695149, + "grad_norm": 29.985250839717285, + "learning_rate": 1.2433188300793398e-06, + "loss": -0.3686, + "step": 4496 + }, + { + "epoch": 0.4798250526708803, + "grad_norm": 9.625533765924926, + "learning_rate": 1.2425963405690217e-06, + "loss": -1.2015, + "step": 4498 + }, + { + "epoch": 0.48003840307224577, + "grad_norm": 6.515581356847022, + "learning_rate": 1.2418737164817107e-06, + "loss": -0.5768, + "step": 4500 + }, + { + "epoch": 0.4802517534736112, + "grad_norm": 20.157209076613583, + "learning_rate": 1.2411509582182726e-06, + "loss": -0.5799, + "step": 4502 + }, + { + "epoch": 0.48046510387497665, + "grad_norm": 6.537780251372035, + "learning_rate": 1.2404280661796476e-06, + "loss": -0.2906, + "step": 4504 + }, + { + "epoch": 0.4806784542763421, + "grad_norm": 10.846772653815831, + "learning_rate": 1.2397050407668499e-06, + "loss": 0.2712, + "step": 4506 + }, + { + "epoch": 0.48089180467770754, + "grad_norm": 14.611655685688662, + "learning_rate": 1.2389818823809684e-06, + "loss": -1.5096, + "step": 4508 + }, + { + "epoch": 0.481105155079073, + "grad_norm": 8.871907304126829, + "learning_rate": 1.2382585914231648e-06, + "loss": 0.3268, + "step": 4510 + }, + { + "epoch": 0.4813185054804384, + "grad_norm": 9.316870515282826, + "learning_rate": 1.2375351682946755e-06, + "loss": 0.128, + "step": 4512 + }, + { + "epoch": 0.48153185588180386, + "grad_norm": 5.8102533888591825, + "learning_rate": 1.236811613396809e-06, + "loss": -0.0202, + "step": 4514 + }, + { + "epoch": 0.4817452062831693, + "grad_norm": 11.83016883813224, + "learning_rate": 1.2360879271309477e-06, + "loss": -0.8983, + "step": 4516 + }, + { + "epoch": 0.48195855668453474, + "grad_norm": 15.273351918561513, + "learning_rate": 1.2353641098985463e-06, + "loss": 0.013, + "step": 4518 + }, + { + "epoch": 0.4821719070859002, + "grad_norm": 4.730661983578829, + "learning_rate": 1.234640162101133e-06, + "loss": 0.0783, + "step": 4520 + }, + { + "epoch": 0.48238525748726563, + "grad_norm": 20.53063193949676, + "learning_rate": 1.2339160841403073e-06, + "loss": -0.5691, + "step": 4522 + }, + { + "epoch": 0.48259860788863107, + "grad_norm": 9.104099450443687, + "learning_rate": 1.2331918764177423e-06, + "loss": -0.0155, + "step": 4524 + }, + { + "epoch": 0.4828119582899965, + "grad_norm": 5.907743450972621, + "learning_rate": 1.2324675393351817e-06, + "loss": -0.6435, + "step": 4526 + }, + { + "epoch": 0.48302530869136195, + "grad_norm": 5.528787461650035, + "learning_rate": 1.2317430732944416e-06, + "loss": 0.5551, + "step": 4528 + }, + { + "epoch": 0.4832386590927274, + "grad_norm": 13.912061067372578, + "learning_rate": 1.2310184786974101e-06, + "loss": 0.4585, + "step": 4530 + }, + { + "epoch": 0.48345200949409284, + "grad_norm": 20.003388340490677, + "learning_rate": 1.2302937559460452e-06, + "loss": -0.534, + "step": 4532 + }, + { + "epoch": 0.4836653598954583, + "grad_norm": 16.838925200486056, + "learning_rate": 1.2295689054423778e-06, + "loss": 0.7351, + "step": 4534 + }, + { + "epoch": 0.4838787102968237, + "grad_norm": 10.381338804212216, + "learning_rate": 1.2288439275885083e-06, + "loss": -0.4951, + "step": 4536 + }, + { + "epoch": 0.48409206069818916, + "grad_norm": 10.878374878049932, + "learning_rate": 1.2281188227866084e-06, + "loss": 0.1162, + "step": 4538 + }, + { + "epoch": 0.4843054110995546, + "grad_norm": 10.408269876879743, + "learning_rate": 1.2273935914389202e-06, + "loss": -1.4322, + "step": 4540 + }, + { + "epoch": 0.48451876150092005, + "grad_norm": 12.726902760756488, + "learning_rate": 1.2266682339477554e-06, + "loss": 0.0366, + "step": 4542 + }, + { + "epoch": 0.4847321119022855, + "grad_norm": 8.051336730162319, + "learning_rate": 1.2259427507154962e-06, + "loss": -0.8376, + "step": 4544 + }, + { + "epoch": 0.48494546230365093, + "grad_norm": 6.273512315247756, + "learning_rate": 1.225217142144595e-06, + "loss": -0.4874, + "step": 4546 + }, + { + "epoch": 0.48515881270501643, + "grad_norm": 7.1599346548512415, + "learning_rate": 1.2244914086375723e-06, + "loss": 1.1056, + "step": 4548 + }, + { + "epoch": 0.48537216310638187, + "grad_norm": 8.935358278351602, + "learning_rate": 1.2237655505970202e-06, + "loss": -0.8255, + "step": 4550 + }, + { + "epoch": 0.4855855135077473, + "grad_norm": 11.570030681736082, + "learning_rate": 1.223039568425597e-06, + "loss": 1.1273, + "step": 4552 + }, + { + "epoch": 0.48579886390911275, + "grad_norm": 11.811346603617537, + "learning_rate": 1.2223134625260323e-06, + "loss": -0.6168, + "step": 4554 + }, + { + "epoch": 0.4860122143104782, + "grad_norm": 15.775116451092632, + "learning_rate": 1.2215872333011228e-06, + "loss": -0.0359, + "step": 4556 + }, + { + "epoch": 0.48622556471184364, + "grad_norm": 8.669708685902004, + "learning_rate": 1.2208608811537347e-06, + "loss": -0.2268, + "step": 4558 + }, + { + "epoch": 0.4864389151132091, + "grad_norm": 3.676105508952681, + "learning_rate": 1.2201344064868015e-06, + "loss": 0.3688, + "step": 4560 + }, + { + "epoch": 0.4866522655145745, + "grad_norm": 22.8655170040659, + "learning_rate": 1.2194078097033253e-06, + "loss": -1.1593, + "step": 4562 + }, + { + "epoch": 0.48686561591593996, + "grad_norm": 8.240978011682213, + "learning_rate": 1.2186810912063758e-06, + "loss": -0.856, + "step": 4564 + }, + { + "epoch": 0.4870789663173054, + "grad_norm": 17.282596695188143, + "learning_rate": 1.21795425139909e-06, + "loss": -1.0308, + "step": 4566 + }, + { + "epoch": 0.48729231671867085, + "grad_norm": 8.965152458043606, + "learning_rate": 1.217227290684672e-06, + "loss": -0.4099, + "step": 4568 + }, + { + "epoch": 0.4875056671200363, + "grad_norm": 8.505290801326138, + "learning_rate": 1.2165002094663938e-06, + "loss": -0.7557, + "step": 4570 + }, + { + "epoch": 0.48771901752140173, + "grad_norm": 8.785744784118428, + "learning_rate": 1.2157730081475932e-06, + "loss": -1.0014, + "step": 4572 + }, + { + "epoch": 0.4879323679227672, + "grad_norm": 7.6885856447392955, + "learning_rate": 1.2150456871316758e-06, + "loss": 0.0901, + "step": 4574 + }, + { + "epoch": 0.4881457183241326, + "grad_norm": 20.723703502399175, + "learning_rate": 1.2143182468221123e-06, + "loss": -1.5185, + "step": 4576 + }, + { + "epoch": 0.48835906872549806, + "grad_norm": 4.679515338293057, + "learning_rate": 1.2135906876224408e-06, + "loss": -0.6542, + "step": 4578 + }, + { + "epoch": 0.4885724191268635, + "grad_norm": 2.587660155464381, + "learning_rate": 1.2128630099362645e-06, + "loss": 0.2051, + "step": 4580 + }, + { + "epoch": 0.48878576952822894, + "grad_norm": 8.013191661528694, + "learning_rate": 1.2121352141672526e-06, + "loss": -1.0724, + "step": 4582 + }, + { + "epoch": 0.4889991199295944, + "grad_norm": 13.194682632436736, + "learning_rate": 1.2114073007191398e-06, + "loss": 0.2959, + "step": 4584 + }, + { + "epoch": 0.4892124703309598, + "grad_norm": 30.853141676549846, + "learning_rate": 1.2106792699957262e-06, + "loss": -0.504, + "step": 4586 + }, + { + "epoch": 0.48942582073232527, + "grad_norm": 14.564916030048694, + "learning_rate": 1.209951122400877e-06, + "loss": -0.5387, + "step": 4588 + }, + { + "epoch": 0.4896391711336907, + "grad_norm": 9.592957265701498, + "learning_rate": 1.2092228583385217e-06, + "loss": 0.1868, + "step": 4590 + }, + { + "epoch": 0.48985252153505615, + "grad_norm": 8.721499567449033, + "learning_rate": 1.2084944782126553e-06, + "loss": -0.7361, + "step": 4592 + }, + { + "epoch": 0.4900658719364216, + "grad_norm": 4.713498377991401, + "learning_rate": 1.2077659824273363e-06, + "loss": 0.0196, + "step": 4594 + }, + { + "epoch": 0.49027922233778704, + "grad_norm": 13.055652851637328, + "learning_rate": 1.2070373713866877e-06, + "loss": -0.1762, + "step": 4596 + }, + { + "epoch": 0.4904925727391525, + "grad_norm": 11.433633922911113, + "learning_rate": 1.206308645494897e-06, + "loss": -0.7682, + "step": 4598 + }, + { + "epoch": 0.4907059231405179, + "grad_norm": 6.743175718179602, + "learning_rate": 1.2055798051562143e-06, + "loss": -0.3305, + "step": 4600 + }, + { + "epoch": 0.49091927354188336, + "grad_norm": 3.863660066541985, + "learning_rate": 1.204850850774954e-06, + "loss": -0.1976, + "step": 4602 + }, + { + "epoch": 0.4911326239432488, + "grad_norm": 6.168076010641581, + "learning_rate": 1.2041217827554937e-06, + "loss": 0.8196, + "step": 4604 + }, + { + "epoch": 0.49134597434461424, + "grad_norm": 10.29647416406701, + "learning_rate": 1.2033926015022737e-06, + "loss": -0.3058, + "step": 4606 + }, + { + "epoch": 0.4915593247459797, + "grad_norm": 11.635012732388624, + "learning_rate": 1.2026633074197973e-06, + "loss": -0.0888, + "step": 4608 + }, + { + "epoch": 0.49177267514734513, + "grad_norm": 5.842557512591921, + "learning_rate": 1.2019339009126306e-06, + "loss": -0.4134, + "step": 4610 + }, + { + "epoch": 0.49198602554871057, + "grad_norm": 7.577472263680202, + "learning_rate": 1.2012043823854015e-06, + "loss": -0.9818, + "step": 4612 + }, + { + "epoch": 0.492199375950076, + "grad_norm": 12.966349817697527, + "learning_rate": 1.2004747522428006e-06, + "loss": -0.9403, + "step": 4614 + }, + { + "epoch": 0.49241272635144145, + "grad_norm": 6.358270477726682, + "learning_rate": 1.1997450108895806e-06, + "loss": -0.0293, + "step": 4616 + }, + { + "epoch": 0.4926260767528069, + "grad_norm": 12.432836562754504, + "learning_rate": 1.1990151587305547e-06, + "loss": -0.005, + "step": 4618 + }, + { + "epoch": 0.49283942715417234, + "grad_norm": 4.470496998516397, + "learning_rate": 1.1982851961705991e-06, + "loss": -1.231, + "step": 4620 + }, + { + "epoch": 0.4930527775555378, + "grad_norm": 10.051842817773, + "learning_rate": 1.19755512361465e-06, + "loss": -0.0907, + "step": 4622 + }, + { + "epoch": 0.4932661279569032, + "grad_norm": 1.338515503845533, + "learning_rate": 1.1968249414677054e-06, + "loss": -0.3354, + "step": 4624 + }, + { + "epoch": 0.49347947835826866, + "grad_norm": 7.068380546084522, + "learning_rate": 1.1960946501348237e-06, + "loss": 0.1583, + "step": 4626 + }, + { + "epoch": 0.4936928287596341, + "grad_norm": 7.959554339156045, + "learning_rate": 1.1953642500211243e-06, + "loss": 0.1664, + "step": 4628 + }, + { + "epoch": 0.49390617916099955, + "grad_norm": 16.119822871385402, + "learning_rate": 1.1946337415317864e-06, + "loss": -0.9291, + "step": 4630 + }, + { + "epoch": 0.494119529562365, + "grad_norm": 8.835324031027753, + "learning_rate": 1.1939031250720494e-06, + "loss": 0.0016, + "step": 4632 + }, + { + "epoch": 0.49433287996373043, + "grad_norm": 6.4077970331717795, + "learning_rate": 1.1931724010472132e-06, + "loss": -0.7412, + "step": 4634 + }, + { + "epoch": 0.4945462303650959, + "grad_norm": 11.734586019400133, + "learning_rate": 1.192441569862637e-06, + "loss": -1.8694, + "step": 4636 + }, + { + "epoch": 0.4947595807664613, + "grad_norm": 5.751180980492697, + "learning_rate": 1.1917106319237384e-06, + "loss": -0.2429, + "step": 4638 + }, + { + "epoch": 0.49497293116782676, + "grad_norm": 35.53276140277989, + "learning_rate": 1.1909795876359962e-06, + "loss": -0.5023, + "step": 4640 + }, + { + "epoch": 0.4951862815691922, + "grad_norm": 6.852687460144838, + "learning_rate": 1.1902484374049469e-06, + "loss": -0.3091, + "step": 4642 + }, + { + "epoch": 0.49539963197055764, + "grad_norm": 7.159117247102893, + "learning_rate": 1.1895171816361859e-06, + "loss": -0.5061, + "step": 4644 + }, + { + "epoch": 0.4956129823719231, + "grad_norm": 20.64712143168135, + "learning_rate": 1.1887858207353675e-06, + "loss": -0.2522, + "step": 4646 + }, + { + "epoch": 0.4958263327732885, + "grad_norm": 9.160943801222633, + "learning_rate": 1.188054355108204e-06, + "loss": -1.1135, + "step": 4648 + }, + { + "epoch": 0.49603968317465397, + "grad_norm": 6.578662547735444, + "learning_rate": 1.1873227851604664e-06, + "loss": -1.1455, + "step": 4650 + }, + { + "epoch": 0.4962530335760194, + "grad_norm": 14.766783552459229, + "learning_rate": 1.1865911112979822e-06, + "loss": -1.0374, + "step": 4652 + }, + { + "epoch": 0.49646638397738485, + "grad_norm": 9.195883253009201, + "learning_rate": 1.185859333926638e-06, + "loss": 0.147, + "step": 4654 + }, + { + "epoch": 0.4966797343787503, + "grad_norm": 13.434952486380208, + "learning_rate": 1.1851274534523773e-06, + "loss": -0.1566, + "step": 4656 + }, + { + "epoch": 0.49689308478011573, + "grad_norm": 11.03819741593441, + "learning_rate": 1.1843954702812007e-06, + "loss": 0.1271, + "step": 4658 + }, + { + "epoch": 0.4971064351814812, + "grad_norm": 12.941047432486323, + "learning_rate": 1.1836633848191657e-06, + "loss": -0.5143, + "step": 4660 + }, + { + "epoch": 0.4973197855828466, + "grad_norm": 19.40993500703773, + "learning_rate": 1.1829311974723866e-06, + "loss": -1.1364, + "step": 4662 + }, + { + "epoch": 0.49753313598421206, + "grad_norm": 5.192855862134528, + "learning_rate": 1.1821989086470349e-06, + "loss": -0.5752, + "step": 4664 + }, + { + "epoch": 0.4977464863855775, + "grad_norm": 4.678098207975314, + "learning_rate": 1.1814665187493367e-06, + "loss": -0.607, + "step": 4666 + }, + { + "epoch": 0.49795983678694294, + "grad_norm": 6.074770821423067, + "learning_rate": 1.1807340281855762e-06, + "loss": 0.2156, + "step": 4668 + }, + { + "epoch": 0.4981731871883084, + "grad_norm": 13.967286676852519, + "learning_rate": 1.1800014373620921e-06, + "loss": 0.321, + "step": 4670 + }, + { + "epoch": 0.49838653758967383, + "grad_norm": 8.134778801310159, + "learning_rate": 1.179268746685279e-06, + "loss": -1.1076, + "step": 4672 + }, + { + "epoch": 0.49859988799103927, + "grad_norm": 4.502911926829281, + "learning_rate": 1.1785359565615877e-06, + "loss": -0.3766, + "step": 4674 + }, + { + "epoch": 0.4988132383924047, + "grad_norm": 4.80294900750161, + "learning_rate": 1.1778030673975225e-06, + "loss": -0.6306, + "step": 4676 + }, + { + "epoch": 0.49902658879377015, + "grad_norm": 7.097755796682588, + "learning_rate": 1.1770700795996444e-06, + "loss": -0.9219, + "step": 4678 + }, + { + "epoch": 0.4992399391951356, + "grad_norm": 8.84087886503126, + "learning_rate": 1.1763369935745675e-06, + "loss": -0.4515, + "step": 4680 + }, + { + "epoch": 0.49945328959650104, + "grad_norm": 4.63782152739515, + "learning_rate": 1.1756038097289622e-06, + "loss": 0.3262, + "step": 4682 + }, + { + "epoch": 0.4996666399978665, + "grad_norm": 9.598062627728716, + "learning_rate": 1.1748705284695514e-06, + "loss": -0.0451, + "step": 4684 + }, + { + "epoch": 0.4998799903992319, + "grad_norm": 12.578719599837715, + "learning_rate": 1.1741371502031132e-06, + "loss": 0.0603, + "step": 4686 + }, + { + "epoch": 0.5000933408005974, + "grad_norm": 4.195133692486686, + "learning_rate": 1.1734036753364791e-06, + "loss": 0.0556, + "step": 4688 + }, + { + "epoch": 0.5003066912019628, + "grad_norm": 5.264943177523864, + "learning_rate": 1.1726701042765342e-06, + "loss": -1.2503, + "step": 4690 + }, + { + "epoch": 0.5005200416033283, + "grad_norm": 14.24882026837529, + "learning_rate": 1.171936437430217e-06, + "loss": 0.0333, + "step": 4692 + }, + { + "epoch": 0.5007333920046937, + "grad_norm": 21.134050837376257, + "learning_rate": 1.1712026752045188e-06, + "loss": -0.9765, + "step": 4694 + }, + { + "epoch": 0.5009467424060592, + "grad_norm": 6.835990867230146, + "learning_rate": 1.1704688180064847e-06, + "loss": -0.2473, + "step": 4696 + }, + { + "epoch": 0.5011600928074246, + "grad_norm": 6.889772333886117, + "learning_rate": 1.1697348662432116e-06, + "loss": -0.4979, + "step": 4698 + }, + { + "epoch": 0.5013734432087901, + "grad_norm": 2.7318348974051507, + "learning_rate": 1.1690008203218492e-06, + "loss": -0.1562, + "step": 4700 + }, + { + "epoch": 0.5015867936101555, + "grad_norm": 23.38407048984099, + "learning_rate": 1.1682666806495992e-06, + "loss": 0.334, + "step": 4702 + }, + { + "epoch": 0.501800144011521, + "grad_norm": 3.307401438610587, + "learning_rate": 1.167532447633716e-06, + "loss": -0.2977, + "step": 4704 + }, + { + "epoch": 0.5020134944128863, + "grad_norm": 6.38139429491155, + "learning_rate": 1.1667981216815048e-06, + "loss": 0.4215, + "step": 4706 + }, + { + "epoch": 0.5022268448142518, + "grad_norm": 8.975676473301844, + "learning_rate": 1.1660637032003232e-06, + "loss": -0.6435, + "step": 4708 + }, + { + "epoch": 0.5024401952156172, + "grad_norm": 8.718847483507833, + "learning_rate": 1.1653291925975795e-06, + "loss": -0.3361, + "step": 4710 + }, + { + "epoch": 0.5026535456169827, + "grad_norm": 5.5592645913057, + "learning_rate": 1.164594590280734e-06, + "loss": -0.572, + "step": 4712 + }, + { + "epoch": 0.5028668960183481, + "grad_norm": 10.451638139494758, + "learning_rate": 1.1638598966572963e-06, + "loss": 0.2284, + "step": 4714 + }, + { + "epoch": 0.5030802464197136, + "grad_norm": 10.973871340147893, + "learning_rate": 1.1631251121348286e-06, + "loss": -1.1373, + "step": 4716 + }, + { + "epoch": 0.503293596821079, + "grad_norm": 14.20785637798361, + "learning_rate": 1.1623902371209418e-06, + "loss": -0.3594, + "step": 4718 + }, + { + "epoch": 0.5035069472224445, + "grad_norm": 8.274883533538636, + "learning_rate": 1.1616552720232976e-06, + "loss": -0.5371, + "step": 4720 + }, + { + "epoch": 0.5037202976238099, + "grad_norm": 14.95296024795198, + "learning_rate": 1.1609202172496087e-06, + "loss": -0.4901, + "step": 4722 + }, + { + "epoch": 0.5039336480251754, + "grad_norm": 13.275956600353139, + "learning_rate": 1.160185073207636e-06, + "loss": -0.3204, + "step": 4724 + }, + { + "epoch": 0.5041469984265408, + "grad_norm": 8.302126110298243, + "learning_rate": 1.1594498403051907e-06, + "loss": -0.8251, + "step": 4726 + }, + { + "epoch": 0.5043603488279063, + "grad_norm": 25.27919195728371, + "learning_rate": 1.1587145189501333e-06, + "loss": 0.5335, + "step": 4728 + }, + { + "epoch": 0.5045736992292716, + "grad_norm": 6.999233386442993, + "learning_rate": 1.1579791095503733e-06, + "loss": -0.9445, + "step": 4730 + }, + { + "epoch": 0.5047870496306371, + "grad_norm": 10.228291086309364, + "learning_rate": 1.1572436125138683e-06, + "loss": -0.5328, + "step": 4732 + }, + { + "epoch": 0.5050004000320025, + "grad_norm": 7.436682757418147, + "learning_rate": 1.156508028248626e-06, + "loss": 0.2427, + "step": 4734 + }, + { + "epoch": 0.505213750433368, + "grad_norm": 8.710985197035614, + "learning_rate": 1.1557723571627015e-06, + "loss": 0.3087, + "step": 4736 + }, + { + "epoch": 0.5054271008347334, + "grad_norm": 24.79048299836852, + "learning_rate": 1.1550365996641978e-06, + "loss": 0.2125, + "step": 4738 + }, + { + "epoch": 0.5056404512360989, + "grad_norm": 7.103859829079952, + "learning_rate": 1.154300756161267e-06, + "loss": -0.4293, + "step": 4740 + }, + { + "epoch": 0.5058538016374643, + "grad_norm": 4.642771161700045, + "learning_rate": 1.1535648270621075e-06, + "loss": -0.0717, + "step": 4742 + }, + { + "epoch": 0.5060671520388298, + "grad_norm": 7.976131523538036, + "learning_rate": 1.1528288127749664e-06, + "loss": -0.2725, + "step": 4744 + }, + { + "epoch": 0.5062805024401952, + "grad_norm": 6.5098352114477676, + "learning_rate": 1.1520927137081374e-06, + "loss": -0.4244, + "step": 4746 + }, + { + "epoch": 0.5064938528415607, + "grad_norm": 7.1088713569982644, + "learning_rate": 1.1513565302699606e-06, + "loss": -0.2204, + "step": 4748 + }, + { + "epoch": 0.5067072032429261, + "grad_norm": 10.329220204561405, + "learning_rate": 1.150620262868825e-06, + "loss": 0.1121, + "step": 4750 + }, + { + "epoch": 0.5069205536442916, + "grad_norm": 3.1438253855371463, + "learning_rate": 1.149883911913164e-06, + "loss": -0.4502, + "step": 4752 + }, + { + "epoch": 0.507133904045657, + "grad_norm": 10.055330269042036, + "learning_rate": 1.1491474778114587e-06, + "loss": -0.8719, + "step": 4754 + }, + { + "epoch": 0.5073472544470224, + "grad_norm": 8.472112922046271, + "learning_rate": 1.1484109609722353e-06, + "loss": 0.2336, + "step": 4756 + }, + { + "epoch": 0.5075606048483878, + "grad_norm": 13.02323018977036, + "learning_rate": 1.1476743618040665e-06, + "loss": -0.8607, + "step": 4758 + }, + { + "epoch": 0.5077739552497533, + "grad_norm": 7.125861806466218, + "learning_rate": 1.1469376807155705e-06, + "loss": -0.334, + "step": 4760 + }, + { + "epoch": 0.5079873056511187, + "grad_norm": 3.380760371844327, + "learning_rate": 1.1462009181154115e-06, + "loss": -0.3745, + "step": 4762 + }, + { + "epoch": 0.5082006560524842, + "grad_norm": 6.20819000362366, + "learning_rate": 1.1454640744122986e-06, + "loss": -1.1264, + "step": 4764 + }, + { + "epoch": 0.5084140064538496, + "grad_norm": 10.337655691646857, + "learning_rate": 1.1447271500149847e-06, + "loss": -0.5596, + "step": 4766 + }, + { + "epoch": 0.5086273568552151, + "grad_norm": 5.893705183608253, + "learning_rate": 1.1439901453322695e-06, + "loss": -0.315, + "step": 4768 + }, + { + "epoch": 0.5088407072565805, + "grad_norm": 10.118838171370284, + "learning_rate": 1.1432530607729956e-06, + "loss": -0.4918, + "step": 4770 + }, + { + "epoch": 0.509054057657946, + "grad_norm": 10.263903814473828, + "learning_rate": 1.1425158967460509e-06, + "loss": -1.0981, + "step": 4772 + }, + { + "epoch": 0.5092674080593114, + "grad_norm": 10.926726440189602, + "learning_rate": 1.1417786536603668e-06, + "loss": -0.4391, + "step": 4774 + }, + { + "epoch": 0.5094807584606769, + "grad_norm": 10.25573281532261, + "learning_rate": 1.1410413319249192e-06, + "loss": 0.7525, + "step": 4776 + }, + { + "epoch": 0.5096941088620422, + "grad_norm": 5.744638730017928, + "learning_rate": 1.1403039319487272e-06, + "loss": -0.1489, + "step": 4778 + }, + { + "epoch": 0.5099074592634077, + "grad_norm": 3.9744973077963177, + "learning_rate": 1.1395664541408524e-06, + "loss": -0.348, + "step": 4780 + }, + { + "epoch": 0.5101208096647731, + "grad_norm": 6.487285241629583, + "learning_rate": 1.1388288989104018e-06, + "loss": -1.1607, + "step": 4782 + }, + { + "epoch": 0.5103341600661386, + "grad_norm": 3.1551896232406107, + "learning_rate": 1.1380912666665233e-06, + "loss": 0.4837, + "step": 4784 + }, + { + "epoch": 0.510547510467504, + "grad_norm": 6.457161072812085, + "learning_rate": 1.1373535578184082e-06, + "loss": 0.1604, + "step": 4786 + }, + { + "epoch": 0.5107608608688695, + "grad_norm": 28.53947276482364, + "learning_rate": 1.1366157727752908e-06, + "loss": -0.2717, + "step": 4788 + }, + { + "epoch": 0.5109742112702349, + "grad_norm": 11.481653940261118, + "learning_rate": 1.1358779119464465e-06, + "loss": -1.5289, + "step": 4790 + }, + { + "epoch": 0.5111875616716004, + "grad_norm": 17.33256955490071, + "learning_rate": 1.1351399757411947e-06, + "loss": -0.7712, + "step": 4792 + }, + { + "epoch": 0.5114009120729658, + "grad_norm": 9.845150465927766, + "learning_rate": 1.1344019645688942e-06, + "loss": -0.09, + "step": 4794 + }, + { + "epoch": 0.5116142624743313, + "grad_norm": 7.89371304400198, + "learning_rate": 1.1336638788389473e-06, + "loss": -0.3798, + "step": 4796 + }, + { + "epoch": 0.5118276128756967, + "grad_norm": 8.216646237193517, + "learning_rate": 1.1329257189607968e-06, + "loss": -0.1694, + "step": 4798 + }, + { + "epoch": 0.5120409632770622, + "grad_norm": 12.986971606690473, + "learning_rate": 1.1321874853439265e-06, + "loss": -1.3227, + "step": 4800 + } + ], + "logging_steps": 2, + "max_steps": 9374, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 410275052716032.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}