diff --git "a/ARM-7B-Pruned/checkpoint-2400/trainer_state.json" "b/ARM-7B-Pruned/checkpoint-2400/trainer_state.json" new file mode 100644--- /dev/null +++ "b/ARM-7B-Pruned/checkpoint-2400/trainer_state.json" @@ -0,0 +1,8433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2560204816385311, + "eval_steps": 50000, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021335040136544257, + "grad_norm": 42.684086970967954, + "learning_rate": 4.264392324093816e-09, + "loss": 0.5986, + "step": 2 + }, + { + "epoch": 0.00042670080273088514, + "grad_norm": 42.537918257537875, + "learning_rate": 8.528784648187632e-09, + "loss": 0.6957, + "step": 4 + }, + { + "epoch": 0.0006400512040963277, + "grad_norm": 58.765999906046574, + "learning_rate": 1.2793176972281448e-08, + "loss": 0.047, + "step": 6 + }, + { + "epoch": 0.0008534016054617703, + "grad_norm": 154.70890416675192, + "learning_rate": 1.7057569296375264e-08, + "loss": -0.6682, + "step": 8 + }, + { + "epoch": 0.0010667520068272129, + "grad_norm": 315.97286091458943, + "learning_rate": 2.1321961620469082e-08, + "loss": 0.0971, + "step": 10 + }, + { + "epoch": 0.0012801024081926554, + "grad_norm": 102.37154058019046, + "learning_rate": 2.5586353944562897e-08, + "loss": 1.0541, + "step": 12 + }, + { + "epoch": 0.001493452809558098, + "grad_norm": 105.127529829733, + "learning_rate": 2.9850746268656714e-08, + "loss": 0.4648, + "step": 14 + }, + { + "epoch": 0.0017068032109235406, + "grad_norm": 48.013196159706226, + "learning_rate": 3.411513859275053e-08, + "loss": -0.4529, + "step": 16 + }, + { + "epoch": 0.0019201536122889832, + "grad_norm": 55.69084727420936, + "learning_rate": 3.837953091684435e-08, + "loss": -0.1927, + "step": 18 + }, + { + "epoch": 0.0021335040136544257, + "grad_norm": 33.25875332287273, + "learning_rate": 4.2643923240938164e-08, + "loss": 0.2517, + "step": 20 + }, + { + "epoch": 0.002346854415019868, + "grad_norm": 245.94604658609376, + "learning_rate": 4.6908315565031985e-08, + "loss": -0.8646, + "step": 22 + }, + { + "epoch": 0.002560204816385311, + "grad_norm": 33.35694659794871, + "learning_rate": 5.117270788912579e-08, + "loss": 0.7383, + "step": 24 + }, + { + "epoch": 0.0027735552177507532, + "grad_norm": 193.28196705366508, + "learning_rate": 5.5437100213219614e-08, + "loss": -0.9655, + "step": 26 + }, + { + "epoch": 0.002986905619116196, + "grad_norm": 50.147398314622684, + "learning_rate": 5.970149253731343e-08, + "loss": -0.5218, + "step": 28 + }, + { + "epoch": 0.0032002560204816384, + "grad_norm": 75.21413198410397, + "learning_rate": 6.396588486140725e-08, + "loss": 0.1629, + "step": 30 + }, + { + "epoch": 0.003413606421847081, + "grad_norm": 103.01315028563378, + "learning_rate": 6.823027718550106e-08, + "loss": -0.7119, + "step": 32 + }, + { + "epoch": 0.0036269568232125235, + "grad_norm": 36.63545743331723, + "learning_rate": 7.249466950959488e-08, + "loss": 0.3538, + "step": 34 + }, + { + "epoch": 0.0038403072245779663, + "grad_norm": 39.002709650329486, + "learning_rate": 7.67590618336887e-08, + "loss": -0.6754, + "step": 36 + }, + { + "epoch": 0.004053657625943409, + "grad_norm": 23.502647009611, + "learning_rate": 8.102345415778252e-08, + "loss": 0.1865, + "step": 38 + }, + { + "epoch": 0.0042670080273088514, + "grad_norm": 26.139670413986497, + "learning_rate": 8.528784648187633e-08, + "loss": -0.3739, + "step": 40 + }, + { + "epoch": 0.004480358428674294, + "grad_norm": 30.02660527440867, + "learning_rate": 8.955223880597014e-08, + "loss": -0.0359, + "step": 42 + }, + { + "epoch": 0.004693708830039736, + "grad_norm": 13.980962042551367, + "learning_rate": 9.381663113006397e-08, + "loss": 0.4169, + "step": 44 + }, + { + "epoch": 0.004907059231405179, + "grad_norm": 94.42191623111091, + "learning_rate": 9.808102345415778e-08, + "loss": -1.4962, + "step": 46 + }, + { + "epoch": 0.005120409632770622, + "grad_norm": 36.146349198898875, + "learning_rate": 1.0234541577825159e-07, + "loss": 0.8652, + "step": 48 + }, + { + "epoch": 0.0053337600341360645, + "grad_norm": 34.112375662585286, + "learning_rate": 1.0660980810234541e-07, + "loss": 0.4556, + "step": 50 + }, + { + "epoch": 0.0055471104355015064, + "grad_norm": 54.41510979168854, + "learning_rate": 1.1087420042643923e-07, + "loss": 0.2452, + "step": 52 + }, + { + "epoch": 0.005760460836866949, + "grad_norm": 27.31961709504184, + "learning_rate": 1.1513859275053305e-07, + "loss": 0.3131, + "step": 54 + }, + { + "epoch": 0.005973811238232392, + "grad_norm": 77.00896363994629, + "learning_rate": 1.1940298507462686e-07, + "loss": -0.0289, + "step": 56 + }, + { + "epoch": 0.006187161639597835, + "grad_norm": 21.8781170195382, + "learning_rate": 1.2366737739872068e-07, + "loss": -0.6188, + "step": 58 + }, + { + "epoch": 0.006400512040963277, + "grad_norm": 16.368843761461356, + "learning_rate": 1.279317697228145e-07, + "loss": 0.2618, + "step": 60 + }, + { + "epoch": 0.0066138624423287195, + "grad_norm": 17.051752922162727, + "learning_rate": 1.3219616204690832e-07, + "loss": -0.1717, + "step": 62 + }, + { + "epoch": 0.006827212843694162, + "grad_norm": 99.13064855085295, + "learning_rate": 1.3646055437100212e-07, + "loss": -0.449, + "step": 64 + }, + { + "epoch": 0.007040563245059605, + "grad_norm": 136.3486847826033, + "learning_rate": 1.4072494669509594e-07, + "loss": 0.2984, + "step": 66 + }, + { + "epoch": 0.007253913646425047, + "grad_norm": 62.554463250965604, + "learning_rate": 1.4498933901918976e-07, + "loss": -0.2482, + "step": 68 + }, + { + "epoch": 0.00746726404779049, + "grad_norm": 51.515904581548206, + "learning_rate": 1.4925373134328355e-07, + "loss": -0.8223, + "step": 70 + }, + { + "epoch": 0.007680614449155933, + "grad_norm": 113.86732871533368, + "learning_rate": 1.535181236673774e-07, + "loss": -0.0383, + "step": 72 + }, + { + "epoch": 0.007893964850521375, + "grad_norm": 26.087446116741575, + "learning_rate": 1.5778251599147122e-07, + "loss": 0.7253, + "step": 74 + }, + { + "epoch": 0.008107315251886817, + "grad_norm": 24.131096654425452, + "learning_rate": 1.6204690831556504e-07, + "loss": -0.0834, + "step": 76 + }, + { + "epoch": 0.008320665653252261, + "grad_norm": 74.42536774731673, + "learning_rate": 1.6631130063965884e-07, + "loss": -0.2997, + "step": 78 + }, + { + "epoch": 0.008534016054617703, + "grad_norm": 71.52938681964571, + "learning_rate": 1.7057569296375266e-07, + "loss": 0.0944, + "step": 80 + }, + { + "epoch": 0.008747366455983145, + "grad_norm": 29.241243114282817, + "learning_rate": 1.7484008528784648e-07, + "loss": -0.7054, + "step": 82 + }, + { + "epoch": 0.008960716857348588, + "grad_norm": 134.58638764428474, + "learning_rate": 1.7910447761194027e-07, + "loss": -0.338, + "step": 84 + }, + { + "epoch": 0.00917406725871403, + "grad_norm": 109.66885361053902, + "learning_rate": 1.8336886993603412e-07, + "loss": -0.1817, + "step": 86 + }, + { + "epoch": 0.009387417660079472, + "grad_norm": 27.969535103995202, + "learning_rate": 1.8763326226012794e-07, + "loss": -0.9258, + "step": 88 + }, + { + "epoch": 0.009600768061444916, + "grad_norm": 27.49676445944368, + "learning_rate": 1.9189765458422174e-07, + "loss": 0.3309, + "step": 90 + }, + { + "epoch": 0.009814118462810358, + "grad_norm": 25.544238337228855, + "learning_rate": 1.9616204690831556e-07, + "loss": -0.0894, + "step": 92 + }, + { + "epoch": 0.010027468864175802, + "grad_norm": 26.09909186001115, + "learning_rate": 2.0042643923240938e-07, + "loss": -0.1209, + "step": 94 + }, + { + "epoch": 0.010240819265541243, + "grad_norm": 32.632466828944544, + "learning_rate": 2.0469083155650317e-07, + "loss": 0.4736, + "step": 96 + }, + { + "epoch": 0.010454169666906685, + "grad_norm": 50.68481961380247, + "learning_rate": 2.08955223880597e-07, + "loss": -0.6321, + "step": 98 + }, + { + "epoch": 0.010667520068272129, + "grad_norm": 25.00088670078768, + "learning_rate": 2.1321961620469082e-07, + "loss": -0.3787, + "step": 100 + }, + { + "epoch": 0.010880870469637571, + "grad_norm": 21.25939551515176, + "learning_rate": 2.1748400852878466e-07, + "loss": 0.1969, + "step": 102 + }, + { + "epoch": 0.011094220871003013, + "grad_norm": 23.873098687717384, + "learning_rate": 2.2174840085287846e-07, + "loss": -0.0885, + "step": 104 + }, + { + "epoch": 0.011307571272368457, + "grad_norm": 17.01811754764777, + "learning_rate": 2.2601279317697228e-07, + "loss": 0.0203, + "step": 106 + }, + { + "epoch": 0.011520921673733898, + "grad_norm": 31.009663505522784, + "learning_rate": 2.302771855010661e-07, + "loss": -0.817, + "step": 108 + }, + { + "epoch": 0.011734272075099342, + "grad_norm": 33.200887615423774, + "learning_rate": 2.345415778251599e-07, + "loss": 0.0471, + "step": 110 + }, + { + "epoch": 0.011947622476464784, + "grad_norm": 32.63042973424626, + "learning_rate": 2.388059701492537e-07, + "loss": -1.5996, + "step": 112 + }, + { + "epoch": 0.012160972877830226, + "grad_norm": 30.22918533639733, + "learning_rate": 2.4307036247334754e-07, + "loss": -0.2466, + "step": 114 + }, + { + "epoch": 0.01237432327919567, + "grad_norm": 27.145400816572458, + "learning_rate": 2.4733475479744136e-07, + "loss": -0.3294, + "step": 116 + }, + { + "epoch": 0.012587673680561112, + "grad_norm": 19.75203303790041, + "learning_rate": 2.515991471215352e-07, + "loss": -0.7427, + "step": 118 + }, + { + "epoch": 0.012801024081926553, + "grad_norm": 16.441388594403527, + "learning_rate": 2.55863539445629e-07, + "loss": -0.2637, + "step": 120 + }, + { + "epoch": 0.013014374483291997, + "grad_norm": 23.81603976262189, + "learning_rate": 2.601279317697228e-07, + "loss": -0.07, + "step": 122 + }, + { + "epoch": 0.013227724884657439, + "grad_norm": 80.42145460011395, + "learning_rate": 2.6439232409381664e-07, + "loss": -0.1647, + "step": 124 + }, + { + "epoch": 0.013441075286022881, + "grad_norm": 28.328954293896295, + "learning_rate": 2.686567164179104e-07, + "loss": -0.1786, + "step": 126 + }, + { + "epoch": 0.013654425687388325, + "grad_norm": 44.29342497352258, + "learning_rate": 2.7292110874200423e-07, + "loss": 0.6337, + "step": 128 + }, + { + "epoch": 0.013867776088753767, + "grad_norm": 14.247744370026695, + "learning_rate": 2.7718550106609805e-07, + "loss": -0.7468, + "step": 130 + }, + { + "epoch": 0.01408112649011921, + "grad_norm": 30.909935252567514, + "learning_rate": 2.8144989339019187e-07, + "loss": 0.2444, + "step": 132 + }, + { + "epoch": 0.014294476891484652, + "grad_norm": 19.951382058960338, + "learning_rate": 2.857142857142857e-07, + "loss": -0.0888, + "step": 134 + }, + { + "epoch": 0.014507827292850094, + "grad_norm": 19.451243521855947, + "learning_rate": 2.899786780383795e-07, + "loss": 0.0622, + "step": 136 + }, + { + "epoch": 0.014721177694215538, + "grad_norm": 44.543818203511506, + "learning_rate": 2.9424307036247334e-07, + "loss": -0.5024, + "step": 138 + }, + { + "epoch": 0.01493452809558098, + "grad_norm": 33.99708212815377, + "learning_rate": 2.985074626865671e-07, + "loss": 0.3096, + "step": 140 + }, + { + "epoch": 0.015147878496946422, + "grad_norm": 23.05447566893894, + "learning_rate": 3.02771855010661e-07, + "loss": 0.3329, + "step": 142 + }, + { + "epoch": 0.015361228898311865, + "grad_norm": 16.88015370164898, + "learning_rate": 3.070362473347548e-07, + "loss": -0.1371, + "step": 144 + }, + { + "epoch": 0.015574579299677307, + "grad_norm": 26.13573562942401, + "learning_rate": 3.113006396588486e-07, + "loss": -0.4183, + "step": 146 + }, + { + "epoch": 0.01578792970104275, + "grad_norm": 22.309803740611596, + "learning_rate": 3.1556503198294244e-07, + "loss": -0.3246, + "step": 148 + }, + { + "epoch": 0.016001280102408193, + "grad_norm": 22.52244155087573, + "learning_rate": 3.1982942430703626e-07, + "loss": -0.5349, + "step": 150 + }, + { + "epoch": 0.016214630503773635, + "grad_norm": 18.473624086858514, + "learning_rate": 3.240938166311301e-07, + "loss": 0.2352, + "step": 152 + }, + { + "epoch": 0.016427980905139077, + "grad_norm": 7.7748886163241355, + "learning_rate": 3.2835820895522385e-07, + "loss": 0.2151, + "step": 154 + }, + { + "epoch": 0.016641331306504522, + "grad_norm": 10.802467734897878, + "learning_rate": 3.3262260127931767e-07, + "loss": 0.2848, + "step": 156 + }, + { + "epoch": 0.016854681707869964, + "grad_norm": 18.162628483420374, + "learning_rate": 3.368869936034115e-07, + "loss": -0.168, + "step": 158 + }, + { + "epoch": 0.017068032109235406, + "grad_norm": 40.528567461071695, + "learning_rate": 3.411513859275053e-07, + "loss": -0.4126, + "step": 160 + }, + { + "epoch": 0.017281382510600848, + "grad_norm": 35.36550572570942, + "learning_rate": 3.4541577825159914e-07, + "loss": 0.0287, + "step": 162 + }, + { + "epoch": 0.01749473291196629, + "grad_norm": 10.892875403462355, + "learning_rate": 3.4968017057569296e-07, + "loss": -0.613, + "step": 164 + }, + { + "epoch": 0.01770808331333173, + "grad_norm": 27.141686915753116, + "learning_rate": 3.539445628997867e-07, + "loss": -0.9305, + "step": 166 + }, + { + "epoch": 0.017921433714697177, + "grad_norm": 37.37206055456951, + "learning_rate": 3.5820895522388055e-07, + "loss": -0.3178, + "step": 168 + }, + { + "epoch": 0.01813478411606262, + "grad_norm": 52.89469341474386, + "learning_rate": 3.6247334754797437e-07, + "loss": -0.6419, + "step": 170 + }, + { + "epoch": 0.01834813451742806, + "grad_norm": 23.215765487324806, + "learning_rate": 3.6673773987206824e-07, + "loss": -0.0533, + "step": 172 + }, + { + "epoch": 0.018561484918793503, + "grad_norm": 21.171847082083467, + "learning_rate": 3.7100213219616206e-07, + "loss": -1.0652, + "step": 174 + }, + { + "epoch": 0.018774835320158945, + "grad_norm": 17.335722663927694, + "learning_rate": 3.752665245202559e-07, + "loss": -0.169, + "step": 176 + }, + { + "epoch": 0.01898818572152439, + "grad_norm": 13.15039858318736, + "learning_rate": 3.795309168443497e-07, + "loss": 0.4414, + "step": 178 + }, + { + "epoch": 0.019201536122889832, + "grad_norm": 9.293181092571949, + "learning_rate": 3.8379530916844347e-07, + "loss": 0.1714, + "step": 180 + }, + { + "epoch": 0.019414886524255274, + "grad_norm": 27.253514499053658, + "learning_rate": 3.880597014925373e-07, + "loss": -0.3719, + "step": 182 + }, + { + "epoch": 0.019628236925620716, + "grad_norm": 11.179212679495947, + "learning_rate": 3.923240938166311e-07, + "loss": -1.5775, + "step": 184 + }, + { + "epoch": 0.019841587326986158, + "grad_norm": 28.002238164167814, + "learning_rate": 3.9658848614072494e-07, + "loss": -0.9116, + "step": 186 + }, + { + "epoch": 0.020054937728351603, + "grad_norm": 20.304811169854457, + "learning_rate": 4.0085287846481876e-07, + "loss": -0.6315, + "step": 188 + }, + { + "epoch": 0.020268288129717045, + "grad_norm": 43.26972545321108, + "learning_rate": 4.051172707889126e-07, + "loss": 0.0818, + "step": 190 + }, + { + "epoch": 0.020481638531082487, + "grad_norm": 23.280256775268967, + "learning_rate": 4.0938166311300635e-07, + "loss": -0.2754, + "step": 192 + }, + { + "epoch": 0.02069498893244793, + "grad_norm": 25.086595962784372, + "learning_rate": 4.1364605543710017e-07, + "loss": 0.1783, + "step": 194 + }, + { + "epoch": 0.02090833933381337, + "grad_norm": 17.929854339567687, + "learning_rate": 4.17910447761194e-07, + "loss": -0.3967, + "step": 196 + }, + { + "epoch": 0.021121689735178813, + "grad_norm": 26.684280905377257, + "learning_rate": 4.221748400852878e-07, + "loss": -0.2432, + "step": 198 + }, + { + "epoch": 0.021335040136544258, + "grad_norm": 20.811875320249584, + "learning_rate": 4.2643923240938163e-07, + "loss": -0.6011, + "step": 200 + }, + { + "epoch": 0.0215483905379097, + "grad_norm": 28.776367397287462, + "learning_rate": 4.3070362473347545e-07, + "loss": -1.6853, + "step": 202 + }, + { + "epoch": 0.021761740939275142, + "grad_norm": 22.23424675819631, + "learning_rate": 4.349680170575693e-07, + "loss": 0.2222, + "step": 204 + }, + { + "epoch": 0.021975091340640584, + "grad_norm": 15.408003573608147, + "learning_rate": 4.392324093816631e-07, + "loss": -0.0164, + "step": 206 + }, + { + "epoch": 0.022188441742006026, + "grad_norm": 8.60164597442568, + "learning_rate": 4.434968017057569e-07, + "loss": -0.5671, + "step": 208 + }, + { + "epoch": 0.02240179214337147, + "grad_norm": 10.343966416597475, + "learning_rate": 4.4776119402985074e-07, + "loss": -0.815, + "step": 210 + }, + { + "epoch": 0.022615142544736913, + "grad_norm": 15.741330211675757, + "learning_rate": 4.5202558635394456e-07, + "loss": -1.0932, + "step": 212 + }, + { + "epoch": 0.022828492946102355, + "grad_norm": 16.39054521840376, + "learning_rate": 4.562899786780384e-07, + "loss": -1.7177, + "step": 214 + }, + { + "epoch": 0.023041843347467797, + "grad_norm": 11.781875373813582, + "learning_rate": 4.605543710021322e-07, + "loss": 0.3606, + "step": 216 + }, + { + "epoch": 0.02325519374883324, + "grad_norm": 10.097449683599512, + "learning_rate": 4.64818763326226e-07, + "loss": -0.7045, + "step": 218 + }, + { + "epoch": 0.023468544150198684, + "grad_norm": 15.715275415653137, + "learning_rate": 4.690831556503198e-07, + "loss": -0.0658, + "step": 220 + }, + { + "epoch": 0.023681894551564126, + "grad_norm": 26.933594441832767, + "learning_rate": 4.733475479744136e-07, + "loss": -0.1298, + "step": 222 + }, + { + "epoch": 0.023895244952929568, + "grad_norm": 8.712912536861461, + "learning_rate": 4.776119402985074e-07, + "loss": 0.095, + "step": 224 + }, + { + "epoch": 0.02410859535429501, + "grad_norm": 21.120479987680937, + "learning_rate": 4.818763326226012e-07, + "loss": -0.0818, + "step": 226 + }, + { + "epoch": 0.024321945755660452, + "grad_norm": 10.071445207074044, + "learning_rate": 4.861407249466951e-07, + "loss": -0.1419, + "step": 228 + }, + { + "epoch": 0.024535296157025894, + "grad_norm": 10.801668877306875, + "learning_rate": 4.904051172707888e-07, + "loss": -0.3432, + "step": 230 + }, + { + "epoch": 0.02474864655839134, + "grad_norm": 13.692305863205236, + "learning_rate": 4.946695095948827e-07, + "loss": -0.1519, + "step": 232 + }, + { + "epoch": 0.02496199695975678, + "grad_norm": 22.11353305341118, + "learning_rate": 4.989339019189765e-07, + "loss": -0.0445, + "step": 234 + }, + { + "epoch": 0.025175347361122223, + "grad_norm": 25.883156385523233, + "learning_rate": 5.031982942430704e-07, + "loss": -0.4949, + "step": 236 + }, + { + "epoch": 0.025388697762487665, + "grad_norm": 61.080118987133744, + "learning_rate": 5.074626865671642e-07, + "loss": -1.1921, + "step": 238 + }, + { + "epoch": 0.025602048163853107, + "grad_norm": 9.282902236974593, + "learning_rate": 5.11727078891258e-07, + "loss": 0.1989, + "step": 240 + }, + { + "epoch": 0.025815398565218552, + "grad_norm": 9.781179043931736, + "learning_rate": 5.159914712153518e-07, + "loss": 0.9967, + "step": 242 + }, + { + "epoch": 0.026028748966583994, + "grad_norm": 12.216217874660604, + "learning_rate": 5.202558635394456e-07, + "loss": -0.3518, + "step": 244 + }, + { + "epoch": 0.026242099367949436, + "grad_norm": 24.42587736751593, + "learning_rate": 5.245202558635394e-07, + "loss": -1.0814, + "step": 246 + }, + { + "epoch": 0.026455449769314878, + "grad_norm": 38.12352896449889, + "learning_rate": 5.287846481876333e-07, + "loss": 0.0811, + "step": 248 + }, + { + "epoch": 0.02666880017068032, + "grad_norm": 24.52163737825959, + "learning_rate": 5.33049040511727e-07, + "loss": -0.8824, + "step": 250 + }, + { + "epoch": 0.026882150572045762, + "grad_norm": 19.231993611076483, + "learning_rate": 5.373134328358208e-07, + "loss": 0.5571, + "step": 252 + }, + { + "epoch": 0.027095500973411207, + "grad_norm": 17.129505372417224, + "learning_rate": 5.415778251599147e-07, + "loss": -0.6865, + "step": 254 + }, + { + "epoch": 0.02730885137477665, + "grad_norm": 19.406038647540843, + "learning_rate": 5.458422174840085e-07, + "loss": -0.4991, + "step": 256 + }, + { + "epoch": 0.02752220177614209, + "grad_norm": 6.847722005977987, + "learning_rate": 5.501066098081023e-07, + "loss": 0.1979, + "step": 258 + }, + { + "epoch": 0.027735552177507533, + "grad_norm": 13.405288361475698, + "learning_rate": 5.543710021321961e-07, + "loss": -0.3812, + "step": 260 + }, + { + "epoch": 0.027948902578872975, + "grad_norm": 21.529785897303153, + "learning_rate": 5.5863539445629e-07, + "loss": -0.6946, + "step": 262 + }, + { + "epoch": 0.02816225298023842, + "grad_norm": 14.20190258955381, + "learning_rate": 5.628997867803837e-07, + "loss": -0.7182, + "step": 264 + }, + { + "epoch": 0.028375603381603862, + "grad_norm": 20.09792579726273, + "learning_rate": 5.671641791044775e-07, + "loss": -0.2583, + "step": 266 + }, + { + "epoch": 0.028588953782969304, + "grad_norm": 17.213177542585473, + "learning_rate": 5.714285714285714e-07, + "loss": 0.3218, + "step": 268 + }, + { + "epoch": 0.028802304184334746, + "grad_norm": 11.185587231889192, + "learning_rate": 5.756929637526652e-07, + "loss": -0.0663, + "step": 270 + }, + { + "epoch": 0.029015654585700188, + "grad_norm": 27.722334316990437, + "learning_rate": 5.79957356076759e-07, + "loss": 0.1254, + "step": 272 + }, + { + "epoch": 0.029229004987065633, + "grad_norm": 20.49167387185228, + "learning_rate": 5.842217484008528e-07, + "loss": -0.3077, + "step": 274 + }, + { + "epoch": 0.029442355388431075, + "grad_norm": 17.084103781873182, + "learning_rate": 5.884861407249467e-07, + "loss": -1.0397, + "step": 276 + }, + { + "epoch": 0.029655705789796517, + "grad_norm": 23.870809301137577, + "learning_rate": 5.927505330490404e-07, + "loss": -0.401, + "step": 278 + }, + { + "epoch": 0.02986905619116196, + "grad_norm": 11.313699213427757, + "learning_rate": 5.970149253731342e-07, + "loss": -1.1555, + "step": 280 + }, + { + "epoch": 0.0300824065925274, + "grad_norm": 9.331131563295791, + "learning_rate": 6.012793176972282e-07, + "loss": -0.5128, + "step": 282 + }, + { + "epoch": 0.030295756993892843, + "grad_norm": 10.71153796479133, + "learning_rate": 6.05543710021322e-07, + "loss": 0.239, + "step": 284 + }, + { + "epoch": 0.03050910739525829, + "grad_norm": 28.31484795145744, + "learning_rate": 6.098081023454158e-07, + "loss": -0.2084, + "step": 286 + }, + { + "epoch": 0.03072245779662373, + "grad_norm": 11.917796878106765, + "learning_rate": 6.140724946695096e-07, + "loss": -0.2701, + "step": 288 + }, + { + "epoch": 0.030935808197989172, + "grad_norm": 15.31345988143714, + "learning_rate": 6.183368869936035e-07, + "loss": -0.9096, + "step": 290 + }, + { + "epoch": 0.031149158599354614, + "grad_norm": 49.77491988047747, + "learning_rate": 6.226012793176972e-07, + "loss": 0.1465, + "step": 292 + }, + { + "epoch": 0.03136250900072006, + "grad_norm": 12.884205486410131, + "learning_rate": 6.26865671641791e-07, + "loss": 0.5899, + "step": 294 + }, + { + "epoch": 0.0315758594020855, + "grad_norm": 22.77696725291766, + "learning_rate": 6.311300639658849e-07, + "loss": -0.3255, + "step": 296 + }, + { + "epoch": 0.031789209803450943, + "grad_norm": 14.760034845954518, + "learning_rate": 6.353944562899787e-07, + "loss": 0.2191, + "step": 298 + }, + { + "epoch": 0.032002560204816385, + "grad_norm": 16.04873208614678, + "learning_rate": 6.396588486140725e-07, + "loss": 0.9425, + "step": 300 + }, + { + "epoch": 0.03221591060618183, + "grad_norm": 21.948692800323336, + "learning_rate": 6.439232409381663e-07, + "loss": 0.4721, + "step": 302 + }, + { + "epoch": 0.03242926100754727, + "grad_norm": 17.475770533963207, + "learning_rate": 6.481876332622602e-07, + "loss": -0.8516, + "step": 304 + }, + { + "epoch": 0.03264261140891271, + "grad_norm": 21.938554689368157, + "learning_rate": 6.524520255863539e-07, + "loss": 0.1675, + "step": 306 + }, + { + "epoch": 0.03285596181027815, + "grad_norm": 46.95903237109459, + "learning_rate": 6.567164179104477e-07, + "loss": 0.7013, + "step": 308 + }, + { + "epoch": 0.033069312211643595, + "grad_norm": 21.7738424183671, + "learning_rate": 6.609808102345416e-07, + "loss": -0.8641, + "step": 310 + }, + { + "epoch": 0.033282662613009044, + "grad_norm": 20.818736283324778, + "learning_rate": 6.652452025586353e-07, + "loss": -0.2203, + "step": 312 + }, + { + "epoch": 0.033496013014374486, + "grad_norm": 12.191027612540129, + "learning_rate": 6.695095948827292e-07, + "loss": -0.2385, + "step": 314 + }, + { + "epoch": 0.03370936341573993, + "grad_norm": 17.485900290878302, + "learning_rate": 6.73773987206823e-07, + "loss": -0.6482, + "step": 316 + }, + { + "epoch": 0.03392271381710537, + "grad_norm": 12.365985112237825, + "learning_rate": 6.780383795309168e-07, + "loss": -0.0618, + "step": 318 + }, + { + "epoch": 0.03413606421847081, + "grad_norm": 19.889441682727426, + "learning_rate": 6.823027718550106e-07, + "loss": 0.255, + "step": 320 + }, + { + "epoch": 0.034349414619836253, + "grad_norm": 19.5015710993416, + "learning_rate": 6.865671641791044e-07, + "loss": -0.6494, + "step": 322 + }, + { + "epoch": 0.034562765021201695, + "grad_norm": 21.45614945706602, + "learning_rate": 6.908315565031983e-07, + "loss": -0.4695, + "step": 324 + }, + { + "epoch": 0.03477611542256714, + "grad_norm": 11.714892632525226, + "learning_rate": 6.95095948827292e-07, + "loss": -0.6134, + "step": 326 + }, + { + "epoch": 0.03498946582393258, + "grad_norm": 17.08341230963109, + "learning_rate": 6.993603411513859e-07, + "loss": -0.9212, + "step": 328 + }, + { + "epoch": 0.03520281622529802, + "grad_norm": 18.71372293407707, + "learning_rate": 7.036247334754797e-07, + "loss": -0.1783, + "step": 330 + }, + { + "epoch": 0.03541616662666346, + "grad_norm": 40.394000763247476, + "learning_rate": 7.078891257995734e-07, + "loss": -0.1696, + "step": 332 + }, + { + "epoch": 0.03562951702802891, + "grad_norm": 16.851717713795054, + "learning_rate": 7.121535181236673e-07, + "loss": -0.168, + "step": 334 + }, + { + "epoch": 0.035842867429394354, + "grad_norm": 20.980121165459167, + "learning_rate": 7.164179104477611e-07, + "loss": -0.1898, + "step": 336 + }, + { + "epoch": 0.036056217830759796, + "grad_norm": 5.128039309618372, + "learning_rate": 7.20682302771855e-07, + "loss": -0.3406, + "step": 338 + }, + { + "epoch": 0.03626956823212524, + "grad_norm": 17.130506401349365, + "learning_rate": 7.249466950959487e-07, + "loss": -0.2751, + "step": 340 + }, + { + "epoch": 0.03648291863349068, + "grad_norm": 17.383816800029777, + "learning_rate": 7.292110874200426e-07, + "loss": -0.0005, + "step": 342 + }, + { + "epoch": 0.03669626903485612, + "grad_norm": 43.198248806456, + "learning_rate": 7.334754797441365e-07, + "loss": -0.2181, + "step": 344 + }, + { + "epoch": 0.036909619436221563, + "grad_norm": 16.46060851996884, + "learning_rate": 7.377398720682303e-07, + "loss": 0.2075, + "step": 346 + }, + { + "epoch": 0.037122969837587005, + "grad_norm": 12.986350336767204, + "learning_rate": 7.420042643923241e-07, + "loss": -0.5395, + "step": 348 + }, + { + "epoch": 0.03733632023895245, + "grad_norm": 19.883036986347154, + "learning_rate": 7.462686567164179e-07, + "loss": -0.047, + "step": 350 + }, + { + "epoch": 0.03754967064031789, + "grad_norm": 8.289068924690984, + "learning_rate": 7.505330490405118e-07, + "loss": 0.9832, + "step": 352 + }, + { + "epoch": 0.03776302104168334, + "grad_norm": 10.173680365988295, + "learning_rate": 7.547974413646055e-07, + "loss": -0.4718, + "step": 354 + }, + { + "epoch": 0.03797637144304878, + "grad_norm": 29.406476571062164, + "learning_rate": 7.590618336886994e-07, + "loss": -0.3167, + "step": 356 + }, + { + "epoch": 0.03818972184441422, + "grad_norm": 27.934726009197963, + "learning_rate": 7.633262260127932e-07, + "loss": -0.1972, + "step": 358 + }, + { + "epoch": 0.038403072245779664, + "grad_norm": 25.076744001467002, + "learning_rate": 7.675906183368869e-07, + "loss": 0.2433, + "step": 360 + }, + { + "epoch": 0.038616422647145106, + "grad_norm": 17.034753146666695, + "learning_rate": 7.718550106609808e-07, + "loss": -0.6862, + "step": 362 + }, + { + "epoch": 0.03882977304851055, + "grad_norm": 30.28266067136412, + "learning_rate": 7.761194029850746e-07, + "loss": 0.0555, + "step": 364 + }, + { + "epoch": 0.03904312344987599, + "grad_norm": 15.433190582955902, + "learning_rate": 7.803837953091685e-07, + "loss": -0.6999, + "step": 366 + }, + { + "epoch": 0.03925647385124143, + "grad_norm": 16.705763291209895, + "learning_rate": 7.846481876332622e-07, + "loss": 0.145, + "step": 368 + }, + { + "epoch": 0.039469824252606873, + "grad_norm": 7.207700811021795, + "learning_rate": 7.889125799573561e-07, + "loss": 0.4911, + "step": 370 + }, + { + "epoch": 0.039683174653972315, + "grad_norm": 12.58245126026889, + "learning_rate": 7.931769722814499e-07, + "loss": -0.4105, + "step": 372 + }, + { + "epoch": 0.03989652505533776, + "grad_norm": 9.727684047283205, + "learning_rate": 7.974413646055436e-07, + "loss": -0.4456, + "step": 374 + }, + { + "epoch": 0.040109875456703206, + "grad_norm": 37.82883818815438, + "learning_rate": 8.017057569296375e-07, + "loss": -1.003, + "step": 376 + }, + { + "epoch": 0.04032322585806865, + "grad_norm": 9.205831543930358, + "learning_rate": 8.059701492537313e-07, + "loss": -0.9034, + "step": 378 + }, + { + "epoch": 0.04053657625943409, + "grad_norm": 17.5439627015, + "learning_rate": 8.102345415778252e-07, + "loss": -0.3568, + "step": 380 + }, + { + "epoch": 0.04074992666079953, + "grad_norm": 15.109693571978122, + "learning_rate": 8.144989339019189e-07, + "loss": -0.3776, + "step": 382 + }, + { + "epoch": 0.040963277062164974, + "grad_norm": 10.552742457862886, + "learning_rate": 8.187633262260127e-07, + "loss": 0.2015, + "step": 384 + }, + { + "epoch": 0.041176627463530416, + "grad_norm": 19.08793342231466, + "learning_rate": 8.230277185501066e-07, + "loss": -1.4421, + "step": 386 + }, + { + "epoch": 0.04138997786489586, + "grad_norm": 34.056443898662884, + "learning_rate": 8.272921108742003e-07, + "loss": -0.197, + "step": 388 + }, + { + "epoch": 0.0416033282662613, + "grad_norm": 7.384540107176898, + "learning_rate": 8.315565031982942e-07, + "loss": 0.7544, + "step": 390 + }, + { + "epoch": 0.04181667866762674, + "grad_norm": 23.683836639585067, + "learning_rate": 8.35820895522388e-07, + "loss": -0.3735, + "step": 392 + }, + { + "epoch": 0.042030029068992183, + "grad_norm": 19.629725497894157, + "learning_rate": 8.400852878464819e-07, + "loss": -0.6826, + "step": 394 + }, + { + "epoch": 0.042243379470357625, + "grad_norm": 13.85103939830437, + "learning_rate": 8.443496801705756e-07, + "loss": -0.2708, + "step": 396 + }, + { + "epoch": 0.042456729871723074, + "grad_norm": 19.710815160612455, + "learning_rate": 8.486140724946694e-07, + "loss": 0.0877, + "step": 398 + }, + { + "epoch": 0.042670080273088516, + "grad_norm": 4.706110486614041, + "learning_rate": 8.528784648187633e-07, + "loss": 0.1916, + "step": 400 + }, + { + "epoch": 0.04288343067445396, + "grad_norm": 12.864617887211866, + "learning_rate": 8.57142857142857e-07, + "loss": 0.0179, + "step": 402 + }, + { + "epoch": 0.0430967810758194, + "grad_norm": 9.560466147390757, + "learning_rate": 8.614072494669509e-07, + "loss": -0.0813, + "step": 404 + }, + { + "epoch": 0.04331013147718484, + "grad_norm": 21.130991357317583, + "learning_rate": 8.656716417910447e-07, + "loss": -1.1384, + "step": 406 + }, + { + "epoch": 0.043523481878550284, + "grad_norm": 19.392285709372985, + "learning_rate": 8.699360341151387e-07, + "loss": 0.1794, + "step": 408 + }, + { + "epoch": 0.043736832279915726, + "grad_norm": 9.266238784116867, + "learning_rate": 8.742004264392324e-07, + "loss": -1.768, + "step": 410 + }, + { + "epoch": 0.04395018268128117, + "grad_norm": 40.16756332202219, + "learning_rate": 8.784648187633262e-07, + "loss": -0.8232, + "step": 412 + }, + { + "epoch": 0.04416353308264661, + "grad_norm": 39.24330802424692, + "learning_rate": 8.827292110874201e-07, + "loss": -1.6178, + "step": 414 + }, + { + "epoch": 0.04437688348401205, + "grad_norm": 11.88082478788392, + "learning_rate": 8.869936034115138e-07, + "loss": -0.3053, + "step": 416 + }, + { + "epoch": 0.044590233885377493, + "grad_norm": 6.357526673930185, + "learning_rate": 8.912579957356077e-07, + "loss": -1.0471, + "step": 418 + }, + { + "epoch": 0.04480358428674294, + "grad_norm": 16.303453044925423, + "learning_rate": 8.955223880597015e-07, + "loss": -0.1262, + "step": 420 + }, + { + "epoch": 0.045016934688108384, + "grad_norm": 14.6518934802552, + "learning_rate": 8.997867803837953e-07, + "loss": -0.2323, + "step": 422 + }, + { + "epoch": 0.045230285089473826, + "grad_norm": 8.02837999022306, + "learning_rate": 9.040511727078891e-07, + "loss": 0.1657, + "step": 424 + }, + { + "epoch": 0.04544363549083927, + "grad_norm": 15.64582877736265, + "learning_rate": 9.083155650319829e-07, + "loss": -0.1882, + "step": 426 + }, + { + "epoch": 0.04565698589220471, + "grad_norm": 16.363233536467853, + "learning_rate": 9.125799573560768e-07, + "loss": -0.4399, + "step": 428 + }, + { + "epoch": 0.04587033629357015, + "grad_norm": 15.548489913061752, + "learning_rate": 9.168443496801705e-07, + "loss": 0.3602, + "step": 430 + }, + { + "epoch": 0.046083686694935594, + "grad_norm": 7.007708536573836, + "learning_rate": 9.211087420042644e-07, + "loss": -0.2646, + "step": 432 + }, + { + "epoch": 0.046297037096301036, + "grad_norm": 13.687256516704583, + "learning_rate": 9.253731343283582e-07, + "loss": -1.0053, + "step": 434 + }, + { + "epoch": 0.04651038749766648, + "grad_norm": 30.306488264414572, + "learning_rate": 9.29637526652452e-07, + "loss": -0.3203, + "step": 436 + }, + { + "epoch": 0.04672373789903192, + "grad_norm": 26.094659474249983, + "learning_rate": 9.339019189765458e-07, + "loss": -0.6359, + "step": 438 + }, + { + "epoch": 0.04693708830039737, + "grad_norm": 13.796812579064204, + "learning_rate": 9.381663113006396e-07, + "loss": -0.3471, + "step": 440 + }, + { + "epoch": 0.04715043870176281, + "grad_norm": 14.69389520446514, + "learning_rate": 9.424307036247334e-07, + "loss": -0.8145, + "step": 442 + }, + { + "epoch": 0.04736378910312825, + "grad_norm": 16.918575972701632, + "learning_rate": 9.466950959488272e-07, + "loss": 0.2185, + "step": 444 + }, + { + "epoch": 0.047577139504493694, + "grad_norm": 10.422480971977198, + "learning_rate": 9.509594882729211e-07, + "loss": -0.4649, + "step": 446 + }, + { + "epoch": 0.047790489905859136, + "grad_norm": 7.03443725132983, + "learning_rate": 9.552238805970149e-07, + "loss": -0.3627, + "step": 448 + }, + { + "epoch": 0.04800384030722458, + "grad_norm": 18.359610314742397, + "learning_rate": 9.594882729211086e-07, + "loss": -0.9404, + "step": 450 + }, + { + "epoch": 0.04821719070859002, + "grad_norm": 8.277072501759134, + "learning_rate": 9.637526652452024e-07, + "loss": 0.2409, + "step": 452 + }, + { + "epoch": 0.04843054110995546, + "grad_norm": 13.071634752922243, + "learning_rate": 9.680170575692964e-07, + "loss": 0.3506, + "step": 454 + }, + { + "epoch": 0.048643891511320904, + "grad_norm": 36.801112890968085, + "learning_rate": 9.722814498933901e-07, + "loss": -0.3961, + "step": 456 + }, + { + "epoch": 0.048857241912686346, + "grad_norm": 40.38991240984631, + "learning_rate": 9.76545842217484e-07, + "loss": -0.3497, + "step": 458 + }, + { + "epoch": 0.04907059231405179, + "grad_norm": 8.414591188760289, + "learning_rate": 9.808102345415777e-07, + "loss": -0.5775, + "step": 460 + }, + { + "epoch": 0.04928394271541724, + "grad_norm": 8.76108225719733, + "learning_rate": 9.850746268656714e-07, + "loss": 0.0839, + "step": 462 + }, + { + "epoch": 0.04949729311678268, + "grad_norm": 9.275399140799998, + "learning_rate": 9.893390191897654e-07, + "loss": 0.0137, + "step": 464 + }, + { + "epoch": 0.04971064351814812, + "grad_norm": 9.832501914571974, + "learning_rate": 9.936034115138592e-07, + "loss": 0.1014, + "step": 466 + }, + { + "epoch": 0.04992399391951356, + "grad_norm": 19.204369086700787, + "learning_rate": 9.97867803837953e-07, + "loss": -0.31, + "step": 468 + }, + { + "epoch": 0.050137344320879004, + "grad_norm": 6.109980991190154, + "learning_rate": 1.0021321961620467e-06, + "loss": -0.26, + "step": 470 + }, + { + "epoch": 0.050350694722244446, + "grad_norm": 31.865612558858906, + "learning_rate": 1.0063965884861407e-06, + "loss": -0.1208, + "step": 472 + }, + { + "epoch": 0.05056404512360989, + "grad_norm": 10.044184014650897, + "learning_rate": 1.0106609808102345e-06, + "loss": -0.2205, + "step": 474 + }, + { + "epoch": 0.05077739552497533, + "grad_norm": 21.875745094199374, + "learning_rate": 1.0149253731343285e-06, + "loss": -0.0812, + "step": 476 + }, + { + "epoch": 0.05099074592634077, + "grad_norm": 11.995975513229988, + "learning_rate": 1.019189765458422e-06, + "loss": -1.7783, + "step": 478 + }, + { + "epoch": 0.051204096327706214, + "grad_norm": 7.696707350691998, + "learning_rate": 1.023454157782516e-06, + "loss": 0.2012, + "step": 480 + }, + { + "epoch": 0.051417446729071656, + "grad_norm": 72.33916217166278, + "learning_rate": 1.0277185501066098e-06, + "loss": 0.0427, + "step": 482 + }, + { + "epoch": 0.051630797130437105, + "grad_norm": 13.569652394718384, + "learning_rate": 1.0319829424307035e-06, + "loss": 0.0953, + "step": 484 + }, + { + "epoch": 0.05184414753180255, + "grad_norm": 6.640484086255635, + "learning_rate": 1.0362473347547973e-06, + "loss": -1.0771, + "step": 486 + }, + { + "epoch": 0.05205749793316799, + "grad_norm": 16.284392321889953, + "learning_rate": 1.0405117270788913e-06, + "loss": -0.661, + "step": 488 + }, + { + "epoch": 0.05227084833453343, + "grad_norm": 10.396345506952937, + "learning_rate": 1.0447761194029848e-06, + "loss": -0.3095, + "step": 490 + }, + { + "epoch": 0.05248419873589887, + "grad_norm": 12.867700984841619, + "learning_rate": 1.0490405117270788e-06, + "loss": 0.4289, + "step": 492 + }, + { + "epoch": 0.052697549137264314, + "grad_norm": 18.31825168601856, + "learning_rate": 1.0533049040511726e-06, + "loss": 0.1729, + "step": 494 + }, + { + "epoch": 0.052910899538629756, + "grad_norm": 17.248690968213506, + "learning_rate": 1.0575692963752666e-06, + "loss": -1.1607, + "step": 496 + }, + { + "epoch": 0.0531242499399952, + "grad_norm": 8.51928074224459, + "learning_rate": 1.0618336886993601e-06, + "loss": -0.3654, + "step": 498 + }, + { + "epoch": 0.05333760034136064, + "grad_norm": 20.639043194792887, + "learning_rate": 1.066098081023454e-06, + "loss": -0.4893, + "step": 500 + }, + { + "epoch": 0.05355095074272608, + "grad_norm": 14.990753207547238, + "learning_rate": 1.070362473347548e-06, + "loss": -0.468, + "step": 502 + }, + { + "epoch": 0.053764301144091524, + "grad_norm": 24.695774003905925, + "learning_rate": 1.0746268656716416e-06, + "loss": -1.2465, + "step": 504 + }, + { + "epoch": 0.05397765154545697, + "grad_norm": 17.81220746000941, + "learning_rate": 1.0788912579957356e-06, + "loss": -0.232, + "step": 506 + }, + { + "epoch": 0.054191001946822415, + "grad_norm": 7.6840011548352924, + "learning_rate": 1.0831556503198294e-06, + "loss": 0.1394, + "step": 508 + }, + { + "epoch": 0.05440435234818786, + "grad_norm": 17.135280175028917, + "learning_rate": 1.0874200426439234e-06, + "loss": -0.1113, + "step": 510 + }, + { + "epoch": 0.0546177027495533, + "grad_norm": 21.577778290948245, + "learning_rate": 1.091684434968017e-06, + "loss": 0.1389, + "step": 512 + }, + { + "epoch": 0.05483105315091874, + "grad_norm": 26.22719409585662, + "learning_rate": 1.095948827292111e-06, + "loss": -0.1216, + "step": 514 + }, + { + "epoch": 0.05504440355228418, + "grad_norm": 8.763214398721058, + "learning_rate": 1.1002132196162047e-06, + "loss": -0.6157, + "step": 516 + }, + { + "epoch": 0.055257753953649624, + "grad_norm": 14.925448860683707, + "learning_rate": 1.1044776119402984e-06, + "loss": 0.0356, + "step": 518 + }, + { + "epoch": 0.055471104355015066, + "grad_norm": 14.365407282055516, + "learning_rate": 1.1087420042643922e-06, + "loss": -0.0624, + "step": 520 + }, + { + "epoch": 0.05568445475638051, + "grad_norm": 8.802366004297667, + "learning_rate": 1.1130063965884862e-06, + "loss": 0.5658, + "step": 522 + }, + { + "epoch": 0.05589780515774595, + "grad_norm": 24.251203850442998, + "learning_rate": 1.11727078891258e-06, + "loss": -0.0644, + "step": 524 + }, + { + "epoch": 0.0561111555591114, + "grad_norm": 16.632568797956132, + "learning_rate": 1.1215351812366737e-06, + "loss": -1.0315, + "step": 526 + }, + { + "epoch": 0.05632450596047684, + "grad_norm": 15.777059103835, + "learning_rate": 1.1257995735607675e-06, + "loss": 0.6824, + "step": 528 + }, + { + "epoch": 0.05653785636184228, + "grad_norm": 6.521931225844872, + "learning_rate": 1.1300639658848615e-06, + "loss": -0.2547, + "step": 530 + }, + { + "epoch": 0.056751206763207725, + "grad_norm": 16.18907262330339, + "learning_rate": 1.134328358208955e-06, + "loss": -0.714, + "step": 532 + }, + { + "epoch": 0.05696455716457317, + "grad_norm": 17.05716262320914, + "learning_rate": 1.138592750533049e-06, + "loss": -1.1937, + "step": 534 + }, + { + "epoch": 0.05717790756593861, + "grad_norm": 8.560189061728243, + "learning_rate": 1.1428571428571428e-06, + "loss": -0.3703, + "step": 536 + }, + { + "epoch": 0.05739125796730405, + "grad_norm": 11.56228500281614, + "learning_rate": 1.1471215351812368e-06, + "loss": -0.8073, + "step": 538 + }, + { + "epoch": 0.05760460836866949, + "grad_norm": 13.24672543046646, + "learning_rate": 1.1513859275053303e-06, + "loss": -0.3222, + "step": 540 + }, + { + "epoch": 0.057817958770034934, + "grad_norm": 14.035481073008786, + "learning_rate": 1.1556503198294243e-06, + "loss": 0.286, + "step": 542 + }, + { + "epoch": 0.058031309171400376, + "grad_norm": 28.95516126438781, + "learning_rate": 1.159914712153518e-06, + "loss": -1.189, + "step": 544 + }, + { + "epoch": 0.05824465957276582, + "grad_norm": 20.850060598425877, + "learning_rate": 1.1641791044776118e-06, + "loss": -0.6204, + "step": 546 + }, + { + "epoch": 0.05845800997413127, + "grad_norm": 21.211499139014034, + "learning_rate": 1.1684434968017056e-06, + "loss": -0.5428, + "step": 548 + }, + { + "epoch": 0.05867136037549671, + "grad_norm": 16.101447224508938, + "learning_rate": 1.1727078891257996e-06, + "loss": -0.7822, + "step": 550 + }, + { + "epoch": 0.05888471077686215, + "grad_norm": 60.24925771229807, + "learning_rate": 1.1769722814498933e-06, + "loss": -0.5338, + "step": 552 + }, + { + "epoch": 0.05909806117822759, + "grad_norm": 10.911944544305852, + "learning_rate": 1.1812366737739871e-06, + "loss": -0.1546, + "step": 554 + }, + { + "epoch": 0.059311411579593035, + "grad_norm": 20.123496073532923, + "learning_rate": 1.1855010660980809e-06, + "loss": 0.1078, + "step": 556 + }, + { + "epoch": 0.05952476198095848, + "grad_norm": 8.077431875506283, + "learning_rate": 1.1897654584221749e-06, + "loss": 0.6775, + "step": 558 + }, + { + "epoch": 0.05973811238232392, + "grad_norm": 20.69430693950377, + "learning_rate": 1.1940298507462684e-06, + "loss": -0.1531, + "step": 560 + }, + { + "epoch": 0.05995146278368936, + "grad_norm": 9.149229771649699, + "learning_rate": 1.1982942430703624e-06, + "loss": 0.2026, + "step": 562 + }, + { + "epoch": 0.0601648131850548, + "grad_norm": 35.409517307149535, + "learning_rate": 1.2025586353944564e-06, + "loss": 0.2081, + "step": 564 + }, + { + "epoch": 0.060378163586420244, + "grad_norm": 18.732961050530974, + "learning_rate": 1.2068230277185501e-06, + "loss": -0.4564, + "step": 566 + }, + { + "epoch": 0.060591513987785686, + "grad_norm": 18.17521906003365, + "learning_rate": 1.211087420042644e-06, + "loss": -0.4694, + "step": 568 + }, + { + "epoch": 0.060804864389151135, + "grad_norm": 15.950240307469345, + "learning_rate": 1.2153518123667377e-06, + "loss": -0.7607, + "step": 570 + }, + { + "epoch": 0.06101821479051658, + "grad_norm": 27.667171128676177, + "learning_rate": 1.2196162046908317e-06, + "loss": -0.4962, + "step": 572 + }, + { + "epoch": 0.06123156519188202, + "grad_norm": 6.875981558346762, + "learning_rate": 1.2238805970149252e-06, + "loss": 0.1071, + "step": 574 + }, + { + "epoch": 0.06144491559324746, + "grad_norm": 23.330801092034456, + "learning_rate": 1.2281449893390192e-06, + "loss": -1.1081, + "step": 576 + }, + { + "epoch": 0.0616582659946129, + "grad_norm": 18.097087985455573, + "learning_rate": 1.232409381663113e-06, + "loss": 0.292, + "step": 578 + }, + { + "epoch": 0.061871616395978345, + "grad_norm": 44.96075507661794, + "learning_rate": 1.236673773987207e-06, + "loss": -1.8344, + "step": 580 + }, + { + "epoch": 0.06208496679734379, + "grad_norm": 11.33849063045387, + "learning_rate": 1.2409381663113005e-06, + "loss": -0.5339, + "step": 582 + }, + { + "epoch": 0.06229831719870923, + "grad_norm": 25.655929914012017, + "learning_rate": 1.2452025586353945e-06, + "loss": 0.2773, + "step": 584 + }, + { + "epoch": 0.06251166760007468, + "grad_norm": 19.518509371933693, + "learning_rate": 1.2494669509594882e-06, + "loss": -0.3635, + "step": 586 + }, + { + "epoch": 0.06272501800144012, + "grad_norm": 19.337178836658445, + "learning_rate": 1.253731343283582e-06, + "loss": -0.4397, + "step": 588 + }, + { + "epoch": 0.06293836840280556, + "grad_norm": 15.629253302757428, + "learning_rate": 1.2579957356076758e-06, + "loss": -0.3098, + "step": 590 + }, + { + "epoch": 0.063151718804171, + "grad_norm": 35.95259160255183, + "learning_rate": 1.2622601279317698e-06, + "loss": -0.9708, + "step": 592 + }, + { + "epoch": 0.06336506920553645, + "grad_norm": 27.574715730226895, + "learning_rate": 1.2665245202558633e-06, + "loss": -0.5131, + "step": 594 + }, + { + "epoch": 0.06357841960690189, + "grad_norm": 47.74900017442824, + "learning_rate": 1.2707889125799573e-06, + "loss": 0.3332, + "step": 596 + }, + { + "epoch": 0.06379177000826733, + "grad_norm": 20.309365479498744, + "learning_rate": 1.275053304904051e-06, + "loss": -1.2309, + "step": 598 + }, + { + "epoch": 0.06400512040963277, + "grad_norm": 11.40208623029672, + "learning_rate": 1.279317697228145e-06, + "loss": -0.5857, + "step": 600 + }, + { + "epoch": 0.06421847081099821, + "grad_norm": 11.402181219311933, + "learning_rate": 1.2835820895522386e-06, + "loss": -0.3416, + "step": 602 + }, + { + "epoch": 0.06443182121236365, + "grad_norm": 15.010371839635921, + "learning_rate": 1.2878464818763326e-06, + "loss": -0.4071, + "step": 604 + }, + { + "epoch": 0.0646451716137291, + "grad_norm": 9.99633951597125, + "learning_rate": 1.2921108742004264e-06, + "loss": -1.1453, + "step": 606 + }, + { + "epoch": 0.06485852201509454, + "grad_norm": 21.31739921019411, + "learning_rate": 1.2963752665245203e-06, + "loss": 0.3107, + "step": 608 + }, + { + "epoch": 0.06507187241645998, + "grad_norm": 16.7015124046411, + "learning_rate": 1.3006396588486139e-06, + "loss": -0.9792, + "step": 610 + }, + { + "epoch": 0.06528522281782542, + "grad_norm": 17.983069493288237, + "learning_rate": 1.3049040511727079e-06, + "loss": -0.3921, + "step": 612 + }, + { + "epoch": 0.06549857321919086, + "grad_norm": 10.553594812872817, + "learning_rate": 1.3091684434968016e-06, + "loss": -1.2914, + "step": 614 + }, + { + "epoch": 0.0657119236205563, + "grad_norm": 14.041836178013272, + "learning_rate": 1.3134328358208954e-06, + "loss": -0.3353, + "step": 616 + }, + { + "epoch": 0.06592527402192175, + "grad_norm": 15.639430537257736, + "learning_rate": 1.3176972281449892e-06, + "loss": -0.4705, + "step": 618 + }, + { + "epoch": 0.06613862442328719, + "grad_norm": 12.786525687977138, + "learning_rate": 1.3219616204690832e-06, + "loss": -0.0215, + "step": 620 + }, + { + "epoch": 0.06635197482465263, + "grad_norm": 9.934313047441512, + "learning_rate": 1.3262260127931767e-06, + "loss": 0.6287, + "step": 622 + }, + { + "epoch": 0.06656532522601809, + "grad_norm": 9.779454896556423, + "learning_rate": 1.3304904051172707e-06, + "loss": -0.0751, + "step": 624 + }, + { + "epoch": 0.06677867562738353, + "grad_norm": 9.494698563936842, + "learning_rate": 1.3347547974413647e-06, + "loss": 0.1564, + "step": 626 + }, + { + "epoch": 0.06699202602874897, + "grad_norm": 10.740511386568414, + "learning_rate": 1.3390191897654584e-06, + "loss": -0.7072, + "step": 628 + }, + { + "epoch": 0.06720537643011441, + "grad_norm": 5.704581178120186, + "learning_rate": 1.3432835820895522e-06, + "loss": -0.6728, + "step": 630 + }, + { + "epoch": 0.06741872683147986, + "grad_norm": 22.298716006252704, + "learning_rate": 1.347547974413646e-06, + "loss": -0.7213, + "step": 632 + }, + { + "epoch": 0.0676320772328453, + "grad_norm": 15.330297898109906, + "learning_rate": 1.35181236673774e-06, + "loss": 0.2685, + "step": 634 + }, + { + "epoch": 0.06784542763421074, + "grad_norm": 22.69580065155436, + "learning_rate": 1.3560767590618335e-06, + "loss": 0.3992, + "step": 636 + }, + { + "epoch": 0.06805877803557618, + "grad_norm": 28.833859562335263, + "learning_rate": 1.3603411513859275e-06, + "loss": -0.7339, + "step": 638 + }, + { + "epoch": 0.06827212843694162, + "grad_norm": 11.283562143626995, + "learning_rate": 1.3646055437100213e-06, + "loss": -0.7638, + "step": 640 + }, + { + "epoch": 0.06848547883830707, + "grad_norm": 10.725377082827144, + "learning_rate": 1.3688699360341152e-06, + "loss": -0.8662, + "step": 642 + }, + { + "epoch": 0.06869882923967251, + "grad_norm": 26.31029696416489, + "learning_rate": 1.3731343283582088e-06, + "loss": -0.7486, + "step": 644 + }, + { + "epoch": 0.06891217964103795, + "grad_norm": 18.040887121015924, + "learning_rate": 1.3773987206823028e-06, + "loss": 0.3118, + "step": 646 + }, + { + "epoch": 0.06912553004240339, + "grad_norm": 10.96637271492357, + "learning_rate": 1.3816631130063965e-06, + "loss": -0.5209, + "step": 648 + }, + { + "epoch": 0.06933888044376883, + "grad_norm": 8.06597559224624, + "learning_rate": 1.3859275053304903e-06, + "loss": -0.4205, + "step": 650 + }, + { + "epoch": 0.06955223084513427, + "grad_norm": 17.471203906083133, + "learning_rate": 1.390191897654584e-06, + "loss": -0.5424, + "step": 652 + }, + { + "epoch": 0.06976558124649972, + "grad_norm": 11.059775225202081, + "learning_rate": 1.394456289978678e-06, + "loss": -1.2152, + "step": 654 + }, + { + "epoch": 0.06997893164786516, + "grad_norm": 8.31714286189907, + "learning_rate": 1.3987206823027718e-06, + "loss": -0.811, + "step": 656 + }, + { + "epoch": 0.0701922820492306, + "grad_norm": 28.554761860008615, + "learning_rate": 1.4029850746268656e-06, + "loss": -1.0846, + "step": 658 + }, + { + "epoch": 0.07040563245059604, + "grad_norm": 16.176945715631096, + "learning_rate": 1.4072494669509594e-06, + "loss": -0.5155, + "step": 660 + }, + { + "epoch": 0.07061898285196148, + "grad_norm": 21.625404385138676, + "learning_rate": 1.4115138592750533e-06, + "loss": -0.9076, + "step": 662 + }, + { + "epoch": 0.07083233325332693, + "grad_norm": 5.4202622935954405, + "learning_rate": 1.415778251599147e-06, + "loss": 0.2342, + "step": 664 + }, + { + "epoch": 0.07104568365469238, + "grad_norm": 17.194951603060776, + "learning_rate": 1.4200426439232409e-06, + "loss": 0.0548, + "step": 666 + }, + { + "epoch": 0.07125903405605782, + "grad_norm": 11.512096288311646, + "learning_rate": 1.4243070362473346e-06, + "loss": -0.886, + "step": 668 + }, + { + "epoch": 0.07147238445742327, + "grad_norm": 16.05900514880833, + "learning_rate": 1.4285714285714286e-06, + "loss": -0.1307, + "step": 670 + }, + { + "epoch": 0.07168573485878871, + "grad_norm": 9.185127296516999, + "learning_rate": 1.4328358208955222e-06, + "loss": -0.1077, + "step": 672 + }, + { + "epoch": 0.07189908526015415, + "grad_norm": 7.334484358389449, + "learning_rate": 1.4371002132196162e-06, + "loss": 0.1615, + "step": 674 + }, + { + "epoch": 0.07211243566151959, + "grad_norm": 18.964205902532676, + "learning_rate": 1.44136460554371e-06, + "loss": -1.4788, + "step": 676 + }, + { + "epoch": 0.07232578606288503, + "grad_norm": 17.11189372092431, + "learning_rate": 1.4456289978678037e-06, + "loss": 0.0577, + "step": 678 + }, + { + "epoch": 0.07253913646425048, + "grad_norm": 10.275590248100219, + "learning_rate": 1.4498933901918975e-06, + "loss": -0.0749, + "step": 680 + }, + { + "epoch": 0.07275248686561592, + "grad_norm": 25.814533145099567, + "learning_rate": 1.4541577825159914e-06, + "loss": 0.4326, + "step": 682 + }, + { + "epoch": 0.07296583726698136, + "grad_norm": 6.539318927578008, + "learning_rate": 1.4584221748400852e-06, + "loss": 0.0608, + "step": 684 + }, + { + "epoch": 0.0731791876683468, + "grad_norm": 27.253684296985305, + "learning_rate": 1.462686567164179e-06, + "loss": 0.1022, + "step": 686 + }, + { + "epoch": 0.07339253806971224, + "grad_norm": 12.415981292555502, + "learning_rate": 1.466950959488273e-06, + "loss": -0.3008, + "step": 688 + }, + { + "epoch": 0.07360588847107769, + "grad_norm": 21.077155825145116, + "learning_rate": 1.4712153518123667e-06, + "loss": -0.4936, + "step": 690 + }, + { + "epoch": 0.07381923887244313, + "grad_norm": 14.246036421919664, + "learning_rate": 1.4754797441364605e-06, + "loss": 0.0596, + "step": 692 + }, + { + "epoch": 0.07403258927380857, + "grad_norm": 15.479313497531585, + "learning_rate": 1.4797441364605543e-06, + "loss": -0.0542, + "step": 694 + }, + { + "epoch": 0.07424593967517401, + "grad_norm": 21.800805681217618, + "learning_rate": 1.4840085287846482e-06, + "loss": 0.8143, + "step": 696 + }, + { + "epoch": 0.07445929007653945, + "grad_norm": 12.447341265262597, + "learning_rate": 1.488272921108742e-06, + "loss": 0.2514, + "step": 698 + }, + { + "epoch": 0.0746726404779049, + "grad_norm": 15.457895001240969, + "learning_rate": 1.4925373134328358e-06, + "loss": -0.1873, + "step": 700 + }, + { + "epoch": 0.07488599087927034, + "grad_norm": 9.103159220217133, + "learning_rate": 1.4968017057569296e-06, + "loss": -0.1545, + "step": 702 + }, + { + "epoch": 0.07509934128063578, + "grad_norm": 13.892293596081212, + "learning_rate": 1.5010660980810235e-06, + "loss": -1.0031, + "step": 704 + }, + { + "epoch": 0.07531269168200122, + "grad_norm": 4.018531204967449, + "learning_rate": 1.505330490405117e-06, + "loss": -0.5123, + "step": 706 + }, + { + "epoch": 0.07552604208336668, + "grad_norm": 9.26403570634237, + "learning_rate": 1.509594882729211e-06, + "loss": -0.009, + "step": 708 + }, + { + "epoch": 0.07573939248473212, + "grad_norm": 23.108096955629904, + "learning_rate": 1.5138592750533048e-06, + "loss": 0.0175, + "step": 710 + }, + { + "epoch": 0.07595274288609756, + "grad_norm": 28.54965038619342, + "learning_rate": 1.5181236673773988e-06, + "loss": -0.4034, + "step": 712 + }, + { + "epoch": 0.076166093287463, + "grad_norm": 36.93993798487583, + "learning_rate": 1.5223880597014924e-06, + "loss": -1.3652, + "step": 714 + }, + { + "epoch": 0.07637944368882844, + "grad_norm": 9.796656002782315, + "learning_rate": 1.5266524520255864e-06, + "loss": 0.5496, + "step": 716 + }, + { + "epoch": 0.07659279409019389, + "grad_norm": 10.912135520925375, + "learning_rate": 1.5309168443496801e-06, + "loss": -0.3247, + "step": 718 + }, + { + "epoch": 0.07680614449155933, + "grad_norm": 14.84305599240473, + "learning_rate": 1.5351812366737739e-06, + "loss": -1.3197, + "step": 720 + }, + { + "epoch": 0.07701949489292477, + "grad_norm": 25.835978612302938, + "learning_rate": 1.5394456289978677e-06, + "loss": -0.2446, + "step": 722 + }, + { + "epoch": 0.07723284529429021, + "grad_norm": 7.4900190590260545, + "learning_rate": 1.5437100213219616e-06, + "loss": -0.6921, + "step": 724 + }, + { + "epoch": 0.07744619569565565, + "grad_norm": 12.73580854140001, + "learning_rate": 1.5479744136460552e-06, + "loss": 0.1706, + "step": 726 + }, + { + "epoch": 0.0776595460970211, + "grad_norm": 10.90211860054164, + "learning_rate": 1.5522388059701492e-06, + "loss": 0.0817, + "step": 728 + }, + { + "epoch": 0.07787289649838654, + "grad_norm": 6.002849805432761, + "learning_rate": 1.556503198294243e-06, + "loss": 0.0037, + "step": 730 + }, + { + "epoch": 0.07808624689975198, + "grad_norm": 15.988913842632046, + "learning_rate": 1.560767590618337e-06, + "loss": 0.3994, + "step": 732 + }, + { + "epoch": 0.07829959730111742, + "grad_norm": 21.834364945681322, + "learning_rate": 1.5650319829424305e-06, + "loss": -0.4599, + "step": 734 + }, + { + "epoch": 0.07851294770248286, + "grad_norm": 7.432963689652445, + "learning_rate": 1.5692963752665245e-06, + "loss": -0.3263, + "step": 736 + }, + { + "epoch": 0.0787262981038483, + "grad_norm": 16.20259948956077, + "learning_rate": 1.5735607675906182e-06, + "loss": -0.1557, + "step": 738 + }, + { + "epoch": 0.07893964850521375, + "grad_norm": 30.282091888880974, + "learning_rate": 1.5778251599147122e-06, + "loss": -1.2297, + "step": 740 + }, + { + "epoch": 0.07915299890657919, + "grad_norm": 18.86228833994148, + "learning_rate": 1.5820895522388058e-06, + "loss": -0.1715, + "step": 742 + }, + { + "epoch": 0.07936634930794463, + "grad_norm": 26.911000632499032, + "learning_rate": 1.5863539445628997e-06, + "loss": -0.2761, + "step": 744 + }, + { + "epoch": 0.07957969970931007, + "grad_norm": 55.67332241793552, + "learning_rate": 1.5906183368869935e-06, + "loss": -0.1245, + "step": 746 + }, + { + "epoch": 0.07979305011067551, + "grad_norm": 11.64128268726896, + "learning_rate": 1.5948827292110873e-06, + "loss": 0.2172, + "step": 748 + }, + { + "epoch": 0.08000640051204096, + "grad_norm": 7.141781123388319, + "learning_rate": 1.599147121535181e-06, + "loss": -1.0433, + "step": 750 + }, + { + "epoch": 0.08021975091340641, + "grad_norm": 14.355586850326564, + "learning_rate": 1.603411513859275e-06, + "loss": 0.7587, + "step": 752 + }, + { + "epoch": 0.08043310131477185, + "grad_norm": 21.229967868951906, + "learning_rate": 1.607675906183369e-06, + "loss": -1.1211, + "step": 754 + }, + { + "epoch": 0.0806464517161373, + "grad_norm": 11.519887266492887, + "learning_rate": 1.6119402985074626e-06, + "loss": 0.589, + "step": 756 + }, + { + "epoch": 0.08085980211750274, + "grad_norm": 14.339130440951164, + "learning_rate": 1.6162046908315565e-06, + "loss": -0.7255, + "step": 758 + }, + { + "epoch": 0.08107315251886818, + "grad_norm": 14.27705846557268, + "learning_rate": 1.6204690831556503e-06, + "loss": -1.539, + "step": 760 + }, + { + "epoch": 0.08128650292023362, + "grad_norm": 17.0155387877924, + "learning_rate": 1.624733475479744e-06, + "loss": -0.366, + "step": 762 + }, + { + "epoch": 0.08149985332159906, + "grad_norm": 7.661991050372059, + "learning_rate": 1.6289978678038378e-06, + "loss": -0.0189, + "step": 764 + }, + { + "epoch": 0.0817132037229645, + "grad_norm": 8.979485378676086, + "learning_rate": 1.6332622601279318e-06, + "loss": -1.96, + "step": 766 + }, + { + "epoch": 0.08192655412432995, + "grad_norm": 5.161703747401364, + "learning_rate": 1.6375266524520254e-06, + "loss": -0.5528, + "step": 768 + }, + { + "epoch": 0.08213990452569539, + "grad_norm": 8.76501482963132, + "learning_rate": 1.6417910447761194e-06, + "loss": -0.0766, + "step": 770 + }, + { + "epoch": 0.08235325492706083, + "grad_norm": 16.69504549029163, + "learning_rate": 1.6460554371002131e-06, + "loss": 0.3121, + "step": 772 + }, + { + "epoch": 0.08256660532842627, + "grad_norm": 17.43852691547062, + "learning_rate": 1.6503198294243071e-06, + "loss": -0.6262, + "step": 774 + }, + { + "epoch": 0.08277995572979172, + "grad_norm": 7.8082474899976075, + "learning_rate": 1.6545842217484007e-06, + "loss": 0.4352, + "step": 776 + }, + { + "epoch": 0.08299330613115716, + "grad_norm": 23.61906025471836, + "learning_rate": 1.6588486140724946e-06, + "loss": -0.586, + "step": 778 + }, + { + "epoch": 0.0832066565325226, + "grad_norm": 16.906426462089843, + "learning_rate": 1.6631130063965884e-06, + "loss": 0.1607, + "step": 780 + }, + { + "epoch": 0.08342000693388804, + "grad_norm": 12.849146673239158, + "learning_rate": 1.6673773987206822e-06, + "loss": -0.4005, + "step": 782 + }, + { + "epoch": 0.08363335733525348, + "grad_norm": 15.52998781884974, + "learning_rate": 1.671641791044776e-06, + "loss": -0.2504, + "step": 784 + }, + { + "epoch": 0.08384670773661893, + "grad_norm": 7.223884938731002, + "learning_rate": 1.67590618336887e-06, + "loss": -0.9925, + "step": 786 + }, + { + "epoch": 0.08406005813798437, + "grad_norm": 4.8718682784669145, + "learning_rate": 1.6801705756929637e-06, + "loss": -0.3866, + "step": 788 + }, + { + "epoch": 0.08427340853934981, + "grad_norm": 17.60760255975411, + "learning_rate": 1.6844349680170575e-06, + "loss": -0.9417, + "step": 790 + }, + { + "epoch": 0.08448675894071525, + "grad_norm": 27.714848425193637, + "learning_rate": 1.6886993603411512e-06, + "loss": 0.3963, + "step": 792 + }, + { + "epoch": 0.0847001093420807, + "grad_norm": 35.493014709513936, + "learning_rate": 1.6929637526652452e-06, + "loss": -0.4515, + "step": 794 + }, + { + "epoch": 0.08491345974344615, + "grad_norm": 21.400152124030786, + "learning_rate": 1.6972281449893388e-06, + "loss": -0.67, + "step": 796 + }, + { + "epoch": 0.08512681014481159, + "grad_norm": 16.911674814364677, + "learning_rate": 1.7014925373134328e-06, + "loss": -0.1268, + "step": 798 + }, + { + "epoch": 0.08534016054617703, + "grad_norm": 13.009852458798768, + "learning_rate": 1.7057569296375265e-06, + "loss": -0.5053, + "step": 800 + }, + { + "epoch": 0.08555351094754247, + "grad_norm": 16.266588077557774, + "learning_rate": 1.7100213219616205e-06, + "loss": -0.7199, + "step": 802 + }, + { + "epoch": 0.08576686134890792, + "grad_norm": 19.16342878443152, + "learning_rate": 1.714285714285714e-06, + "loss": 0.0409, + "step": 804 + }, + { + "epoch": 0.08598021175027336, + "grad_norm": 19.235257083323877, + "learning_rate": 1.718550106609808e-06, + "loss": -0.3201, + "step": 806 + }, + { + "epoch": 0.0861935621516388, + "grad_norm": 13.755965707142774, + "learning_rate": 1.7228144989339018e-06, + "loss": -0.7414, + "step": 808 + }, + { + "epoch": 0.08640691255300424, + "grad_norm": 9.2611346408095, + "learning_rate": 1.7270788912579956e-06, + "loss": 0.543, + "step": 810 + }, + { + "epoch": 0.08662026295436968, + "grad_norm": 11.293068548525245, + "learning_rate": 1.7313432835820893e-06, + "loss": 0.5024, + "step": 812 + }, + { + "epoch": 0.08683361335573513, + "grad_norm": 8.43885851672634, + "learning_rate": 1.7356076759061833e-06, + "loss": -0.8492, + "step": 814 + }, + { + "epoch": 0.08704696375710057, + "grad_norm": 10.34717338583048, + "learning_rate": 1.7398720682302773e-06, + "loss": -0.8168, + "step": 816 + }, + { + "epoch": 0.08726031415846601, + "grad_norm": 4.346106915743504, + "learning_rate": 1.7441364605543709e-06, + "loss": -0.781, + "step": 818 + }, + { + "epoch": 0.08747366455983145, + "grad_norm": 16.261736385850632, + "learning_rate": 1.7484008528784648e-06, + "loss": 0.3626, + "step": 820 + }, + { + "epoch": 0.0876870149611969, + "grad_norm": 6.451053388102568, + "learning_rate": 1.7526652452025586e-06, + "loss": 0.1613, + "step": 822 + }, + { + "epoch": 0.08790036536256234, + "grad_norm": 13.971393729868783, + "learning_rate": 1.7569296375266524e-06, + "loss": 0.1887, + "step": 824 + }, + { + "epoch": 0.08811371576392778, + "grad_norm": 4.60426711294295, + "learning_rate": 1.7611940298507461e-06, + "loss": -0.0714, + "step": 826 + }, + { + "epoch": 0.08832706616529322, + "grad_norm": 7.788765445569996, + "learning_rate": 1.7654584221748401e-06, + "loss": 0.1057, + "step": 828 + }, + { + "epoch": 0.08854041656665866, + "grad_norm": 16.242118293664294, + "learning_rate": 1.7697228144989339e-06, + "loss": 0.2466, + "step": 830 + }, + { + "epoch": 0.0887537669680241, + "grad_norm": 14.723029688197625, + "learning_rate": 1.7739872068230277e-06, + "loss": -0.6136, + "step": 832 + }, + { + "epoch": 0.08896711736938955, + "grad_norm": 20.958772423479118, + "learning_rate": 1.7782515991471214e-06, + "loss": -1.257, + "step": 834 + }, + { + "epoch": 0.08918046777075499, + "grad_norm": 12.999192030716888, + "learning_rate": 1.7825159914712154e-06, + "loss": 0.7867, + "step": 836 + }, + { + "epoch": 0.08939381817212044, + "grad_norm": 8.749254011513045, + "learning_rate": 1.786780383795309e-06, + "loss": -0.1854, + "step": 838 + }, + { + "epoch": 0.08960716857348588, + "grad_norm": 28.375671216859047, + "learning_rate": 1.791044776119403e-06, + "loss": -1.1224, + "step": 840 + }, + { + "epoch": 0.08982051897485133, + "grad_norm": 11.982995716340508, + "learning_rate": 1.7953091684434967e-06, + "loss": 0.1255, + "step": 842 + }, + { + "epoch": 0.09003386937621677, + "grad_norm": 13.991706133756487, + "learning_rate": 1.7995735607675907e-06, + "loss": -0.8058, + "step": 844 + }, + { + "epoch": 0.09024721977758221, + "grad_norm": 6.919587436813207, + "learning_rate": 1.8038379530916842e-06, + "loss": -0.514, + "step": 846 + }, + { + "epoch": 0.09046057017894765, + "grad_norm": 8.081461324634596, + "learning_rate": 1.8081023454157782e-06, + "loss": -0.4794, + "step": 848 + }, + { + "epoch": 0.0906739205803131, + "grad_norm": 21.839476666247815, + "learning_rate": 1.812366737739872e-06, + "loss": -0.1143, + "step": 850 + }, + { + "epoch": 0.09088727098167854, + "grad_norm": 4.526543841561235, + "learning_rate": 1.8166311300639658e-06, + "loss": -0.2663, + "step": 852 + }, + { + "epoch": 0.09110062138304398, + "grad_norm": 4.910992516997152, + "learning_rate": 1.8208955223880595e-06, + "loss": -0.79, + "step": 854 + }, + { + "epoch": 0.09131397178440942, + "grad_norm": 13.643412575777333, + "learning_rate": 1.8251599147121535e-06, + "loss": 0.3456, + "step": 856 + }, + { + "epoch": 0.09152732218577486, + "grad_norm": 12.207406995564389, + "learning_rate": 1.829424307036247e-06, + "loss": -0.1668, + "step": 858 + }, + { + "epoch": 0.0917406725871403, + "grad_norm": 10.619915846442005, + "learning_rate": 1.833688699360341e-06, + "loss": 0.8485, + "step": 860 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 7.23403783091102, + "learning_rate": 1.8379530916844348e-06, + "loss": 0.0591, + "step": 862 + }, + { + "epoch": 0.09216737338987119, + "grad_norm": 8.49916268019599, + "learning_rate": 1.8422174840085288e-06, + "loss": 0.3622, + "step": 864 + }, + { + "epoch": 0.09238072379123663, + "grad_norm": 17.881493738990475, + "learning_rate": 1.8464818763326224e-06, + "loss": 0.5555, + "step": 866 + }, + { + "epoch": 0.09259407419260207, + "grad_norm": 12.811241804624386, + "learning_rate": 1.8507462686567163e-06, + "loss": 0.1441, + "step": 868 + }, + { + "epoch": 0.09280742459396751, + "grad_norm": 17.08876136574821, + "learning_rate": 1.85501066098081e-06, + "loss": -0.2161, + "step": 870 + }, + { + "epoch": 0.09302077499533296, + "grad_norm": 18.87972131413687, + "learning_rate": 1.859275053304904e-06, + "loss": -0.643, + "step": 872 + }, + { + "epoch": 0.0932341253966984, + "grad_norm": 19.214085243462733, + "learning_rate": 1.8635394456289976e-06, + "loss": -0.3228, + "step": 874 + }, + { + "epoch": 0.09344747579806384, + "grad_norm": 13.107944155833772, + "learning_rate": 1.8678038379530916e-06, + "loss": -0.0718, + "step": 876 + }, + { + "epoch": 0.09366082619942928, + "grad_norm": 11.902725378153237, + "learning_rate": 1.8720682302771856e-06, + "loss": 0.1609, + "step": 878 + }, + { + "epoch": 0.09387417660079474, + "grad_norm": 12.04227842087194, + "learning_rate": 1.8763326226012792e-06, + "loss": 0.542, + "step": 880 + }, + { + "epoch": 0.09408752700216018, + "grad_norm": 10.083129476715843, + "learning_rate": 1.8805970149253731e-06, + "loss": -0.2667, + "step": 882 + }, + { + "epoch": 0.09430087740352562, + "grad_norm": 13.758222232767348, + "learning_rate": 1.884861407249467e-06, + "loss": 0.8839, + "step": 884 + }, + { + "epoch": 0.09451422780489106, + "grad_norm": 26.46812082435916, + "learning_rate": 1.8891257995735609e-06, + "loss": -0.808, + "step": 886 + }, + { + "epoch": 0.0947275782062565, + "grad_norm": 22.4696755994172, + "learning_rate": 1.8933901918976544e-06, + "loss": 0.206, + "step": 888 + }, + { + "epoch": 0.09494092860762195, + "grad_norm": 13.650968128133929, + "learning_rate": 1.8976545842217484e-06, + "loss": -0.2053, + "step": 890 + }, + { + "epoch": 0.09515427900898739, + "grad_norm": 25.295891533832265, + "learning_rate": 1.9019189765458422e-06, + "loss": 0.5393, + "step": 892 + }, + { + "epoch": 0.09536762941035283, + "grad_norm": 4.717180645046515, + "learning_rate": 1.906183368869936e-06, + "loss": -0.1717, + "step": 894 + }, + { + "epoch": 0.09558097981171827, + "grad_norm": 3.980783546053527, + "learning_rate": 1.9104477611940297e-06, + "loss": -0.3342, + "step": 896 + }, + { + "epoch": 0.09579433021308371, + "grad_norm": 17.53966519035405, + "learning_rate": 1.9147121535181237e-06, + "loss": 0.1547, + "step": 898 + }, + { + "epoch": 0.09600768061444916, + "grad_norm": 37.64965817072852, + "learning_rate": 1.9189765458422173e-06, + "loss": -0.1405, + "step": 900 + }, + { + "epoch": 0.0962210310158146, + "grad_norm": 13.214607736783556, + "learning_rate": 1.9232409381663112e-06, + "loss": -0.6144, + "step": 902 + }, + { + "epoch": 0.09643438141718004, + "grad_norm": 13.755458504610731, + "learning_rate": 1.927505330490405e-06, + "loss": -0.5176, + "step": 904 + }, + { + "epoch": 0.09664773181854548, + "grad_norm": 13.160266215659087, + "learning_rate": 1.9317697228144988e-06, + "loss": -0.3805, + "step": 906 + }, + { + "epoch": 0.09686108221991092, + "grad_norm": 8.56427462579027, + "learning_rate": 1.9360341151385928e-06, + "loss": -0.3108, + "step": 908 + }, + { + "epoch": 0.09707443262127637, + "grad_norm": 7.732136866046191, + "learning_rate": 1.9402985074626867e-06, + "loss": -0.7029, + "step": 910 + }, + { + "epoch": 0.09728778302264181, + "grad_norm": 3.674972386186435, + "learning_rate": 1.9445628997867803e-06, + "loss": -0.171, + "step": 912 + }, + { + "epoch": 0.09750113342400725, + "grad_norm": 5.033223647050098, + "learning_rate": 1.9488272921108743e-06, + "loss": -0.3051, + "step": 914 + }, + { + "epoch": 0.09771448382537269, + "grad_norm": 6.185985978437299, + "learning_rate": 1.953091684434968e-06, + "loss": -0.2443, + "step": 916 + }, + { + "epoch": 0.09792783422673813, + "grad_norm": 12.390573810874344, + "learning_rate": 1.957356076759062e-06, + "loss": -0.9419, + "step": 918 + }, + { + "epoch": 0.09814118462810358, + "grad_norm": 33.36024010674656, + "learning_rate": 1.9616204690831554e-06, + "loss": -1.5106, + "step": 920 + }, + { + "epoch": 0.09835453502946902, + "grad_norm": 8.028565562422017, + "learning_rate": 1.9658848614072493e-06, + "loss": -0.0367, + "step": 922 + }, + { + "epoch": 0.09856788543083447, + "grad_norm": 11.244074568741834, + "learning_rate": 1.970149253731343e-06, + "loss": -0.198, + "step": 924 + }, + { + "epoch": 0.09878123583219992, + "grad_norm": 8.155488460294738, + "learning_rate": 1.974413646055437e-06, + "loss": -0.4533, + "step": 926 + }, + { + "epoch": 0.09899458623356536, + "grad_norm": 8.455296272398023, + "learning_rate": 1.978678038379531e-06, + "loss": -0.2075, + "step": 928 + }, + { + "epoch": 0.0992079366349308, + "grad_norm": 19.66763752484763, + "learning_rate": 1.982942430703625e-06, + "loss": -0.8438, + "step": 930 + }, + { + "epoch": 0.09942128703629624, + "grad_norm": 16.941346284552324, + "learning_rate": 1.9872068230277184e-06, + "loss": 0.2668, + "step": 932 + }, + { + "epoch": 0.09963463743766168, + "grad_norm": 30.19526185636887, + "learning_rate": 1.9914712153518124e-06, + "loss": 0.7882, + "step": 934 + }, + { + "epoch": 0.09984798783902712, + "grad_norm": 7.7652033417557185, + "learning_rate": 1.995735607675906e-06, + "loss": -0.5121, + "step": 936 + }, + { + "epoch": 0.10006133824039257, + "grad_norm": 10.002945250026167, + "learning_rate": 2e-06, + "loss": -0.4891, + "step": 938 + }, + { + "epoch": 0.10027468864175801, + "grad_norm": 9.562548994737613, + "learning_rate": 1.999999722631857e-06, + "loss": -0.1357, + "step": 940 + }, + { + "epoch": 0.10048803904312345, + "grad_norm": 9.18438104835198, + "learning_rate": 1.999998890527582e-06, + "loss": -1.313, + "step": 942 + }, + { + "epoch": 0.10070138944448889, + "grad_norm": 19.038858970547153, + "learning_rate": 1.9999975036876364e-06, + "loss": 0.5632, + "step": 944 + }, + { + "epoch": 0.10091473984585433, + "grad_norm": 8.880460982287538, + "learning_rate": 1.9999955621127898e-06, + "loss": -0.1845, + "step": 946 + }, + { + "epoch": 0.10112809024721978, + "grad_norm": 27.328430670871857, + "learning_rate": 1.999993065804119e-06, + "loss": -1.1547, + "step": 948 + }, + { + "epoch": 0.10134144064858522, + "grad_norm": 3.7529838788448946, + "learning_rate": 1.9999900147630093e-06, + "loss": 0.4223, + "step": 950 + }, + { + "epoch": 0.10155479104995066, + "grad_norm": 20.065464957241378, + "learning_rate": 1.9999864089911524e-06, + "loss": -0.6613, + "step": 952 + }, + { + "epoch": 0.1017681414513161, + "grad_norm": 6.44845480619213, + "learning_rate": 1.9999822484905493e-06, + "loss": -0.1856, + "step": 954 + }, + { + "epoch": 0.10198149185268154, + "grad_norm": 15.587441612646101, + "learning_rate": 1.9999775332635073e-06, + "loss": -0.659, + "step": 956 + }, + { + "epoch": 0.10219484225404699, + "grad_norm": 10.95597370533827, + "learning_rate": 1.9999722633126426e-06, + "loss": 0.1653, + "step": 958 + }, + { + "epoch": 0.10240819265541243, + "grad_norm": 8.043054511297433, + "learning_rate": 1.9999664386408786e-06, + "loss": -0.5782, + "step": 960 + }, + { + "epoch": 0.10262154305677787, + "grad_norm": 19.158675843982078, + "learning_rate": 1.9999600592514464e-06, + "loss": -0.2572, + "step": 962 + }, + { + "epoch": 0.10283489345814331, + "grad_norm": 6.444667549256743, + "learning_rate": 1.9999531251478848e-06, + "loss": -0.3405, + "step": 964 + }, + { + "epoch": 0.10304824385950877, + "grad_norm": 59.77200968326433, + "learning_rate": 1.9999456363340406e-06, + "loss": -0.3214, + "step": 966 + }, + { + "epoch": 0.10326159426087421, + "grad_norm": 6.48727133868538, + "learning_rate": 1.999937592814068e-06, + "loss": 0.193, + "step": 968 + }, + { + "epoch": 0.10347494466223965, + "grad_norm": 9.899885529844648, + "learning_rate": 1.999928994592429e-06, + "loss": -0.8417, + "step": 970 + }, + { + "epoch": 0.1036882950636051, + "grad_norm": 10.671301533657493, + "learning_rate": 1.999919841673893e-06, + "loss": 0.6551, + "step": 972 + }, + { + "epoch": 0.10390164546497054, + "grad_norm": 18.54534340015214, + "learning_rate": 1.999910134063538e-06, + "loss": -0.2989, + "step": 974 + }, + { + "epoch": 0.10411499586633598, + "grad_norm": 4.6709829991723835, + "learning_rate": 1.999899871766749e-06, + "loss": 0.0672, + "step": 976 + }, + { + "epoch": 0.10432834626770142, + "grad_norm": 5.48735260106571, + "learning_rate": 1.9998890547892183e-06, + "loss": -0.6261, + "step": 978 + }, + { + "epoch": 0.10454169666906686, + "grad_norm": 16.54236866901039, + "learning_rate": 1.9998776831369476e-06, + "loss": -0.2955, + "step": 980 + }, + { + "epoch": 0.1047550470704323, + "grad_norm": 18.85187898982959, + "learning_rate": 1.9998657568162446e-06, + "loss": 0.1206, + "step": 982 + }, + { + "epoch": 0.10496839747179774, + "grad_norm": 12.054223122838927, + "learning_rate": 1.999853275833725e-06, + "loss": 0.4311, + "step": 984 + }, + { + "epoch": 0.10518174787316319, + "grad_norm": 12.198982090572391, + "learning_rate": 1.9998402401963128e-06, + "loss": -0.442, + "step": 986 + }, + { + "epoch": 0.10539509827452863, + "grad_norm": 37.98649241207374, + "learning_rate": 1.999826649911239e-06, + "loss": -1.3668, + "step": 988 + }, + { + "epoch": 0.10560844867589407, + "grad_norm": 11.038918775818233, + "learning_rate": 1.9998125049860433e-06, + "loss": 0.6653, + "step": 990 + }, + { + "epoch": 0.10582179907725951, + "grad_norm": 6.476149411837476, + "learning_rate": 1.999797805428572e-06, + "loss": 0.2109, + "step": 992 + }, + { + "epoch": 0.10603514947862495, + "grad_norm": 15.406789152771664, + "learning_rate": 1.999782551246979e-06, + "loss": -0.5677, + "step": 994 + }, + { + "epoch": 0.1062484998799904, + "grad_norm": 29.873738526715083, + "learning_rate": 1.999766742449727e-06, + "loss": -1.3612, + "step": 996 + }, + { + "epoch": 0.10646185028135584, + "grad_norm": 12.480796329993506, + "learning_rate": 1.999750379045585e-06, + "loss": -0.3667, + "step": 998 + }, + { + "epoch": 0.10667520068272128, + "grad_norm": 13.76196266074063, + "learning_rate": 1.9997334610436318e-06, + "loss": 0.1408, + "step": 1000 + }, + { + "epoch": 0.10688855108408672, + "grad_norm": 7.342086872809186, + "learning_rate": 1.999715988453251e-06, + "loss": -0.1217, + "step": 1002 + }, + { + "epoch": 0.10710190148545216, + "grad_norm": 24.793407128549426, + "learning_rate": 1.9996979612841357e-06, + "loss": -0.2125, + "step": 1004 + }, + { + "epoch": 0.1073152518868176, + "grad_norm": 12.462548496218666, + "learning_rate": 1.9996793795462862e-06, + "loss": -0.9109, + "step": 1006 + }, + { + "epoch": 0.10752860228818305, + "grad_norm": 22.456483713157958, + "learning_rate": 1.9996602432500107e-06, + "loss": -0.8032, + "step": 1008 + }, + { + "epoch": 0.1077419526895485, + "grad_norm": 8.279610471346707, + "learning_rate": 1.999640552405925e-06, + "loss": 0.5294, + "step": 1010 + }, + { + "epoch": 0.10795530309091395, + "grad_norm": 8.492474727142435, + "learning_rate": 1.9996203070249514e-06, + "loss": 0.345, + "step": 1012 + }, + { + "epoch": 0.10816865349227939, + "grad_norm": 10.855775876716889, + "learning_rate": 1.9995995071183215e-06, + "loss": -0.0323, + "step": 1014 + }, + { + "epoch": 0.10838200389364483, + "grad_norm": 13.044679276302682, + "learning_rate": 1.999578152697574e-06, + "loss": -0.9284, + "step": 1016 + }, + { + "epoch": 0.10859535429501027, + "grad_norm": 13.041568910636132, + "learning_rate": 1.999556243774554e-06, + "loss": 0.0727, + "step": 1018 + }, + { + "epoch": 0.10880870469637571, + "grad_norm": 8.840285871574821, + "learning_rate": 1.9995337803614165e-06, + "loss": -0.8054, + "step": 1020 + }, + { + "epoch": 0.10902205509774116, + "grad_norm": 11.045434971938723, + "learning_rate": 1.999510762470621e-06, + "loss": -0.4123, + "step": 1022 + }, + { + "epoch": 0.1092354054991066, + "grad_norm": 9.987536744093788, + "learning_rate": 1.999487190114938e-06, + "loss": -0.276, + "step": 1024 + }, + { + "epoch": 0.10944875590047204, + "grad_norm": 21.660050733355092, + "learning_rate": 1.9994630633074433e-06, + "loss": 0.0614, + "step": 1026 + }, + { + "epoch": 0.10966210630183748, + "grad_norm": 13.372785558415854, + "learning_rate": 1.9994383820615212e-06, + "loss": -0.5344, + "step": 1028 + }, + { + "epoch": 0.10987545670320292, + "grad_norm": 12.356702814123958, + "learning_rate": 1.9994131463908624e-06, + "loss": -0.1296, + "step": 1030 + }, + { + "epoch": 0.11008880710456836, + "grad_norm": 4.739222690282173, + "learning_rate": 1.999387356309467e-06, + "loss": -0.0715, + "step": 1032 + }, + { + "epoch": 0.1103021575059338, + "grad_norm": 2.353063878699358, + "learning_rate": 1.9993610118316415e-06, + "loss": -0.0108, + "step": 1034 + }, + { + "epoch": 0.11051550790729925, + "grad_norm": 9.018922760973295, + "learning_rate": 1.9993341129719997e-06, + "loss": 0.1682, + "step": 1036 + }, + { + "epoch": 0.11072885830866469, + "grad_norm": 11.347020997940431, + "learning_rate": 1.9993066597454637e-06, + "loss": 0.503, + "step": 1038 + }, + { + "epoch": 0.11094220871003013, + "grad_norm": 6.599406732799808, + "learning_rate": 1.9992786521672633e-06, + "loss": -1.0294, + "step": 1040 + }, + { + "epoch": 0.11115555911139557, + "grad_norm": 10.664927908662923, + "learning_rate": 1.999250090252934e-06, + "loss": -0.0274, + "step": 1042 + }, + { + "epoch": 0.11136890951276102, + "grad_norm": 12.59123280862704, + "learning_rate": 1.9992209740183212e-06, + "loss": -1.3216, + "step": 1044 + }, + { + "epoch": 0.11158225991412646, + "grad_norm": 12.964624626643758, + "learning_rate": 1.9991913034795767e-06, + "loss": 0.4125, + "step": 1046 + }, + { + "epoch": 0.1117956103154919, + "grad_norm": 4.273338653928415, + "learning_rate": 1.9991610786531593e-06, + "loss": -0.4966, + "step": 1048 + }, + { + "epoch": 0.11200896071685734, + "grad_norm": 5.088476515803206, + "learning_rate": 1.999130299555836e-06, + "loss": 0.2971, + "step": 1050 + }, + { + "epoch": 0.1122223111182228, + "grad_norm": 6.974108763676147, + "learning_rate": 1.9990989662046816e-06, + "loss": -0.7651, + "step": 1052 + }, + { + "epoch": 0.11243566151958824, + "grad_norm": 5.249430049582833, + "learning_rate": 1.999067078617077e-06, + "loss": -0.5608, + "step": 1054 + }, + { + "epoch": 0.11264901192095368, + "grad_norm": 15.65538195259475, + "learning_rate": 1.999034636810712e-06, + "loss": -0.5415, + "step": 1056 + }, + { + "epoch": 0.11286236232231912, + "grad_norm": 5.252233548968885, + "learning_rate": 1.999001640803583e-06, + "loss": -0.7343, + "step": 1058 + }, + { + "epoch": 0.11307571272368457, + "grad_norm": 8.691749940228224, + "learning_rate": 1.998968090613994e-06, + "loss": -0.418, + "step": 1060 + }, + { + "epoch": 0.11328906312505001, + "grad_norm": 13.48795251406841, + "learning_rate": 1.998933986260557e-06, + "loss": -0.7516, + "step": 1062 + }, + { + "epoch": 0.11350241352641545, + "grad_norm": 19.397334567221574, + "learning_rate": 1.99889932776219e-06, + "loss": 0.3052, + "step": 1064 + }, + { + "epoch": 0.11371576392778089, + "grad_norm": 13.172606242995984, + "learning_rate": 1.99886411513812e-06, + "loss": -0.4304, + "step": 1066 + }, + { + "epoch": 0.11392911432914633, + "grad_norm": 28.550101024674554, + "learning_rate": 1.9988283484078813e-06, + "loss": 0.2051, + "step": 1068 + }, + { + "epoch": 0.11414246473051178, + "grad_norm": 11.786086088715077, + "learning_rate": 1.9987920275913135e-06, + "loss": 0.5976, + "step": 1070 + }, + { + "epoch": 0.11435581513187722, + "grad_norm": 20.47705125595093, + "learning_rate": 1.9987551527085665e-06, + "loss": -0.6878, + "step": 1072 + }, + { + "epoch": 0.11456916553324266, + "grad_norm": 17.256084624740367, + "learning_rate": 1.9987177237800954e-06, + "loss": -1.1181, + "step": 1074 + }, + { + "epoch": 0.1147825159346081, + "grad_norm": 20.121126793429465, + "learning_rate": 1.9986797408266633e-06, + "loss": -1.1092, + "step": 1076 + }, + { + "epoch": 0.11499586633597354, + "grad_norm": 9.787270565538346, + "learning_rate": 1.998641203869341e-06, + "loss": -0.0358, + "step": 1078 + }, + { + "epoch": 0.11520921673733898, + "grad_norm": 10.072654902116856, + "learning_rate": 1.9986021129295067e-06, + "loss": -1.121, + "step": 1080 + }, + { + "epoch": 0.11542256713870443, + "grad_norm": 26.902121573368476, + "learning_rate": 1.9985624680288445e-06, + "loss": -0.1283, + "step": 1082 + }, + { + "epoch": 0.11563591754006987, + "grad_norm": 6.9266710792537936, + "learning_rate": 1.998522269189348e-06, + "loss": -0.1049, + "step": 1084 + }, + { + "epoch": 0.11584926794143531, + "grad_norm": 11.110711462850878, + "learning_rate": 1.998481516433316e-06, + "loss": -0.51, + "step": 1086 + }, + { + "epoch": 0.11606261834280075, + "grad_norm": 12.887226914453029, + "learning_rate": 1.9984402097833563e-06, + "loss": -0.1358, + "step": 1088 + }, + { + "epoch": 0.1162759687441662, + "grad_norm": 11.597183300597381, + "learning_rate": 1.998398349262383e-06, + "loss": -1.1465, + "step": 1090 + }, + { + "epoch": 0.11648931914553164, + "grad_norm": 9.612007934917326, + "learning_rate": 1.9983559348936175e-06, + "loss": -0.4243, + "step": 1092 + }, + { + "epoch": 0.11670266954689708, + "grad_norm": 7.480442846976291, + "learning_rate": 1.9983129667005884e-06, + "loss": -0.8296, + "step": 1094 + }, + { + "epoch": 0.11691601994826253, + "grad_norm": 11.901507316301505, + "learning_rate": 1.998269444707132e-06, + "loss": 0.2059, + "step": 1096 + }, + { + "epoch": 0.11712937034962798, + "grad_norm": 11.84934594902733, + "learning_rate": 1.9982253689373918e-06, + "loss": -0.7015, + "step": 1098 + }, + { + "epoch": 0.11734272075099342, + "grad_norm": 18.98160723910663, + "learning_rate": 1.9981807394158177e-06, + "loss": 0.3484, + "step": 1100 + }, + { + "epoch": 0.11755607115235886, + "grad_norm": 9.999100561355233, + "learning_rate": 1.9981355561671677e-06, + "loss": -0.0919, + "step": 1102 + }, + { + "epoch": 0.1177694215537243, + "grad_norm": 8.253006466198785, + "learning_rate": 1.9980898192165063e-06, + "loss": -0.3401, + "step": 1104 + }, + { + "epoch": 0.11798277195508974, + "grad_norm": 21.38855116207188, + "learning_rate": 1.998043528589205e-06, + "loss": -0.2649, + "step": 1106 + }, + { + "epoch": 0.11819612235645519, + "grad_norm": 17.581276449720193, + "learning_rate": 1.9979966843109445e-06, + "loss": -0.7882, + "step": 1108 + }, + { + "epoch": 0.11840947275782063, + "grad_norm": 17.881994136606902, + "learning_rate": 1.9979492864077094e-06, + "loss": -0.9395, + "step": 1110 + }, + { + "epoch": 0.11862282315918607, + "grad_norm": 27.450698807590868, + "learning_rate": 1.9979013349057932e-06, + "loss": -1.1812, + "step": 1112 + }, + { + "epoch": 0.11883617356055151, + "grad_norm": 11.85016744188475, + "learning_rate": 1.997852829831797e-06, + "loss": -0.2322, + "step": 1114 + }, + { + "epoch": 0.11904952396191695, + "grad_norm": 19.57162822354579, + "learning_rate": 1.997803771212629e-06, + "loss": -0.2706, + "step": 1116 + }, + { + "epoch": 0.1192628743632824, + "grad_norm": 10.830735208685383, + "learning_rate": 1.997754159075502e-06, + "loss": -1.1194, + "step": 1118 + }, + { + "epoch": 0.11947622476464784, + "grad_norm": 4.730754320481347, + "learning_rate": 1.9977039934479385e-06, + "loss": -0.7898, + "step": 1120 + }, + { + "epoch": 0.11968957516601328, + "grad_norm": 8.745657757788235, + "learning_rate": 1.9976532743577673e-06, + "loss": 0.0379, + "step": 1122 + }, + { + "epoch": 0.11990292556737872, + "grad_norm": 28.813446860024147, + "learning_rate": 1.9976020018331243e-06, + "loss": 0.3645, + "step": 1124 + }, + { + "epoch": 0.12011627596874416, + "grad_norm": 10.440814030501423, + "learning_rate": 1.9975501759024517e-06, + "loss": -0.0548, + "step": 1126 + }, + { + "epoch": 0.1203296263701096, + "grad_norm": 11.416071472081667, + "learning_rate": 1.9974977965945e-06, + "loss": 0.2929, + "step": 1128 + }, + { + "epoch": 0.12054297677147505, + "grad_norm": 8.441152040615334, + "learning_rate": 1.9974448639383244e-06, + "loss": -0.6188, + "step": 1130 + }, + { + "epoch": 0.12075632717284049, + "grad_norm": 5.663303106994784, + "learning_rate": 1.9973913779632904e-06, + "loss": -0.4657, + "step": 1132 + }, + { + "epoch": 0.12096967757420593, + "grad_norm": 18.35892778992275, + "learning_rate": 1.9973373386990674e-06, + "loss": 0.2044, + "step": 1134 + }, + { + "epoch": 0.12118302797557137, + "grad_norm": 23.684412769651576, + "learning_rate": 1.9972827461756334e-06, + "loss": -0.4693, + "step": 1136 + }, + { + "epoch": 0.12139637837693683, + "grad_norm": 14.171496402604255, + "learning_rate": 1.997227600423273e-06, + "loss": -0.6943, + "step": 1138 + }, + { + "epoch": 0.12160972877830227, + "grad_norm": 13.922460920458215, + "learning_rate": 1.9971719014725768e-06, + "loss": 0.8172, + "step": 1140 + }, + { + "epoch": 0.12182307917966771, + "grad_norm": 21.72636134013763, + "learning_rate": 1.9971156493544437e-06, + "loss": -0.4408, + "step": 1142 + }, + { + "epoch": 0.12203642958103315, + "grad_norm": 20.82069946728396, + "learning_rate": 1.9970588441000787e-06, + "loss": -0.3291, + "step": 1144 + }, + { + "epoch": 0.1222497799823986, + "grad_norm": 15.957243371369229, + "learning_rate": 1.9970014857409936e-06, + "loss": -0.3534, + "step": 1146 + }, + { + "epoch": 0.12246313038376404, + "grad_norm": 13.228394989772614, + "learning_rate": 1.9969435743090076e-06, + "loss": -0.4047, + "step": 1148 + }, + { + "epoch": 0.12267648078512948, + "grad_norm": 9.336630176087915, + "learning_rate": 1.9968851098362455e-06, + "loss": -0.2281, + "step": 1150 + }, + { + "epoch": 0.12288983118649492, + "grad_norm": 21.372600065820446, + "learning_rate": 1.9968260923551405e-06, + "loss": -0.1527, + "step": 1152 + }, + { + "epoch": 0.12310318158786036, + "grad_norm": 10.49080827854095, + "learning_rate": 1.9967665218984306e-06, + "loss": 0.2273, + "step": 1154 + }, + { + "epoch": 0.1233165319892258, + "grad_norm": 26.54581332312223, + "learning_rate": 1.996706398499163e-06, + "loss": -0.0176, + "step": 1156 + }, + { + "epoch": 0.12352988239059125, + "grad_norm": 10.60684954766727, + "learning_rate": 1.9966457221906893e-06, + "loss": -0.7239, + "step": 1158 + }, + { + "epoch": 0.12374323279195669, + "grad_norm": 18.45210289211234, + "learning_rate": 1.9965844930066696e-06, + "loss": -0.4982, + "step": 1160 + }, + { + "epoch": 0.12395658319332213, + "grad_norm": 20.290379711344247, + "learning_rate": 1.9965227109810694e-06, + "loss": -0.1957, + "step": 1162 + }, + { + "epoch": 0.12416993359468757, + "grad_norm": 10.365692652900515, + "learning_rate": 1.9964603761481623e-06, + "loss": 0.3669, + "step": 1164 + }, + { + "epoch": 0.12438328399605302, + "grad_norm": 13.125311380390214, + "learning_rate": 1.996397488542526e-06, + "loss": -0.3221, + "step": 1166 + }, + { + "epoch": 0.12459663439741846, + "grad_norm": 14.361058641237294, + "learning_rate": 1.9963340481990486e-06, + "loss": 0.1227, + "step": 1168 + }, + { + "epoch": 0.1248099847987839, + "grad_norm": 7.056112272117484, + "learning_rate": 1.9962700551529214e-06, + "loss": 0.0827, + "step": 1170 + }, + { + "epoch": 0.12502333520014935, + "grad_norm": 20.11483934762056, + "learning_rate": 1.996205509439644e-06, + "loss": -1.1759, + "step": 1172 + }, + { + "epoch": 0.1252366856015148, + "grad_norm": 9.441979966620615, + "learning_rate": 1.996140411095022e-06, + "loss": -0.2787, + "step": 1174 + }, + { + "epoch": 0.12545003600288024, + "grad_norm": 6.0147967546848395, + "learning_rate": 1.9960747601551686e-06, + "loss": -0.791, + "step": 1176 + }, + { + "epoch": 0.12566338640424568, + "grad_norm": 27.231381943588612, + "learning_rate": 1.9960085566565015e-06, + "loss": -0.3671, + "step": 1178 + }, + { + "epoch": 0.12587673680561112, + "grad_norm": 9.81504324283188, + "learning_rate": 1.9959418006357476e-06, + "loss": 0.398, + "step": 1180 + }, + { + "epoch": 0.12609008720697656, + "grad_norm": 9.244327583663429, + "learning_rate": 1.995874492129938e-06, + "loss": -0.4957, + "step": 1182 + }, + { + "epoch": 0.126303437608342, + "grad_norm": 12.948378749756142, + "learning_rate": 1.995806631176411e-06, + "loss": -0.6401, + "step": 1184 + }, + { + "epoch": 0.12651678800970745, + "grad_norm": 14.684805388151638, + "learning_rate": 1.9957382178128122e-06, + "loss": -0.7437, + "step": 1186 + }, + { + "epoch": 0.1267301384110729, + "grad_norm": 12.31300809828398, + "learning_rate": 1.995669252077093e-06, + "loss": 0.1044, + "step": 1188 + }, + { + "epoch": 0.12694348881243833, + "grad_norm": 11.76491596226188, + "learning_rate": 1.9955997340075107e-06, + "loss": -0.4917, + "step": 1190 + }, + { + "epoch": 0.12715683921380377, + "grad_norm": 10.89131589218926, + "learning_rate": 1.9955296636426294e-06, + "loss": -0.0883, + "step": 1192 + }, + { + "epoch": 0.12737018961516922, + "grad_norm": 7.320449762607573, + "learning_rate": 1.9954590410213204e-06, + "loss": -0.4345, + "step": 1194 + }, + { + "epoch": 0.12758354001653466, + "grad_norm": 7.738748707298395, + "learning_rate": 1.9953878661827603e-06, + "loss": -0.1405, + "step": 1196 + }, + { + "epoch": 0.1277968904179001, + "grad_norm": 29.041058801442883, + "learning_rate": 1.9953161391664314e-06, + "loss": -0.0113, + "step": 1198 + }, + { + "epoch": 0.12801024081926554, + "grad_norm": 18.64878794716449, + "learning_rate": 1.9952438600121247e-06, + "loss": -0.3338, + "step": 1200 + }, + { + "epoch": 0.12822359122063098, + "grad_norm": 12.138740310190034, + "learning_rate": 1.995171028759936e-06, + "loss": 0.1787, + "step": 1202 + }, + { + "epoch": 0.12843694162199643, + "grad_norm": 9.910448331113573, + "learning_rate": 1.9950976454502658e-06, + "loss": -0.1987, + "step": 1204 + }, + { + "epoch": 0.12865029202336187, + "grad_norm": 8.303854195961756, + "learning_rate": 1.995023710123824e-06, + "loss": -0.3242, + "step": 1206 + }, + { + "epoch": 0.1288636424247273, + "grad_norm": 2.8280360825998887, + "learning_rate": 1.9949492228216255e-06, + "loss": -0.5063, + "step": 1208 + }, + { + "epoch": 0.12907699282609275, + "grad_norm": 24.78519545923043, + "learning_rate": 1.99487418358499e-06, + "loss": -0.6436, + "step": 1210 + }, + { + "epoch": 0.1292903432274582, + "grad_norm": 5.247991941700894, + "learning_rate": 1.994798592455545e-06, + "loss": -0.3478, + "step": 1212 + }, + { + "epoch": 0.12950369362882364, + "grad_norm": 12.131817051335103, + "learning_rate": 1.9947224494752233e-06, + "loss": -0.4232, + "step": 1214 + }, + { + "epoch": 0.12971704403018908, + "grad_norm": 9.201909172979075, + "learning_rate": 1.9946457546862645e-06, + "loss": -0.1835, + "step": 1216 + }, + { + "epoch": 0.12993039443155452, + "grad_norm": 4.964833051326164, + "learning_rate": 1.994568508131214e-06, + "loss": -0.7964, + "step": 1218 + }, + { + "epoch": 0.13014374483291996, + "grad_norm": 9.786105618411568, + "learning_rate": 1.9944907098529234e-06, + "loss": -1.2816, + "step": 1220 + }, + { + "epoch": 0.1303570952342854, + "grad_norm": 12.270192445358681, + "learning_rate": 1.9944123598945498e-06, + "loss": 0.3211, + "step": 1222 + }, + { + "epoch": 0.13057044563565084, + "grad_norm": 16.548756210800725, + "learning_rate": 1.994333458299557e-06, + "loss": 0.202, + "step": 1224 + }, + { + "epoch": 0.1307837960370163, + "grad_norm": 9.753353434876177, + "learning_rate": 1.9942540051117148e-06, + "loss": 0.1426, + "step": 1226 + }, + { + "epoch": 0.13099714643838173, + "grad_norm": 11.58213155194585, + "learning_rate": 1.9941740003750984e-06, + "loss": -0.2848, + "step": 1228 + }, + { + "epoch": 0.13121049683974717, + "grad_norm": 13.290613366769495, + "learning_rate": 1.994093444134089e-06, + "loss": -0.0448, + "step": 1230 + }, + { + "epoch": 0.1314238472411126, + "grad_norm": 12.964788671743946, + "learning_rate": 1.9940123364333753e-06, + "loss": -0.1976, + "step": 1232 + }, + { + "epoch": 0.13163719764247805, + "grad_norm": 26.125246341050705, + "learning_rate": 1.9939306773179494e-06, + "loss": 0.1018, + "step": 1234 + }, + { + "epoch": 0.1318505480438435, + "grad_norm": 21.460777062383563, + "learning_rate": 1.9938484668331112e-06, + "loss": -0.8494, + "step": 1236 + }, + { + "epoch": 0.13206389844520894, + "grad_norm": 14.475281026483652, + "learning_rate": 1.993765705024466e-06, + "loss": -0.0824, + "step": 1238 + }, + { + "epoch": 0.13227724884657438, + "grad_norm": 19.194731333848527, + "learning_rate": 1.993682391937924e-06, + "loss": -0.8551, + "step": 1240 + }, + { + "epoch": 0.13249059924793982, + "grad_norm": 8.025263166829147, + "learning_rate": 1.993598527619703e-06, + "loss": -0.277, + "step": 1242 + }, + { + "epoch": 0.13270394964930526, + "grad_norm": 7.417238847002507, + "learning_rate": 1.9935141121163247e-06, + "loss": -1.7068, + "step": 1244 + }, + { + "epoch": 0.13291730005067073, + "grad_norm": 6.013712861217545, + "learning_rate": 1.993429145474618e-06, + "loss": -0.0375, + "step": 1246 + }, + { + "epoch": 0.13313065045203618, + "grad_norm": 6.222089997970306, + "learning_rate": 1.9933436277417167e-06, + "loss": -0.2544, + "step": 1248 + }, + { + "epoch": 0.13334400085340162, + "grad_norm": 8.644621723582299, + "learning_rate": 1.9932575589650607e-06, + "loss": -0.6859, + "step": 1250 + }, + { + "epoch": 0.13355735125476706, + "grad_norm": 8.81918821940543, + "learning_rate": 1.993170939192396e-06, + "loss": 0.3251, + "step": 1252 + }, + { + "epoch": 0.1337707016561325, + "grad_norm": 9.151176110701087, + "learning_rate": 1.9930837684717724e-06, + "loss": 0.5739, + "step": 1254 + }, + { + "epoch": 0.13398405205749794, + "grad_norm": 20.48823759322151, + "learning_rate": 1.9929960468515477e-06, + "loss": 0.2761, + "step": 1256 + }, + { + "epoch": 0.13419740245886339, + "grad_norm": 12.154278550654555, + "learning_rate": 1.9929077743803843e-06, + "loss": -0.3458, + "step": 1258 + }, + { + "epoch": 0.13441075286022883, + "grad_norm": 14.265983416521257, + "learning_rate": 1.9928189511072497e-06, + "loss": 0.8493, + "step": 1260 + }, + { + "epoch": 0.13462410326159427, + "grad_norm": 4.676628174710065, + "learning_rate": 1.9927295770814173e-06, + "loss": 0.2909, + "step": 1262 + }, + { + "epoch": 0.1348374536629597, + "grad_norm": 9.90417389583978, + "learning_rate": 1.9926396523524665e-06, + "loss": -1.2819, + "step": 1264 + }, + { + "epoch": 0.13505080406432515, + "grad_norm": 9.304209273056328, + "learning_rate": 1.9925491769702822e-06, + "loss": -0.5668, + "step": 1266 + }, + { + "epoch": 0.1352641544656906, + "grad_norm": 7.4993572899829335, + "learning_rate": 1.992458150985053e-06, + "loss": -0.2587, + "step": 1268 + }, + { + "epoch": 0.13547750486705604, + "grad_norm": 8.723865402043476, + "learning_rate": 1.992366574447276e-06, + "loss": 0.1551, + "step": 1270 + }, + { + "epoch": 0.13569085526842148, + "grad_norm": 5.057075668658937, + "learning_rate": 1.9922744474077508e-06, + "loss": -0.3648, + "step": 1272 + }, + { + "epoch": 0.13590420566978692, + "grad_norm": 16.770112625398436, + "learning_rate": 1.9921817699175844e-06, + "loss": -1.0814, + "step": 1274 + }, + { + "epoch": 0.13611755607115236, + "grad_norm": 33.36589858143468, + "learning_rate": 1.9920885420281872e-06, + "loss": -0.4457, + "step": 1276 + }, + { + "epoch": 0.1363309064725178, + "grad_norm": 6.946967661217, + "learning_rate": 1.9919947637912777e-06, + "loss": -1.0567, + "step": 1278 + }, + { + "epoch": 0.13654425687388325, + "grad_norm": 8.519674061007464, + "learning_rate": 1.9919004352588765e-06, + "loss": -0.2459, + "step": 1280 + }, + { + "epoch": 0.1367576072752487, + "grad_norm": 17.53676710023234, + "learning_rate": 1.9918055564833123e-06, + "loss": 0.1388, + "step": 1282 + }, + { + "epoch": 0.13697095767661413, + "grad_norm": 20.708957064208715, + "learning_rate": 1.9917101275172173e-06, + "loss": -0.1265, + "step": 1284 + }, + { + "epoch": 0.13718430807797957, + "grad_norm": 8.901672321260623, + "learning_rate": 1.9916141484135297e-06, + "loss": -0.4476, + "step": 1286 + }, + { + "epoch": 0.13739765847934501, + "grad_norm": 7.343042152992284, + "learning_rate": 1.991517619225492e-06, + "loss": -0.5943, + "step": 1288 + }, + { + "epoch": 0.13761100888071046, + "grad_norm": 27.76539957519422, + "learning_rate": 1.9914205400066527e-06, + "loss": -0.3234, + "step": 1290 + }, + { + "epoch": 0.1378243592820759, + "grad_norm": 50.99311574078685, + "learning_rate": 1.9913229108108657e-06, + "loss": -1.0094, + "step": 1292 + }, + { + "epoch": 0.13803770968344134, + "grad_norm": 14.695157878092305, + "learning_rate": 1.991224731692288e-06, + "loss": -1.3651, + "step": 1294 + }, + { + "epoch": 0.13825106008480678, + "grad_norm": 3.960187280725374, + "learning_rate": 1.9911260027053853e-06, + "loss": -0.5186, + "step": 1296 + }, + { + "epoch": 0.13846441048617222, + "grad_norm": 12.529647758062435, + "learning_rate": 1.9910267239049244e-06, + "loss": -0.368, + "step": 1298 + }, + { + "epoch": 0.13867776088753767, + "grad_norm": 20.251391963231992, + "learning_rate": 1.9909268953459796e-06, + "loss": 0.193, + "step": 1300 + }, + { + "epoch": 0.1388911112889031, + "grad_norm": 8.363591199706423, + "learning_rate": 1.9908265170839287e-06, + "loss": -1.2309, + "step": 1302 + }, + { + "epoch": 0.13910446169026855, + "grad_norm": 19.744142730100023, + "learning_rate": 1.990725589174456e-06, + "loss": -0.8554, + "step": 1304 + }, + { + "epoch": 0.139317812091634, + "grad_norm": 12.061297088872797, + "learning_rate": 1.99062411167355e-06, + "loss": -0.0811, + "step": 1306 + }, + { + "epoch": 0.13953116249299943, + "grad_norm": 11.228939123696309, + "learning_rate": 1.990522084637503e-06, + "loss": -0.0931, + "step": 1308 + }, + { + "epoch": 0.13974451289436488, + "grad_norm": 12.144929824556511, + "learning_rate": 1.990419508122914e-06, + "loss": -0.4765, + "step": 1310 + }, + { + "epoch": 0.13995786329573032, + "grad_norm": 20.656060496728355, + "learning_rate": 1.9903163821866854e-06, + "loss": 0.2659, + "step": 1312 + }, + { + "epoch": 0.14017121369709576, + "grad_norm": 14.649962220479885, + "learning_rate": 1.9902127068860254e-06, + "loss": 0.4123, + "step": 1314 + }, + { + "epoch": 0.1403845640984612, + "grad_norm": 10.2750178410792, + "learning_rate": 1.9901084822784454e-06, + "loss": 0.0955, + "step": 1316 + }, + { + "epoch": 0.14059791449982664, + "grad_norm": 5.90043808850163, + "learning_rate": 1.9900037084217634e-06, + "loss": -1.1914, + "step": 1318 + }, + { + "epoch": 0.14081126490119208, + "grad_norm": 15.885552307589213, + "learning_rate": 1.9898983853741016e-06, + "loss": 0.1728, + "step": 1320 + }, + { + "epoch": 0.14102461530255753, + "grad_norm": 9.891069798654515, + "learning_rate": 1.989792513193886e-06, + "loss": 0.1657, + "step": 1322 + }, + { + "epoch": 0.14123796570392297, + "grad_norm": 7.782182874303118, + "learning_rate": 1.9896860919398477e-06, + "loss": -0.3856, + "step": 1324 + }, + { + "epoch": 0.1414513161052884, + "grad_norm": 11.641176185781765, + "learning_rate": 1.989579121671022e-06, + "loss": -0.3068, + "step": 1326 + }, + { + "epoch": 0.14166466650665385, + "grad_norm": 4.805403193069003, + "learning_rate": 1.98947160244675e-06, + "loss": -0.4986, + "step": 1328 + }, + { + "epoch": 0.14187801690801932, + "grad_norm": 16.862956588429178, + "learning_rate": 1.9893635343266765e-06, + "loss": 0.3268, + "step": 1330 + }, + { + "epoch": 0.14209136730938476, + "grad_norm": 7.248484194373781, + "learning_rate": 1.9892549173707506e-06, + "loss": -0.0999, + "step": 1332 + }, + { + "epoch": 0.1423047177107502, + "grad_norm": 14.217945557585596, + "learning_rate": 1.9891457516392255e-06, + "loss": 0.2878, + "step": 1334 + }, + { + "epoch": 0.14251806811211565, + "grad_norm": 9.841439441353879, + "learning_rate": 1.9890360371926603e-06, + "loss": -0.6387, + "step": 1336 + }, + { + "epoch": 0.1427314185134811, + "grad_norm": 15.714978276816597, + "learning_rate": 1.9889257740919173e-06, + "loss": -0.0362, + "step": 1338 + }, + { + "epoch": 0.14294476891484653, + "grad_norm": 24.38558873057061, + "learning_rate": 1.988814962398163e-06, + "loss": -0.1845, + "step": 1340 + }, + { + "epoch": 0.14315811931621197, + "grad_norm": 19.37524940612685, + "learning_rate": 1.988703602172869e-06, + "loss": -0.051, + "step": 1342 + }, + { + "epoch": 0.14337146971757742, + "grad_norm": 17.53928719020653, + "learning_rate": 1.988591693477811e-06, + "loss": 0.3123, + "step": 1344 + }, + { + "epoch": 0.14358482011894286, + "grad_norm": 7.651959842433343, + "learning_rate": 1.988479236375068e-06, + "loss": 0.6345, + "step": 1346 + }, + { + "epoch": 0.1437981705203083, + "grad_norm": 12.849121747612804, + "learning_rate": 1.9883662309270255e-06, + "loss": -0.3795, + "step": 1348 + }, + { + "epoch": 0.14401152092167374, + "grad_norm": 10.42307066045314, + "learning_rate": 1.9882526771963705e-06, + "loss": 0.0273, + "step": 1350 + }, + { + "epoch": 0.14422487132303918, + "grad_norm": 10.998417457164084, + "learning_rate": 1.988138575246096e-06, + "loss": 0.068, + "step": 1352 + }, + { + "epoch": 0.14443822172440463, + "grad_norm": 8.217438827795817, + "learning_rate": 1.9880239251394984e-06, + "loss": 0.1393, + "step": 1354 + }, + { + "epoch": 0.14465157212577007, + "grad_norm": 7.09193880673836, + "learning_rate": 1.987908726940178e-06, + "loss": -0.4732, + "step": 1356 + }, + { + "epoch": 0.1448649225271355, + "grad_norm": 6.9152413451529045, + "learning_rate": 1.9877929807120394e-06, + "loss": -0.5944, + "step": 1358 + }, + { + "epoch": 0.14507827292850095, + "grad_norm": 21.317287958934948, + "learning_rate": 1.9876766865192917e-06, + "loss": -0.7226, + "step": 1360 + }, + { + "epoch": 0.1452916233298664, + "grad_norm": 7.0748819724154055, + "learning_rate": 1.987559844426447e-06, + "loss": 0.0925, + "step": 1362 + }, + { + "epoch": 0.14550497373123183, + "grad_norm": 8.277532517958686, + "learning_rate": 1.987442454498322e-06, + "loss": -0.1877, + "step": 1364 + }, + { + "epoch": 0.14571832413259728, + "grad_norm": 7.796349199058688, + "learning_rate": 1.9873245168000374e-06, + "loss": -1.2002, + "step": 1366 + }, + { + "epoch": 0.14593167453396272, + "grad_norm": 9.519972662407087, + "learning_rate": 1.9872060313970172e-06, + "loss": 0.1226, + "step": 1368 + }, + { + "epoch": 0.14614502493532816, + "grad_norm": 51.22794539984388, + "learning_rate": 1.98708699835499e-06, + "loss": 0.0132, + "step": 1370 + }, + { + "epoch": 0.1463583753366936, + "grad_norm": 9.89616190200503, + "learning_rate": 1.9869674177399875e-06, + "loss": -0.9277, + "step": 1372 + }, + { + "epoch": 0.14657172573805904, + "grad_norm": 11.215904956332496, + "learning_rate": 1.9868472896183447e-06, + "loss": -0.4874, + "step": 1374 + }, + { + "epoch": 0.1467850761394245, + "grad_norm": 13.872064184762836, + "learning_rate": 1.9867266140567022e-06, + "loss": -0.3183, + "step": 1376 + }, + { + "epoch": 0.14699842654078993, + "grad_norm": 7.995930683354116, + "learning_rate": 1.9866053911220023e-06, + "loss": -0.5589, + "step": 1378 + }, + { + "epoch": 0.14721177694215537, + "grad_norm": 16.378366719543713, + "learning_rate": 1.986483620881492e-06, + "loss": -0.2039, + "step": 1380 + }, + { + "epoch": 0.1474251273435208, + "grad_norm": 7.493508160443857, + "learning_rate": 1.9863613034027223e-06, + "loss": -0.0562, + "step": 1382 + }, + { + "epoch": 0.14763847774488625, + "grad_norm": 12.664830255838945, + "learning_rate": 1.986238438753546e-06, + "loss": -0.7556, + "step": 1384 + }, + { + "epoch": 0.1478518281462517, + "grad_norm": 19.397178687021295, + "learning_rate": 1.9861150270021217e-06, + "loss": -0.6734, + "step": 1386 + }, + { + "epoch": 0.14806517854761714, + "grad_norm": 5.704825687189972, + "learning_rate": 1.9859910682169094e-06, + "loss": 0.0273, + "step": 1388 + }, + { + "epoch": 0.14827852894898258, + "grad_norm": 18.74154359230761, + "learning_rate": 1.9858665624666736e-06, + "loss": -0.4775, + "step": 1390 + }, + { + "epoch": 0.14849187935034802, + "grad_norm": 5.773692619924285, + "learning_rate": 1.985741509820483e-06, + "loss": -0.6141, + "step": 1392 + }, + { + "epoch": 0.14870522975171346, + "grad_norm": 19.282598779403987, + "learning_rate": 1.9856159103477083e-06, + "loss": -0.8339, + "step": 1394 + }, + { + "epoch": 0.1489185801530789, + "grad_norm": 5.2700845447883005, + "learning_rate": 1.9854897641180243e-06, + "loss": 0.0385, + "step": 1396 + }, + { + "epoch": 0.14913193055444435, + "grad_norm": 9.026756714744467, + "learning_rate": 1.9853630712014084e-06, + "loss": -0.4472, + "step": 1398 + }, + { + "epoch": 0.1493452809558098, + "grad_norm": 10.545420164055413, + "learning_rate": 1.9852358316681423e-06, + "loss": -0.622, + "step": 1400 + }, + { + "epoch": 0.14955863135717523, + "grad_norm": 8.733561962250285, + "learning_rate": 1.98510804558881e-06, + "loss": 0.4292, + "step": 1402 + }, + { + "epoch": 0.14977198175854067, + "grad_norm": 6.668865041230664, + "learning_rate": 1.9849797130342994e-06, + "loss": -0.4685, + "step": 1404 + }, + { + "epoch": 0.14998533215990612, + "grad_norm": 3.5886436542959625, + "learning_rate": 1.984850834075801e-06, + "loss": 0.105, + "step": 1406 + }, + { + "epoch": 0.15019868256127156, + "grad_norm": 5.202738707402359, + "learning_rate": 1.9847214087848086e-06, + "loss": -0.2023, + "step": 1408 + }, + { + "epoch": 0.150412032962637, + "grad_norm": 15.567853060306508, + "learning_rate": 1.9845914372331193e-06, + "loss": 0.1152, + "step": 1410 + }, + { + "epoch": 0.15062538336400244, + "grad_norm": 8.300291377654755, + "learning_rate": 1.984460919492833e-06, + "loss": 0.4582, + "step": 1412 + }, + { + "epoch": 0.15083873376536788, + "grad_norm": 12.462553911552765, + "learning_rate": 1.9843298556363528e-06, + "loss": -1.0064, + "step": 1414 + }, + { + "epoch": 0.15105208416673335, + "grad_norm": 7.664251809863465, + "learning_rate": 1.9841982457363836e-06, + "loss": -0.6192, + "step": 1416 + }, + { + "epoch": 0.1512654345680988, + "grad_norm": 15.092728868230994, + "learning_rate": 1.9840660898659357e-06, + "loss": -1.0103, + "step": 1418 + }, + { + "epoch": 0.15147878496946424, + "grad_norm": 11.503676789333593, + "learning_rate": 1.98393338809832e-06, + "loss": 0.4677, + "step": 1420 + }, + { + "epoch": 0.15169213537082968, + "grad_norm": 4.562471025499068, + "learning_rate": 1.9838001405071504e-06, + "loss": -0.5226, + "step": 1422 + }, + { + "epoch": 0.15190548577219512, + "grad_norm": 4.567933913154999, + "learning_rate": 1.983666347166345e-06, + "loss": -0.4826, + "step": 1424 + }, + { + "epoch": 0.15211883617356056, + "grad_norm": 8.818307342546014, + "learning_rate": 1.983532008150124e-06, + "loss": 0.1747, + "step": 1426 + }, + { + "epoch": 0.152332186574926, + "grad_norm": 13.988100683233654, + "learning_rate": 1.9833971235330092e-06, + "loss": 0.0731, + "step": 1428 + }, + { + "epoch": 0.15254553697629145, + "grad_norm": 8.85657833332133, + "learning_rate": 1.9832616933898266e-06, + "loss": 0.171, + "step": 1430 + }, + { + "epoch": 0.1527588873776569, + "grad_norm": 6.619556052601107, + "learning_rate": 1.983125717795704e-06, + "loss": -0.1667, + "step": 1432 + }, + { + "epoch": 0.15297223777902233, + "grad_norm": 3.9770093006568827, + "learning_rate": 1.9829891968260724e-06, + "loss": -0.3355, + "step": 1434 + }, + { + "epoch": 0.15318558818038777, + "grad_norm": 10.59606320986383, + "learning_rate": 1.9828521305566644e-06, + "loss": 0.2674, + "step": 1436 + }, + { + "epoch": 0.1533989385817532, + "grad_norm": 13.074191446717647, + "learning_rate": 1.982714519063516e-06, + "loss": -0.6665, + "step": 1438 + }, + { + "epoch": 0.15361228898311866, + "grad_norm": 10.66118503206778, + "learning_rate": 1.9825763624229654e-06, + "loss": -0.0641, + "step": 1440 + }, + { + "epoch": 0.1538256393844841, + "grad_norm": 39.14537892546988, + "learning_rate": 1.9824376607116526e-06, + "loss": -0.0822, + "step": 1442 + }, + { + "epoch": 0.15403898978584954, + "grad_norm": 6.254113572658406, + "learning_rate": 1.9822984140065205e-06, + "loss": -0.2203, + "step": 1444 + }, + { + "epoch": 0.15425234018721498, + "grad_norm": 11.512879109806274, + "learning_rate": 1.982158622384815e-06, + "loss": 0.0883, + "step": 1446 + }, + { + "epoch": 0.15446569058858042, + "grad_norm": 10.839138101904458, + "learning_rate": 1.9820182859240824e-06, + "loss": -0.2504, + "step": 1448 + }, + { + "epoch": 0.15467904098994587, + "grad_norm": 27.771953229709435, + "learning_rate": 1.981877404702174e-06, + "loss": -0.4597, + "step": 1450 + }, + { + "epoch": 0.1548923913913113, + "grad_norm": 15.71893609797128, + "learning_rate": 1.9817359787972404e-06, + "loss": -0.5152, + "step": 1452 + }, + { + "epoch": 0.15510574179267675, + "grad_norm": 19.049058206585627, + "learning_rate": 1.9815940082877363e-06, + "loss": 0.2758, + "step": 1454 + }, + { + "epoch": 0.1553190921940422, + "grad_norm": 10.809514750469875, + "learning_rate": 1.9814514932524176e-06, + "loss": 0.4074, + "step": 1456 + }, + { + "epoch": 0.15553244259540763, + "grad_norm": 12.124473119383847, + "learning_rate": 1.981308433770343e-06, + "loss": -0.7027, + "step": 1458 + }, + { + "epoch": 0.15574579299677307, + "grad_norm": 11.324041223470877, + "learning_rate": 1.9811648299208726e-06, + "loss": -1.1957, + "step": 1460 + }, + { + "epoch": 0.15595914339813852, + "grad_norm": 12.422283117147003, + "learning_rate": 1.9810206817836682e-06, + "loss": 0.0484, + "step": 1462 + }, + { + "epoch": 0.15617249379950396, + "grad_norm": 12.624745930225414, + "learning_rate": 1.9808759894386945e-06, + "loss": -0.8822, + "step": 1464 + }, + { + "epoch": 0.1563858442008694, + "grad_norm": 13.78310812890328, + "learning_rate": 1.9807307529662174e-06, + "loss": -0.3963, + "step": 1466 + }, + { + "epoch": 0.15659919460223484, + "grad_norm": 4.4208276670995135, + "learning_rate": 1.9805849724468046e-06, + "loss": -0.0069, + "step": 1468 + }, + { + "epoch": 0.15681254500360028, + "grad_norm": 7.565092495263654, + "learning_rate": 1.9804386479613267e-06, + "loss": 0.3157, + "step": 1470 + }, + { + "epoch": 0.15702589540496573, + "grad_norm": 12.825713095133725, + "learning_rate": 1.980291779590954e-06, + "loss": 0.5112, + "step": 1472 + }, + { + "epoch": 0.15723924580633117, + "grad_norm": 7.5102366021430464, + "learning_rate": 1.980144367417161e-06, + "loss": -0.2039, + "step": 1474 + }, + { + "epoch": 0.1574525962076966, + "grad_norm": 6.8029536488511555, + "learning_rate": 1.979996411521722e-06, + "loss": -0.7558, + "step": 1476 + }, + { + "epoch": 0.15766594660906205, + "grad_norm": 15.641742410552869, + "learning_rate": 1.9798479119867133e-06, + "loss": -0.7574, + "step": 1478 + }, + { + "epoch": 0.1578792970104275, + "grad_norm": 12.63337979627095, + "learning_rate": 1.9796988688945125e-06, + "loss": -0.7699, + "step": 1480 + }, + { + "epoch": 0.15809264741179294, + "grad_norm": 11.011081565801963, + "learning_rate": 1.9795492823278006e-06, + "loss": -0.8634, + "step": 1482 + }, + { + "epoch": 0.15830599781315838, + "grad_norm": 8.113917603381717, + "learning_rate": 1.9793991523695575e-06, + "loss": 0.1113, + "step": 1484 + }, + { + "epoch": 0.15851934821452382, + "grad_norm": 5.879735279694428, + "learning_rate": 1.9792484791030664e-06, + "loss": -0.312, + "step": 1486 + }, + { + "epoch": 0.15873269861588926, + "grad_norm": 7.522005806855097, + "learning_rate": 1.979097262611911e-06, + "loss": 0.0032, + "step": 1488 + }, + { + "epoch": 0.1589460490172547, + "grad_norm": 13.65889902407673, + "learning_rate": 1.9789455029799764e-06, + "loss": -0.2951, + "step": 1490 + }, + { + "epoch": 0.15915939941862015, + "grad_norm": 8.700882293835516, + "learning_rate": 1.9787932002914495e-06, + "loss": -0.7851, + "step": 1492 + }, + { + "epoch": 0.1593727498199856, + "grad_norm": 7.9538304857949305, + "learning_rate": 1.978640354630818e-06, + "loss": 0.1009, + "step": 1494 + }, + { + "epoch": 0.15958610022135103, + "grad_norm": 10.861290314706679, + "learning_rate": 1.978486966082871e-06, + "loss": 0.3189, + "step": 1496 + }, + { + "epoch": 0.15979945062271647, + "grad_norm": 7.444611191380993, + "learning_rate": 1.9783330347326983e-06, + "loss": 0.515, + "step": 1498 + }, + { + "epoch": 0.1600128010240819, + "grad_norm": 11.912608153339749, + "learning_rate": 1.9781785606656914e-06, + "loss": 0.053, + "step": 1500 + }, + { + "epoch": 0.16022615142544738, + "grad_norm": 10.988092252697443, + "learning_rate": 1.978023543967543e-06, + "loss": -0.3173, + "step": 1502 + }, + { + "epoch": 0.16043950182681282, + "grad_norm": 19.7730500472891, + "learning_rate": 1.9778679847242463e-06, + "loss": -0.4978, + "step": 1504 + }, + { + "epoch": 0.16065285222817827, + "grad_norm": 7.565552992288537, + "learning_rate": 1.9777118830220954e-06, + "loss": -0.2651, + "step": 1506 + }, + { + "epoch": 0.1608662026295437, + "grad_norm": 7.04611081329321, + "learning_rate": 1.9775552389476863e-06, + "loss": -0.2007, + "step": 1508 + }, + { + "epoch": 0.16107955303090915, + "grad_norm": 21.76340794506112, + "learning_rate": 1.977398052587914e-06, + "loss": -0.3304, + "step": 1510 + }, + { + "epoch": 0.1612929034322746, + "grad_norm": 11.941091552743108, + "learning_rate": 1.9772403240299765e-06, + "loss": -0.5119, + "step": 1512 + }, + { + "epoch": 0.16150625383364003, + "grad_norm": 13.448296358290223, + "learning_rate": 1.977082053361371e-06, + "loss": -0.6099, + "step": 1514 + }, + { + "epoch": 0.16171960423500548, + "grad_norm": 8.098957923702608, + "learning_rate": 1.9769232406698964e-06, + "loss": -0.2868, + "step": 1516 + }, + { + "epoch": 0.16193295463637092, + "grad_norm": 15.658109835847851, + "learning_rate": 1.9767638860436518e-06, + "loss": -1.877, + "step": 1518 + }, + { + "epoch": 0.16214630503773636, + "grad_norm": 10.196319213005983, + "learning_rate": 1.9766039895710364e-06, + "loss": -0.0268, + "step": 1520 + }, + { + "epoch": 0.1623596554391018, + "grad_norm": 14.344746364689398, + "learning_rate": 1.9764435513407516e-06, + "loss": -0.676, + "step": 1522 + }, + { + "epoch": 0.16257300584046724, + "grad_norm": 14.616404912722821, + "learning_rate": 1.976282571441797e-06, + "loss": 0.1502, + "step": 1524 + }, + { + "epoch": 0.16278635624183269, + "grad_norm": 14.30221231141689, + "learning_rate": 1.976121049963475e-06, + "loss": -0.0454, + "step": 1526 + }, + { + "epoch": 0.16299970664319813, + "grad_norm": 13.915829726975026, + "learning_rate": 1.975958986995387e-06, + "loss": -0.5742, + "step": 1528 + }, + { + "epoch": 0.16321305704456357, + "grad_norm": 13.686767546017077, + "learning_rate": 1.9757963826274354e-06, + "loss": -0.1671, + "step": 1530 + }, + { + "epoch": 0.163426407445929, + "grad_norm": 3.9764711677563103, + "learning_rate": 1.975633236949823e-06, + "loss": -0.3207, + "step": 1532 + }, + { + "epoch": 0.16363975784729445, + "grad_norm": 3.1340618346223548, + "learning_rate": 1.9754695500530516e-06, + "loss": 0.0105, + "step": 1534 + }, + { + "epoch": 0.1638531082486599, + "grad_norm": 6.283022078366911, + "learning_rate": 1.975305322027926e-06, + "loss": -0.6137, + "step": 1536 + }, + { + "epoch": 0.16406645865002534, + "grad_norm": 20.324734801308242, + "learning_rate": 1.9751405529655473e-06, + "loss": -0.1634, + "step": 1538 + }, + { + "epoch": 0.16427980905139078, + "grad_norm": 9.296359495009186, + "learning_rate": 1.9749752429573204e-06, + "loss": 0.4357, + "step": 1540 + }, + { + "epoch": 0.16449315945275622, + "grad_norm": 18.66220522940308, + "learning_rate": 1.9748093920949485e-06, + "loss": -0.8267, + "step": 1542 + }, + { + "epoch": 0.16470650985412166, + "grad_norm": 36.09313965747721, + "learning_rate": 1.974643000470435e-06, + "loss": -0.535, + "step": 1544 + }, + { + "epoch": 0.1649198602554871, + "grad_norm": 14.698967454534976, + "learning_rate": 1.9744760681760832e-06, + "loss": 0.5589, + "step": 1546 + }, + { + "epoch": 0.16513321065685255, + "grad_norm": 12.989032273668574, + "learning_rate": 1.9743085953044963e-06, + "loss": -0.6752, + "step": 1548 + }, + { + "epoch": 0.165346561058218, + "grad_norm": 12.687629449396232, + "learning_rate": 1.9741405819485782e-06, + "loss": -0.2574, + "step": 1550 + }, + { + "epoch": 0.16555991145958343, + "grad_norm": 12.16900146230176, + "learning_rate": 1.973972028201532e-06, + "loss": 0.5595, + "step": 1552 + }, + { + "epoch": 0.16577326186094887, + "grad_norm": 7.84906480756588, + "learning_rate": 1.97380293415686e-06, + "loss": 0.3431, + "step": 1554 + }, + { + "epoch": 0.16598661226231431, + "grad_norm": 13.810151633893732, + "learning_rate": 1.9736332999083647e-06, + "loss": -0.1136, + "step": 1556 + }, + { + "epoch": 0.16619996266367976, + "grad_norm": 23.64843089807792, + "learning_rate": 1.973463125550149e-06, + "loss": -0.2038, + "step": 1558 + }, + { + "epoch": 0.1664133130650452, + "grad_norm": 6.1814145775741745, + "learning_rate": 1.9732924111766148e-06, + "loss": -0.0059, + "step": 1560 + }, + { + "epoch": 0.16662666346641064, + "grad_norm": 9.735355549985982, + "learning_rate": 1.973121156882463e-06, + "loss": -0.1945, + "step": 1562 + }, + { + "epoch": 0.16684001386777608, + "grad_norm": 10.150551002834774, + "learning_rate": 1.972949362762695e-06, + "loss": -0.9193, + "step": 1564 + }, + { + "epoch": 0.16705336426914152, + "grad_norm": 12.368494093848792, + "learning_rate": 1.972777028912611e-06, + "loss": -0.9772, + "step": 1566 + }, + { + "epoch": 0.16726671467050697, + "grad_norm": 20.463056118932432, + "learning_rate": 1.972604155427811e-06, + "loss": 0.3249, + "step": 1568 + }, + { + "epoch": 0.1674800650718724, + "grad_norm": 4.719164605200082, + "learning_rate": 1.9724307424041943e-06, + "loss": 0.4832, + "step": 1570 + }, + { + "epoch": 0.16769341547323785, + "grad_norm": 13.753871214297394, + "learning_rate": 1.972256789937959e-06, + "loss": 0.2147, + "step": 1572 + }, + { + "epoch": 0.1679067658746033, + "grad_norm": 8.370963490532764, + "learning_rate": 1.9720822981256032e-06, + "loss": -0.3386, + "step": 1574 + }, + { + "epoch": 0.16812011627596873, + "grad_norm": 6.477965817350643, + "learning_rate": 1.971907267063924e-06, + "loss": -0.5137, + "step": 1576 + }, + { + "epoch": 0.16833346667733418, + "grad_norm": 8.997371018929906, + "learning_rate": 1.9717316968500165e-06, + "loss": -0.0199, + "step": 1578 + }, + { + "epoch": 0.16854681707869962, + "grad_norm": 6.994891692529196, + "learning_rate": 1.971555587581277e-06, + "loss": -0.9777, + "step": 1580 + }, + { + "epoch": 0.16876016748006506, + "grad_norm": 12.156101613671975, + "learning_rate": 1.971378939355399e-06, + "loss": -0.0555, + "step": 1582 + }, + { + "epoch": 0.1689735178814305, + "grad_norm": 19.837439965100955, + "learning_rate": 1.971201752270376e-06, + "loss": -0.2187, + "step": 1584 + }, + { + "epoch": 0.16918686828279594, + "grad_norm": 23.06678942310109, + "learning_rate": 1.9710240264245003e-06, + "loss": -1.3332, + "step": 1586 + }, + { + "epoch": 0.1694002186841614, + "grad_norm": 6.863429399480339, + "learning_rate": 1.9708457619163627e-06, + "loss": 0.2579, + "step": 1588 + }, + { + "epoch": 0.16961356908552686, + "grad_norm": 3.240255593370213, + "learning_rate": 1.970666958844853e-06, + "loss": -0.2973, + "step": 1590 + }, + { + "epoch": 0.1698269194868923, + "grad_norm": 10.38048415473619, + "learning_rate": 1.9704876173091593e-06, + "loss": -1.2771, + "step": 1592 + }, + { + "epoch": 0.17004026988825774, + "grad_norm": 9.3643177662047, + "learning_rate": 1.9703077374087692e-06, + "loss": -0.8776, + "step": 1594 + }, + { + "epoch": 0.17025362028962318, + "grad_norm": 11.223809862732512, + "learning_rate": 1.9701273192434687e-06, + "loss": 0.1148, + "step": 1596 + }, + { + "epoch": 0.17046697069098862, + "grad_norm": 3.1834765213057814, + "learning_rate": 1.9699463629133423e-06, + "loss": 0.4166, + "step": 1598 + }, + { + "epoch": 0.17068032109235406, + "grad_norm": 18.265604789498173, + "learning_rate": 1.969764868518773e-06, + "loss": -1.0235, + "step": 1600 + }, + { + "epoch": 0.1708936714937195, + "grad_norm": 8.13932852787834, + "learning_rate": 1.9695828361604426e-06, + "loss": -0.2186, + "step": 1602 + }, + { + "epoch": 0.17110702189508495, + "grad_norm": 5.110377199191138, + "learning_rate": 1.9694002659393305e-06, + "loss": -1.507, + "step": 1604 + }, + { + "epoch": 0.1713203722964504, + "grad_norm": 9.437111240728154, + "learning_rate": 1.9692171579567153e-06, + "loss": -0.4275, + "step": 1606 + }, + { + "epoch": 0.17153372269781583, + "grad_norm": 6.317175247246368, + "learning_rate": 1.9690335123141736e-06, + "loss": -0.4688, + "step": 1608 + }, + { + "epoch": 0.17174707309918127, + "grad_norm": 8.211655480975368, + "learning_rate": 1.96884932911358e-06, + "loss": -1.2091, + "step": 1610 + }, + { + "epoch": 0.17196042350054672, + "grad_norm": 9.167271377226179, + "learning_rate": 1.9686646084571088e-06, + "loss": 0.0305, + "step": 1612 + }, + { + "epoch": 0.17217377390191216, + "grad_norm": 18.132861007920468, + "learning_rate": 1.9684793504472297e-06, + "loss": 0.9484, + "step": 1614 + }, + { + "epoch": 0.1723871243032776, + "grad_norm": 9.030723482308117, + "learning_rate": 1.9682935551867126e-06, + "loss": -0.6449, + "step": 1616 + }, + { + "epoch": 0.17260047470464304, + "grad_norm": 7.535160574600248, + "learning_rate": 1.9681072227786257e-06, + "loss": -0.6224, + "step": 1618 + }, + { + "epoch": 0.17281382510600848, + "grad_norm": 6.8318680978024595, + "learning_rate": 1.9679203533263333e-06, + "loss": -0.466, + "step": 1620 + }, + { + "epoch": 0.17302717550737393, + "grad_norm": 11.853415128133399, + "learning_rate": 1.967732946933499e-06, + "loss": -0.3621, + "step": 1622 + }, + { + "epoch": 0.17324052590873937, + "grad_norm": 8.62809433845374, + "learning_rate": 1.9675450037040835e-06, + "loss": -0.7561, + "step": 1624 + }, + { + "epoch": 0.1734538763101048, + "grad_norm": 10.182519096586946, + "learning_rate": 1.967356523742347e-06, + "loss": -0.0543, + "step": 1626 + }, + { + "epoch": 0.17366722671147025, + "grad_norm": 4.9289897884289635, + "learning_rate": 1.967167507152845e-06, + "loss": 0.1213, + "step": 1628 + }, + { + "epoch": 0.1738805771128357, + "grad_norm": 13.67496482188253, + "learning_rate": 1.9669779540404317e-06, + "loss": 0.0458, + "step": 1630 + }, + { + "epoch": 0.17409392751420114, + "grad_norm": 3.9691478655360264, + "learning_rate": 1.9667878645102602e-06, + "loss": -0.7859, + "step": 1632 + }, + { + "epoch": 0.17430727791556658, + "grad_norm": 8.028314392529628, + "learning_rate": 1.9665972386677795e-06, + "loss": -0.1328, + "step": 1634 + }, + { + "epoch": 0.17452062831693202, + "grad_norm": 23.734954082296028, + "learning_rate": 1.9664060766187363e-06, + "loss": 0.0432, + "step": 1636 + }, + { + "epoch": 0.17473397871829746, + "grad_norm": 2.062739370041361, + "learning_rate": 1.9662143784691755e-06, + "loss": -0.421, + "step": 1638 + }, + { + "epoch": 0.1749473291196629, + "grad_norm": 10.085952838950082, + "learning_rate": 1.966022144325439e-06, + "loss": -1.5764, + "step": 1640 + }, + { + "epoch": 0.17516067952102835, + "grad_norm": 12.13344771384856, + "learning_rate": 1.9658293742941664e-06, + "loss": -0.715, + "step": 1642 + }, + { + "epoch": 0.1753740299223938, + "grad_norm": 5.349791815496225, + "learning_rate": 1.9656360684822936e-06, + "loss": -0.0556, + "step": 1644 + }, + { + "epoch": 0.17558738032375923, + "grad_norm": 25.20028676441013, + "learning_rate": 1.9654422269970543e-06, + "loss": 0.1781, + "step": 1646 + }, + { + "epoch": 0.17580073072512467, + "grad_norm": 10.844013728581611, + "learning_rate": 1.96524784994598e-06, + "loss": -0.1389, + "step": 1648 + }, + { + "epoch": 0.1760140811264901, + "grad_norm": 6.66565646129455, + "learning_rate": 1.9650529374368986e-06, + "loss": -0.6699, + "step": 1650 + }, + { + "epoch": 0.17622743152785555, + "grad_norm": 8.012849338836023, + "learning_rate": 1.9648574895779347e-06, + "loss": -0.2213, + "step": 1652 + }, + { + "epoch": 0.176440781929221, + "grad_norm": 17.49951854490355, + "learning_rate": 1.9646615064775105e-06, + "loss": -0.121, + "step": 1654 + }, + { + "epoch": 0.17665413233058644, + "grad_norm": 23.843475618352024, + "learning_rate": 1.9644649882443453e-06, + "loss": -0.7095, + "step": 1656 + }, + { + "epoch": 0.17686748273195188, + "grad_norm": 10.069359913823204, + "learning_rate": 1.9642679349874544e-06, + "loss": -0.2746, + "step": 1658 + }, + { + "epoch": 0.17708083313331732, + "grad_norm": 8.436240471923892, + "learning_rate": 1.9640703468161507e-06, + "loss": -0.2465, + "step": 1660 + }, + { + "epoch": 0.17729418353468276, + "grad_norm": 19.20945740161944, + "learning_rate": 1.9638722238400433e-06, + "loss": -0.4992, + "step": 1662 + }, + { + "epoch": 0.1775075339360482, + "grad_norm": 15.246967161527042, + "learning_rate": 1.963673566169038e-06, + "loss": 0.0541, + "step": 1664 + }, + { + "epoch": 0.17772088433741365, + "grad_norm": 22.910616685380933, + "learning_rate": 1.9634743739133387e-06, + "loss": -0.0508, + "step": 1666 + }, + { + "epoch": 0.1779342347387791, + "grad_norm": 9.253592958142804, + "learning_rate": 1.963274647183443e-06, + "loss": -0.6378, + "step": 1668 + }, + { + "epoch": 0.17814758514014453, + "grad_norm": 7.223034507517141, + "learning_rate": 1.963074386090147e-06, + "loss": -0.7481, + "step": 1670 + }, + { + "epoch": 0.17836093554150997, + "grad_norm": 20.0730233250035, + "learning_rate": 1.9628735907445437e-06, + "loss": -1.1028, + "step": 1672 + }, + { + "epoch": 0.17857428594287544, + "grad_norm": 11.506960048731827, + "learning_rate": 1.96267226125802e-06, + "loss": -0.1797, + "step": 1674 + }, + { + "epoch": 0.17878763634424089, + "grad_norm": 8.61513532588195, + "learning_rate": 1.962470397742262e-06, + "loss": 0.3524, + "step": 1676 + }, + { + "epoch": 0.17900098674560633, + "grad_norm": 8.862740228622737, + "learning_rate": 1.9622680003092503e-06, + "loss": -0.146, + "step": 1678 + }, + { + "epoch": 0.17921433714697177, + "grad_norm": 12.139553131432608, + "learning_rate": 1.9620650690712618e-06, + "loss": -1.4315, + "step": 1680 + }, + { + "epoch": 0.1794276875483372, + "grad_norm": 10.44652923791041, + "learning_rate": 1.9618616041408703e-06, + "loss": 0.3068, + "step": 1682 + }, + { + "epoch": 0.17964103794970265, + "grad_norm": 14.831352041276144, + "learning_rate": 1.9616576056309447e-06, + "loss": 0.1342, + "step": 1684 + }, + { + "epoch": 0.1798543883510681, + "grad_norm": 6.713153774669659, + "learning_rate": 1.9614530736546507e-06, + "loss": -0.3189, + "step": 1686 + }, + { + "epoch": 0.18006773875243354, + "grad_norm": 12.095730762690835, + "learning_rate": 1.9612480083254496e-06, + "loss": -0.659, + "step": 1688 + }, + { + "epoch": 0.18028108915379898, + "grad_norm": 5.424181191521463, + "learning_rate": 1.9610424097570983e-06, + "loss": 0.4658, + "step": 1690 + }, + { + "epoch": 0.18049443955516442, + "grad_norm": 7.526381273568718, + "learning_rate": 1.9608362780636503e-06, + "loss": -0.1184, + "step": 1692 + }, + { + "epoch": 0.18070778995652986, + "grad_norm": 50.05765764524042, + "learning_rate": 1.9606296133594538e-06, + "loss": 0.3701, + "step": 1694 + }, + { + "epoch": 0.1809211403578953, + "grad_norm": 20.147742107794826, + "learning_rate": 1.9604224157591537e-06, + "loss": 0.2118, + "step": 1696 + }, + { + "epoch": 0.18113449075926075, + "grad_norm": 7.219813897301699, + "learning_rate": 1.960214685377689e-06, + "loss": -0.3109, + "step": 1698 + }, + { + "epoch": 0.1813478411606262, + "grad_norm": 8.540044296402842, + "learning_rate": 1.960006422330297e-06, + "loss": -0.6274, + "step": 1700 + }, + { + "epoch": 0.18156119156199163, + "grad_norm": 8.159546369231789, + "learning_rate": 1.9597976267325072e-06, + "loss": -0.4328, + "step": 1702 + }, + { + "epoch": 0.18177454196335707, + "grad_norm": 5.447742906774788, + "learning_rate": 1.959588298700147e-06, + "loss": -0.4139, + "step": 1704 + }, + { + "epoch": 0.18198789236472251, + "grad_norm": 4.784512936635263, + "learning_rate": 1.9593784383493377e-06, + "loss": -0.2258, + "step": 1706 + }, + { + "epoch": 0.18220124276608796, + "grad_norm": 12.694443490406826, + "learning_rate": 1.959168045796497e-06, + "loss": -0.402, + "step": 1708 + }, + { + "epoch": 0.1824145931674534, + "grad_norm": 20.749368077941035, + "learning_rate": 1.9589571211583367e-06, + "loss": -0.8768, + "step": 1710 + }, + { + "epoch": 0.18262794356881884, + "grad_norm": 10.332727657869777, + "learning_rate": 1.958745664551865e-06, + "loss": -0.4763, + "step": 1712 + }, + { + "epoch": 0.18284129397018428, + "grad_norm": 14.081473828895792, + "learning_rate": 1.9585336760943838e-06, + "loss": -0.7508, + "step": 1714 + }, + { + "epoch": 0.18305464437154972, + "grad_norm": 8.083033120627904, + "learning_rate": 1.9583211559034912e-06, + "loss": 0.2669, + "step": 1716 + }, + { + "epoch": 0.18326799477291517, + "grad_norm": 12.716253561595414, + "learning_rate": 1.9581081040970803e-06, + "loss": -0.3489, + "step": 1718 + }, + { + "epoch": 0.1834813451742806, + "grad_norm": 14.831224862239058, + "learning_rate": 1.9578945207933378e-06, + "loss": -0.257, + "step": 1720 + }, + { + "epoch": 0.18369469557564605, + "grad_norm": 10.0598638106194, + "learning_rate": 1.9576804061107468e-06, + "loss": -0.2818, + "step": 1722 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 11.916352138065047, + "learning_rate": 1.9574657601680837e-06, + "loss": -0.3157, + "step": 1724 + }, + { + "epoch": 0.18412139637837693, + "grad_norm": 13.293812004739776, + "learning_rate": 1.957250583084421e-06, + "loss": -1.3926, + "step": 1726 + }, + { + "epoch": 0.18433474677974238, + "grad_norm": 17.673810422897805, + "learning_rate": 1.9570348749791258e-06, + "loss": 0.3864, + "step": 1728 + }, + { + "epoch": 0.18454809718110782, + "grad_norm": 8.56725810020144, + "learning_rate": 1.956818635971858e-06, + "loss": 0.1284, + "step": 1730 + }, + { + "epoch": 0.18476144758247326, + "grad_norm": 7.539384803035816, + "learning_rate": 1.9566018661825735e-06, + "loss": -0.0348, + "step": 1732 + }, + { + "epoch": 0.1849747979838387, + "grad_norm": 9.139522421612835, + "learning_rate": 1.9563845657315233e-06, + "loss": -0.44, + "step": 1734 + }, + { + "epoch": 0.18518814838520414, + "grad_norm": 5.987861862159247, + "learning_rate": 1.956166734739251e-06, + "loss": -0.1993, + "step": 1736 + }, + { + "epoch": 0.18540149878656959, + "grad_norm": 8.191304855710381, + "learning_rate": 1.955948373326595e-06, + "loss": 0.0556, + "step": 1738 + }, + { + "epoch": 0.18561484918793503, + "grad_norm": 10.731204984938158, + "learning_rate": 1.9557294816146896e-06, + "loss": -0.3543, + "step": 1740 + }, + { + "epoch": 0.18582819958930047, + "grad_norm": 19.367490367006926, + "learning_rate": 1.9555100597249606e-06, + "loss": 0.349, + "step": 1742 + }, + { + "epoch": 0.1860415499906659, + "grad_norm": 26.972692980980362, + "learning_rate": 1.9552901077791305e-06, + "loss": -0.7377, + "step": 1744 + }, + { + "epoch": 0.18625490039203135, + "grad_norm": 15.656004899623815, + "learning_rate": 1.9550696258992135e-06, + "loss": -0.0474, + "step": 1746 + }, + { + "epoch": 0.1864682507933968, + "grad_norm": 7.872925492553763, + "learning_rate": 1.9548486142075195e-06, + "loss": -1.9635, + "step": 1748 + }, + { + "epoch": 0.18668160119476224, + "grad_norm": 11.0120416086597, + "learning_rate": 1.954627072826652e-06, + "loss": 0.3609, + "step": 1750 + }, + { + "epoch": 0.18689495159612768, + "grad_norm": 16.318707337248572, + "learning_rate": 1.9544050018795075e-06, + "loss": -0.2341, + "step": 1752 + }, + { + "epoch": 0.18710830199749312, + "grad_norm": 6.793061500721003, + "learning_rate": 1.9541824014892766e-06, + "loss": -0.8057, + "step": 1754 + }, + { + "epoch": 0.18732165239885856, + "grad_norm": 8.257666721715685, + "learning_rate": 1.953959271779445e-06, + "loss": -1.125, + "step": 1756 + }, + { + "epoch": 0.187535002800224, + "grad_norm": 2.2312803347930865, + "learning_rate": 1.953735612873789e-06, + "loss": -0.529, + "step": 1758 + }, + { + "epoch": 0.18774835320158947, + "grad_norm": 6.494227985980778, + "learning_rate": 1.9535114248963823e-06, + "loss": 0.4977, + "step": 1760 + }, + { + "epoch": 0.18796170360295492, + "grad_norm": 2.6225868521760884, + "learning_rate": 1.9532867079715885e-06, + "loss": 0.1818, + "step": 1762 + }, + { + "epoch": 0.18817505400432036, + "grad_norm": 13.169535473158854, + "learning_rate": 1.953061462224067e-06, + "loss": -1.5464, + "step": 1764 + }, + { + "epoch": 0.1883884044056858, + "grad_norm": 5.961894477938251, + "learning_rate": 1.95283568777877e-06, + "loss": -0.0843, + "step": 1766 + }, + { + "epoch": 0.18860175480705124, + "grad_norm": 17.790751191487725, + "learning_rate": 1.9526093847609425e-06, + "loss": -0.193, + "step": 1768 + }, + { + "epoch": 0.18881510520841668, + "grad_norm": 4.894851849522952, + "learning_rate": 1.9523825532961226e-06, + "loss": 0.3596, + "step": 1770 + }, + { + "epoch": 0.18902845560978213, + "grad_norm": 26.609231468601866, + "learning_rate": 1.9521551935101422e-06, + "loss": -1.0124, + "step": 1772 + }, + { + "epoch": 0.18924180601114757, + "grad_norm": 11.973906564469583, + "learning_rate": 1.9519273055291264e-06, + "loss": -0.7964, + "step": 1774 + }, + { + "epoch": 0.189455156412513, + "grad_norm": 5.530101076170713, + "learning_rate": 1.9516988894794926e-06, + "loss": -0.6191, + "step": 1776 + }, + { + "epoch": 0.18966850681387845, + "grad_norm": 8.679591909640877, + "learning_rate": 1.9514699454879515e-06, + "loss": -0.4146, + "step": 1778 + }, + { + "epoch": 0.1898818572152439, + "grad_norm": 11.728585457641236, + "learning_rate": 1.951240473681507e-06, + "loss": -0.4557, + "step": 1780 + }, + { + "epoch": 0.19009520761660934, + "grad_norm": 10.564389393242319, + "learning_rate": 1.9510104741874544e-06, + "loss": 0.2524, + "step": 1782 + }, + { + "epoch": 0.19030855801797478, + "grad_norm": 9.41590267221953, + "learning_rate": 1.9507799471333838e-06, + "loss": -0.6369, + "step": 1784 + }, + { + "epoch": 0.19052190841934022, + "grad_norm": 37.30175714716622, + "learning_rate": 1.950548892647177e-06, + "loss": -0.248, + "step": 1786 + }, + { + "epoch": 0.19073525882070566, + "grad_norm": 3.3075363360363474, + "learning_rate": 1.9503173108570073e-06, + "loss": 0.5654, + "step": 1788 + }, + { + "epoch": 0.1909486092220711, + "grad_norm": 24.48449478152585, + "learning_rate": 1.9500852018913423e-06, + "loss": -0.2383, + "step": 1790 + }, + { + "epoch": 0.19116195962343654, + "grad_norm": 15.853128801306614, + "learning_rate": 1.9498525658789415e-06, + "loss": -0.0832, + "step": 1792 + }, + { + "epoch": 0.191375310024802, + "grad_norm": 12.829677105255076, + "learning_rate": 1.9496194029488554e-06, + "loss": -0.9211, + "step": 1794 + }, + { + "epoch": 0.19158866042616743, + "grad_norm": 12.212923633405069, + "learning_rate": 1.949385713230429e-06, + "loss": -0.7124, + "step": 1796 + }, + { + "epoch": 0.19180201082753287, + "grad_norm": 9.920453632812915, + "learning_rate": 1.949151496853298e-06, + "loss": -0.0601, + "step": 1798 + }, + { + "epoch": 0.1920153612288983, + "grad_norm": 28.477961536263486, + "learning_rate": 1.9489167539473913e-06, + "loss": -0.2516, + "step": 1800 + }, + { + "epoch": 0.19222871163026375, + "grad_norm": 9.29396040625149, + "learning_rate": 1.9486814846429283e-06, + "loss": -0.5495, + "step": 1802 + }, + { + "epoch": 0.1924420620316292, + "grad_norm": 10.643418633887284, + "learning_rate": 1.948445689070422e-06, + "loss": -0.0461, + "step": 1804 + }, + { + "epoch": 0.19265541243299464, + "grad_norm": 6.505476361510747, + "learning_rate": 1.9482093673606772e-06, + "loss": -0.5896, + "step": 1806 + }, + { + "epoch": 0.19286876283436008, + "grad_norm": 10.585505862392617, + "learning_rate": 1.9479725196447896e-06, + "loss": 0.2174, + "step": 1808 + }, + { + "epoch": 0.19308211323572552, + "grad_norm": 10.74347579890419, + "learning_rate": 1.9477351460541472e-06, + "loss": 0.1333, + "step": 1810 + }, + { + "epoch": 0.19329546363709096, + "grad_norm": 7.644940311892833, + "learning_rate": 1.9474972467204294e-06, + "loss": -0.2822, + "step": 1812 + }, + { + "epoch": 0.1935088140384564, + "grad_norm": 4.730509511046018, + "learning_rate": 1.9472588217756086e-06, + "loss": 0.3581, + "step": 1814 + }, + { + "epoch": 0.19372216443982185, + "grad_norm": 5.724269421319521, + "learning_rate": 1.947019871351947e-06, + "loss": 0.567, + "step": 1816 + }, + { + "epoch": 0.1939355148411873, + "grad_norm": 11.725234173450394, + "learning_rate": 1.946780395581999e-06, + "loss": -0.1575, + "step": 1818 + }, + { + "epoch": 0.19414886524255273, + "grad_norm": 13.16677586483919, + "learning_rate": 1.946540394598611e-06, + "loss": -0.3832, + "step": 1820 + }, + { + "epoch": 0.19436221564391817, + "grad_norm": 12.990609192207351, + "learning_rate": 1.94629986853492e-06, + "loss": -0.4566, + "step": 1822 + }, + { + "epoch": 0.19457556604528362, + "grad_norm": 9.364327074112744, + "learning_rate": 1.9460588175243548e-06, + "loss": -0.1461, + "step": 1824 + }, + { + "epoch": 0.19478891644664906, + "grad_norm": 7.775551872545612, + "learning_rate": 1.9458172417006346e-06, + "loss": -0.9538, + "step": 1826 + }, + { + "epoch": 0.1950022668480145, + "grad_norm": 9.883774622479446, + "learning_rate": 1.9455751411977707e-06, + "loss": -0.6082, + "step": 1828 + }, + { + "epoch": 0.19521561724937994, + "grad_norm": 3.7575112892591123, + "learning_rate": 1.9453325161500646e-06, + "loss": -0.5966, + "step": 1830 + }, + { + "epoch": 0.19542896765074538, + "grad_norm": 5.420267155635353, + "learning_rate": 1.9450893666921098e-06, + "loss": 0.5389, + "step": 1832 + }, + { + "epoch": 0.19564231805211083, + "grad_norm": 8.197562118545699, + "learning_rate": 1.9448456929587898e-06, + "loss": 0.44, + "step": 1834 + }, + { + "epoch": 0.19585566845347627, + "grad_norm": 8.873455599519605, + "learning_rate": 1.9446014950852793e-06, + "loss": -0.6835, + "step": 1836 + }, + { + "epoch": 0.1960690188548417, + "grad_norm": 14.915885818798811, + "learning_rate": 1.9443567732070434e-06, + "loss": -0.6561, + "step": 1838 + }, + { + "epoch": 0.19628236925620715, + "grad_norm": 4.877252604264543, + "learning_rate": 1.944111527459839e-06, + "loss": -1.046, + "step": 1840 + }, + { + "epoch": 0.1964957196575726, + "grad_norm": 12.056248833737559, + "learning_rate": 1.9438657579797125e-06, + "loss": 0.0056, + "step": 1842 + }, + { + "epoch": 0.19670907005893803, + "grad_norm": 13.13224191691086, + "learning_rate": 1.9436194649030004e-06, + "loss": -0.0449, + "step": 1844 + }, + { + "epoch": 0.1969224204603035, + "grad_norm": 17.727545929984956, + "learning_rate": 1.9433726483663314e-06, + "loss": -0.6246, + "step": 1846 + }, + { + "epoch": 0.19713577086166895, + "grad_norm": 17.78918925352426, + "learning_rate": 1.943125308506623e-06, + "loss": -0.824, + "step": 1848 + }, + { + "epoch": 0.1973491212630344, + "grad_norm": 14.969243154757692, + "learning_rate": 1.942877445461084e-06, + "loss": 0.241, + "step": 1850 + }, + { + "epoch": 0.19756247166439983, + "grad_norm": 6.31904123376743, + "learning_rate": 1.942629059367213e-06, + "loss": -0.2239, + "step": 1852 + }, + { + "epoch": 0.19777582206576527, + "grad_norm": 8.57957389022486, + "learning_rate": 1.942380150362798e-06, + "loss": -0.3675, + "step": 1854 + }, + { + "epoch": 0.19798917246713071, + "grad_norm": 9.365987948081294, + "learning_rate": 1.9421307185859188e-06, + "loss": -0.4777, + "step": 1856 + }, + { + "epoch": 0.19820252286849616, + "grad_norm": 8.654835539439935, + "learning_rate": 1.941880764174944e-06, + "loss": -0.7596, + "step": 1858 + }, + { + "epoch": 0.1984158732698616, + "grad_norm": 8.098551922582622, + "learning_rate": 1.941630287268532e-06, + "loss": -0.8832, + "step": 1860 + }, + { + "epoch": 0.19862922367122704, + "grad_norm": 18.988381161750784, + "learning_rate": 1.941379288005632e-06, + "loss": 0.4242, + "step": 1862 + }, + { + "epoch": 0.19884257407259248, + "grad_norm": 16.079436898990718, + "learning_rate": 1.941127766525482e-06, + "loss": -0.3194, + "step": 1864 + }, + { + "epoch": 0.19905592447395792, + "grad_norm": 4.718296280940598, + "learning_rate": 1.9408757229676104e-06, + "loss": -0.324, + "step": 1866 + }, + { + "epoch": 0.19926927487532337, + "grad_norm": 5.673677060566251, + "learning_rate": 1.9406231574718343e-06, + "loss": -0.672, + "step": 1868 + }, + { + "epoch": 0.1994826252766888, + "grad_norm": 10.416325499823452, + "learning_rate": 1.9403700701782616e-06, + "loss": -0.1505, + "step": 1870 + }, + { + "epoch": 0.19969597567805425, + "grad_norm": 12.46215797459305, + "learning_rate": 1.9401164612272888e-06, + "loss": -0.9961, + "step": 1872 + }, + { + "epoch": 0.1999093260794197, + "grad_norm": 6.846444168934443, + "learning_rate": 1.939862330759602e-06, + "loss": 0.1682, + "step": 1874 + }, + { + "epoch": 0.20012267648078513, + "grad_norm": 9.365315641036963, + "learning_rate": 1.939607678916176e-06, + "loss": -0.4999, + "step": 1876 + }, + { + "epoch": 0.20033602688215058, + "grad_norm": 9.893208766151144, + "learning_rate": 1.9393525058382767e-06, + "loss": -0.3294, + "step": 1878 + }, + { + "epoch": 0.20054937728351602, + "grad_norm": 5.540154917275656, + "learning_rate": 1.939096811667457e-06, + "loss": 0.812, + "step": 1880 + }, + { + "epoch": 0.20076272768488146, + "grad_norm": 9.249926531646922, + "learning_rate": 1.9388405965455594e-06, + "loss": -1.6695, + "step": 1882 + }, + { + "epoch": 0.2009760780862469, + "grad_norm": 5.89678233946676, + "learning_rate": 1.9385838606147167e-06, + "loss": -0.0591, + "step": 1884 + }, + { + "epoch": 0.20118942848761234, + "grad_norm": 13.14256888070574, + "learning_rate": 1.938326604017349e-06, + "loss": -1.4884, + "step": 1886 + }, + { + "epoch": 0.20140277888897778, + "grad_norm": 7.94507307842013, + "learning_rate": 1.938068826896166e-06, + "loss": -0.6971, + "step": 1888 + }, + { + "epoch": 0.20161612929034323, + "grad_norm": 8.240327403029974, + "learning_rate": 1.9378105293941654e-06, + "loss": -0.0613, + "step": 1890 + }, + { + "epoch": 0.20182947969170867, + "grad_norm": 9.818936849267306, + "learning_rate": 1.9375517116546355e-06, + "loss": 0.3111, + "step": 1892 + }, + { + "epoch": 0.2020428300930741, + "grad_norm": 17.41443650473765, + "learning_rate": 1.9372923738211513e-06, + "loss": 0.5569, + "step": 1894 + }, + { + "epoch": 0.20225618049443955, + "grad_norm": 14.009083870989118, + "learning_rate": 1.9370325160375765e-06, + "loss": -0.4879, + "step": 1896 + }, + { + "epoch": 0.202469530895805, + "grad_norm": 6.68968872808115, + "learning_rate": 1.936772138448064e-06, + "loss": 0.165, + "step": 1898 + }, + { + "epoch": 0.20268288129717044, + "grad_norm": 11.073425085887829, + "learning_rate": 1.9365112411970546e-06, + "loss": -0.4699, + "step": 1900 + }, + { + "epoch": 0.20289623169853588, + "grad_norm": 13.72806606533039, + "learning_rate": 1.9362498244292777e-06, + "loss": -0.4613, + "step": 1902 + }, + { + "epoch": 0.20310958209990132, + "grad_norm": 7.420626928846953, + "learning_rate": 1.9359878882897504e-06, + "loss": 0.1518, + "step": 1904 + }, + { + "epoch": 0.20332293250126676, + "grad_norm": 24.333305223572363, + "learning_rate": 1.9357254329237782e-06, + "loss": -2.3156, + "step": 1906 + }, + { + "epoch": 0.2035362829026322, + "grad_norm": 9.517310195720865, + "learning_rate": 1.935462458476955e-06, + "loss": -0.1623, + "step": 1908 + }, + { + "epoch": 0.20374963330399765, + "grad_norm": 6.549306906184669, + "learning_rate": 1.9351989650951617e-06, + "loss": -0.0076, + "step": 1910 + }, + { + "epoch": 0.2039629837053631, + "grad_norm": 12.084117634395213, + "learning_rate": 1.934934952924568e-06, + "loss": -0.8675, + "step": 1912 + }, + { + "epoch": 0.20417633410672853, + "grad_norm": 23.006346032724178, + "learning_rate": 1.9346704221116304e-06, + "loss": 0.6388, + "step": 1914 + }, + { + "epoch": 0.20438968450809397, + "grad_norm": 11.25152656365477, + "learning_rate": 1.934405372803095e-06, + "loss": -0.2131, + "step": 1916 + }, + { + "epoch": 0.2046030349094594, + "grad_norm": 15.449233578378294, + "learning_rate": 1.934139805145993e-06, + "loss": -0.0512, + "step": 1918 + }, + { + "epoch": 0.20481638531082486, + "grad_norm": 25.967275303877503, + "learning_rate": 1.9338737192876455e-06, + "loss": -0.6831, + "step": 1920 + }, + { + "epoch": 0.2050297357121903, + "grad_norm": 22.999602017359916, + "learning_rate": 1.933607115375659e-06, + "loss": -0.1199, + "step": 1922 + }, + { + "epoch": 0.20524308611355574, + "grad_norm": 4.605511082587932, + "learning_rate": 1.933339993557929e-06, + "loss": 0.4724, + "step": 1924 + }, + { + "epoch": 0.20545643651492118, + "grad_norm": 11.515265673965152, + "learning_rate": 1.933072353982637e-06, + "loss": -0.601, + "step": 1926 + }, + { + "epoch": 0.20566978691628662, + "grad_norm": 5.596883504256079, + "learning_rate": 1.9328041967982535e-06, + "loss": 0.0537, + "step": 1928 + }, + { + "epoch": 0.20588313731765207, + "grad_norm": 9.822112001208648, + "learning_rate": 1.932535522153534e-06, + "loss": -0.0105, + "step": 1930 + }, + { + "epoch": 0.20609648771901753, + "grad_norm": 7.789424086801903, + "learning_rate": 1.9322663301975227e-06, + "loss": -0.9059, + "step": 1932 + }, + { + "epoch": 0.20630983812038298, + "grad_norm": 5.653484007709651, + "learning_rate": 1.9319966210795497e-06, + "loss": -0.6785, + "step": 1934 + }, + { + "epoch": 0.20652318852174842, + "grad_norm": 5.746107769290736, + "learning_rate": 1.9317263949492324e-06, + "loss": -0.5469, + "step": 1936 + }, + { + "epoch": 0.20673653892311386, + "grad_norm": 21.10901273760898, + "learning_rate": 1.9314556519564753e-06, + "loss": -0.6998, + "step": 1938 + }, + { + "epoch": 0.2069498893244793, + "grad_norm": 7.46443143657138, + "learning_rate": 1.93118439225147e-06, + "loss": -0.4524, + "step": 1940 + }, + { + "epoch": 0.20716323972584474, + "grad_norm": 7.024088389819032, + "learning_rate": 1.930912615984693e-06, + "loss": -0.3906, + "step": 1942 + }, + { + "epoch": 0.2073765901272102, + "grad_norm": 5.505478739537021, + "learning_rate": 1.9306403233069085e-06, + "loss": -0.4535, + "step": 1944 + }, + { + "epoch": 0.20758994052857563, + "grad_norm": 8.886336660167451, + "learning_rate": 1.930367514369168e-06, + "loss": -0.3084, + "step": 1946 + }, + { + "epoch": 0.20780329092994107, + "grad_norm": 5.907548024119052, + "learning_rate": 1.930094189322808e-06, + "loss": -0.6485, + "step": 1948 + }, + { + "epoch": 0.2080166413313065, + "grad_norm": 7.122391838967096, + "learning_rate": 1.9298203483194515e-06, + "loss": -0.2278, + "step": 1950 + }, + { + "epoch": 0.20822999173267195, + "grad_norm": 11.029026530105963, + "learning_rate": 1.929545991511009e-06, + "loss": -0.2532, + "step": 1952 + }, + { + "epoch": 0.2084433421340374, + "grad_norm": 14.189880824138287, + "learning_rate": 1.929271119049675e-06, + "loss": -0.3456, + "step": 1954 + }, + { + "epoch": 0.20865669253540284, + "grad_norm": 4.676708758764112, + "learning_rate": 1.9289957310879317e-06, + "loss": -0.7007, + "step": 1956 + }, + { + "epoch": 0.20887004293676828, + "grad_norm": 7.005194914139183, + "learning_rate": 1.9287198277785472e-06, + "loss": -0.417, + "step": 1958 + }, + { + "epoch": 0.20908339333813372, + "grad_norm": 11.66782653144503, + "learning_rate": 1.928443409274575e-06, + "loss": 0.045, + "step": 1960 + }, + { + "epoch": 0.20929674373949916, + "grad_norm": 8.522371273190833, + "learning_rate": 1.9281664757293535e-06, + "loss": -0.3577, + "step": 1962 + }, + { + "epoch": 0.2095100941408646, + "grad_norm": 7.8639416248062615, + "learning_rate": 1.9278890272965093e-06, + "loss": -0.3496, + "step": 1964 + }, + { + "epoch": 0.20972344454223005, + "grad_norm": 19.19411125509426, + "learning_rate": 1.927611064129952e-06, + "loss": -0.2211, + "step": 1966 + }, + { + "epoch": 0.2099367949435955, + "grad_norm": 13.62906798006574, + "learning_rate": 1.927332586383878e-06, + "loss": -0.3254, + "step": 1968 + }, + { + "epoch": 0.21015014534496093, + "grad_norm": 8.84634286062035, + "learning_rate": 1.9270535942127693e-06, + "loss": -0.9648, + "step": 1970 + }, + { + "epoch": 0.21036349574632637, + "grad_norm": 12.82625400789908, + "learning_rate": 1.9267740877713934e-06, + "loss": -0.9653, + "step": 1972 + }, + { + "epoch": 0.21057684614769182, + "grad_norm": 29.386125708909894, + "learning_rate": 1.9264940672148015e-06, + "loss": -0.2959, + "step": 1974 + }, + { + "epoch": 0.21079019654905726, + "grad_norm": 6.726999887691662, + "learning_rate": 1.9262135326983323e-06, + "loss": 0.0497, + "step": 1976 + }, + { + "epoch": 0.2110035469504227, + "grad_norm": 19.572986593090715, + "learning_rate": 1.925932484377608e-06, + "loss": -0.5565, + "step": 1978 + }, + { + "epoch": 0.21121689735178814, + "grad_norm": 10.420268612115242, + "learning_rate": 1.925650922408536e-06, + "loss": -0.2276, + "step": 1980 + }, + { + "epoch": 0.21143024775315358, + "grad_norm": 14.918445885705127, + "learning_rate": 1.925368846947309e-06, + "loss": -0.7972, + "step": 1982 + }, + { + "epoch": 0.21164359815451902, + "grad_norm": 15.183450954373223, + "learning_rate": 1.9250862581504054e-06, + "loss": -0.9765, + "step": 1984 + }, + { + "epoch": 0.21185694855588447, + "grad_norm": 12.655482376031465, + "learning_rate": 1.924803156174586e-06, + "loss": 0.2892, + "step": 1986 + }, + { + "epoch": 0.2120702989572499, + "grad_norm": 12.319565480053717, + "learning_rate": 1.924519541176899e-06, + "loss": 0.0639, + "step": 1988 + }, + { + "epoch": 0.21228364935861535, + "grad_norm": 11.682148008802292, + "learning_rate": 1.9242354133146755e-06, + "loss": -0.2449, + "step": 1990 + }, + { + "epoch": 0.2124969997599808, + "grad_norm": 10.683369380156918, + "learning_rate": 1.923950772745531e-06, + "loss": -0.6252, + "step": 1992 + }, + { + "epoch": 0.21271035016134623, + "grad_norm": 8.342190814941272, + "learning_rate": 1.9236656196273675e-06, + "loss": -0.3836, + "step": 1994 + }, + { + "epoch": 0.21292370056271168, + "grad_norm": 9.132897947243281, + "learning_rate": 1.9233799541183673e-06, + "loss": 0.242, + "step": 1996 + }, + { + "epoch": 0.21313705096407712, + "grad_norm": 8.302317272541245, + "learning_rate": 1.923093776377002e-06, + "loss": 0.2858, + "step": 1998 + }, + { + "epoch": 0.21335040136544256, + "grad_norm": 15.276476769675117, + "learning_rate": 1.922807086562023e-06, + "loss": -0.3019, + "step": 2000 + }, + { + "epoch": 0.213563751766808, + "grad_norm": 11.619082047088398, + "learning_rate": 1.9225198848324686e-06, + "loss": 0.7847, + "step": 2002 + }, + { + "epoch": 0.21377710216817344, + "grad_norm": 3.525780465406695, + "learning_rate": 1.922232171347659e-06, + "loss": -0.0267, + "step": 2004 + }, + { + "epoch": 0.21399045256953889, + "grad_norm": 13.017357127057464, + "learning_rate": 1.9219439462672005e-06, + "loss": -0.0434, + "step": 2006 + }, + { + "epoch": 0.21420380297090433, + "grad_norm": 8.743693252679565, + "learning_rate": 1.9216552097509813e-06, + "loss": -0.5469, + "step": 2008 + }, + { + "epoch": 0.21441715337226977, + "grad_norm": 8.531453317652758, + "learning_rate": 1.921365961959174e-06, + "loss": -0.5015, + "step": 2010 + }, + { + "epoch": 0.2146305037736352, + "grad_norm": 25.86829560721428, + "learning_rate": 1.921076203052235e-06, + "loss": -1.4061, + "step": 2012 + }, + { + "epoch": 0.21484385417500065, + "grad_norm": 6.313096967252886, + "learning_rate": 1.920785933190904e-06, + "loss": 0.4307, + "step": 2014 + }, + { + "epoch": 0.2150572045763661, + "grad_norm": 12.031438307717737, + "learning_rate": 1.9204951525362043e-06, + "loss": -0.1798, + "step": 2016 + }, + { + "epoch": 0.21527055497773157, + "grad_norm": 8.961712843572498, + "learning_rate": 1.9202038612494425e-06, + "loss": 0.5622, + "step": 2018 + }, + { + "epoch": 0.215483905379097, + "grad_norm": 10.317606430884625, + "learning_rate": 1.9199120594922086e-06, + "loss": 0.2731, + "step": 2020 + }, + { + "epoch": 0.21569725578046245, + "grad_norm": 6.844568645485288, + "learning_rate": 1.919619747426375e-06, + "loss": -0.2694, + "step": 2022 + }, + { + "epoch": 0.2159106061818279, + "grad_norm": 9.827733315135845, + "learning_rate": 1.9193269252140987e-06, + "loss": 0.1818, + "step": 2024 + }, + { + "epoch": 0.21612395658319333, + "grad_norm": 11.343018497482833, + "learning_rate": 1.919033593017818e-06, + "loss": -1.0073, + "step": 2026 + }, + { + "epoch": 0.21633730698455877, + "grad_norm": 28.273270087787377, + "learning_rate": 1.9187397510002556e-06, + "loss": -0.4873, + "step": 2028 + }, + { + "epoch": 0.21655065738592422, + "grad_norm": 8.284068681666643, + "learning_rate": 1.918445399324416e-06, + "loss": 0.2736, + "step": 2030 + }, + { + "epoch": 0.21676400778728966, + "grad_norm": 14.638969215885052, + "learning_rate": 1.918150538153586e-06, + "loss": 0.2588, + "step": 2032 + }, + { + "epoch": 0.2169773581886551, + "grad_norm": 22.94032037481521, + "learning_rate": 1.9178551676513374e-06, + "loss": -0.4289, + "step": 2034 + }, + { + "epoch": 0.21719070859002054, + "grad_norm": 6.7471627466004005, + "learning_rate": 1.9175592879815217e-06, + "loss": -0.5378, + "step": 2036 + }, + { + "epoch": 0.21740405899138598, + "grad_norm": 8.94487355277408, + "learning_rate": 1.9172628993082743e-06, + "loss": -0.3464, + "step": 2038 + }, + { + "epoch": 0.21761740939275143, + "grad_norm": 6.462015369560506, + "learning_rate": 1.9169660017960134e-06, + "loss": -0.3104, + "step": 2040 + }, + { + "epoch": 0.21783075979411687, + "grad_norm": 10.741585094567146, + "learning_rate": 1.916668595609438e-06, + "loss": -0.3802, + "step": 2042 + }, + { + "epoch": 0.2180441101954823, + "grad_norm": 4.590541070024735, + "learning_rate": 1.9163706809135305e-06, + "loss": -0.4504, + "step": 2044 + }, + { + "epoch": 0.21825746059684775, + "grad_norm": 7.9137038139570155, + "learning_rate": 1.916072257873555e-06, + "loss": -0.522, + "step": 2046 + }, + { + "epoch": 0.2184708109982132, + "grad_norm": 13.423712642720652, + "learning_rate": 1.915773326655057e-06, + "loss": -0.591, + "step": 2048 + }, + { + "epoch": 0.21868416139957864, + "grad_norm": 15.129478098463679, + "learning_rate": 1.915473887423866e-06, + "loss": -0.1386, + "step": 2050 + }, + { + "epoch": 0.21889751180094408, + "grad_norm": 16.08402082469581, + "learning_rate": 1.91517394034609e-06, + "loss": -1.3189, + "step": 2052 + }, + { + "epoch": 0.21911086220230952, + "grad_norm": 14.673361229198562, + "learning_rate": 1.9148734855881216e-06, + "loss": -0.4009, + "step": 2054 + }, + { + "epoch": 0.21932421260367496, + "grad_norm": 11.660044385661074, + "learning_rate": 1.9145725233166343e-06, + "loss": -1.1933, + "step": 2056 + }, + { + "epoch": 0.2195375630050404, + "grad_norm": 17.923848622734944, + "learning_rate": 1.9142710536985815e-06, + "loss": 0.2626, + "step": 2058 + }, + { + "epoch": 0.21975091340640585, + "grad_norm": 3.4973266521741135, + "learning_rate": 1.9139690769012e-06, + "loss": -0.172, + "step": 2060 + }, + { + "epoch": 0.2199642638077713, + "grad_norm": 7.976576276743693, + "learning_rate": 1.9136665930920075e-06, + "loss": 0.6982, + "step": 2062 + }, + { + "epoch": 0.22017761420913673, + "grad_norm": 7.634128727862831, + "learning_rate": 1.9133636024388025e-06, + "loss": 0.4352, + "step": 2064 + }, + { + "epoch": 0.22039096461050217, + "grad_norm": 7.972896156127331, + "learning_rate": 1.913060105109665e-06, + "loss": -0.1603, + "step": 2066 + }, + { + "epoch": 0.2206043150118676, + "grad_norm": 12.60188340929588, + "learning_rate": 1.912756101272956e-06, + "loss": 0.184, + "step": 2068 + }, + { + "epoch": 0.22081766541323306, + "grad_norm": 5.530674604822452, + "learning_rate": 1.9124515910973175e-06, + "loss": 0.2249, + "step": 2070 + }, + { + "epoch": 0.2210310158145985, + "grad_norm": 8.673190380791217, + "learning_rate": 1.9121465747516723e-06, + "loss": 0.2098, + "step": 2072 + }, + { + "epoch": 0.22124436621596394, + "grad_norm": 11.938116236594583, + "learning_rate": 1.911841052405224e-06, + "loss": -0.0012, + "step": 2074 + }, + { + "epoch": 0.22145771661732938, + "grad_norm": 15.000894923275073, + "learning_rate": 1.9115350242274565e-06, + "loss": -0.5997, + "step": 2076 + }, + { + "epoch": 0.22167106701869482, + "grad_norm": 13.084239961748656, + "learning_rate": 1.9112284903881357e-06, + "loss": -0.2505, + "step": 2078 + }, + { + "epoch": 0.22188441742006026, + "grad_norm": 25.251876912066013, + "learning_rate": 1.910921451057306e-06, + "loss": 0.5584, + "step": 2080 + }, + { + "epoch": 0.2220977678214257, + "grad_norm": 4.428008465032094, + "learning_rate": 1.9106139064052945e-06, + "loss": -0.5053, + "step": 2082 + }, + { + "epoch": 0.22231111822279115, + "grad_norm": 14.230541196268089, + "learning_rate": 1.910305856602706e-06, + "loss": -0.8245, + "step": 2084 + }, + { + "epoch": 0.2225244686241566, + "grad_norm": 7.0912436584168885, + "learning_rate": 1.909997301820428e-06, + "loss": -0.0826, + "step": 2086 + }, + { + "epoch": 0.22273781902552203, + "grad_norm": 8.156236127060636, + "learning_rate": 1.909688242229626e-06, + "loss": -1.6469, + "step": 2088 + }, + { + "epoch": 0.22295116942688747, + "grad_norm": 7.462748690551923, + "learning_rate": 1.9093786780017473e-06, + "loss": -0.441, + "step": 2090 + }, + { + "epoch": 0.22316451982825292, + "grad_norm": 7.556875177907825, + "learning_rate": 1.9090686093085186e-06, + "loss": -0.1868, + "step": 2092 + }, + { + "epoch": 0.22337787022961836, + "grad_norm": 8.074190404531432, + "learning_rate": 1.908758036321946e-06, + "loss": -0.6482, + "step": 2094 + }, + { + "epoch": 0.2235912206309838, + "grad_norm": 7.352300765570886, + "learning_rate": 1.908446959214315e-06, + "loss": 0.0087, + "step": 2096 + }, + { + "epoch": 0.22380457103234924, + "grad_norm": 19.643712400767132, + "learning_rate": 1.908135378158192e-06, + "loss": -0.1984, + "step": 2098 + }, + { + "epoch": 0.22401792143371468, + "grad_norm": 30.880473528270596, + "learning_rate": 1.9078232933264226e-06, + "loss": -0.4678, + "step": 2100 + }, + { + "epoch": 0.22423127183508013, + "grad_norm": 16.206460542957338, + "learning_rate": 1.907510704892131e-06, + "loss": -0.9372, + "step": 2102 + }, + { + "epoch": 0.2244446222364456, + "grad_norm": 11.207394954968997, + "learning_rate": 1.907197613028721e-06, + "loss": -0.3638, + "step": 2104 + }, + { + "epoch": 0.22465797263781104, + "grad_norm": 19.59690555861532, + "learning_rate": 1.9068840179098773e-06, + "loss": -0.2888, + "step": 2106 + }, + { + "epoch": 0.22487132303917648, + "grad_norm": 5.0064925129941376, + "learning_rate": 1.9065699197095615e-06, + "loss": 0.7401, + "step": 2108 + }, + { + "epoch": 0.22508467344054192, + "grad_norm": 9.163762043830785, + "learning_rate": 1.906255318602015e-06, + "loss": -0.5969, + "step": 2110 + }, + { + "epoch": 0.22529802384190736, + "grad_norm": 28.58997459140813, + "learning_rate": 1.9059402147617596e-06, + "loss": -0.7027, + "step": 2112 + }, + { + "epoch": 0.2255113742432728, + "grad_norm": 8.007982963485334, + "learning_rate": 1.9056246083635941e-06, + "loss": -0.9328, + "step": 2114 + }, + { + "epoch": 0.22572472464463825, + "grad_norm": 11.576637630010811, + "learning_rate": 1.9053084995825967e-06, + "loss": -0.1907, + "step": 2116 + }, + { + "epoch": 0.2259380750460037, + "grad_norm": 6.310277942518293, + "learning_rate": 1.9049918885941246e-06, + "loss": -1.1929, + "step": 2118 + }, + { + "epoch": 0.22615142544736913, + "grad_norm": 4.709824078091413, + "learning_rate": 1.9046747755738136e-06, + "loss": 0.2748, + "step": 2120 + }, + { + "epoch": 0.22636477584873457, + "grad_norm": 10.164719659031523, + "learning_rate": 1.9043571606975775e-06, + "loss": -0.373, + "step": 2122 + }, + { + "epoch": 0.22657812625010001, + "grad_norm": 14.655873994066797, + "learning_rate": 1.904039044141609e-06, + "loss": -0.6651, + "step": 2124 + }, + { + "epoch": 0.22679147665146546, + "grad_norm": 6.4111231342896495, + "learning_rate": 1.9037204260823785e-06, + "loss": -0.3842, + "step": 2126 + }, + { + "epoch": 0.2270048270528309, + "grad_norm": 12.295730157536532, + "learning_rate": 1.9034013066966356e-06, + "loss": -0.691, + "step": 2128 + }, + { + "epoch": 0.22721817745419634, + "grad_norm": 22.951864429058734, + "learning_rate": 1.903081686161407e-06, + "loss": -1.0715, + "step": 2130 + }, + { + "epoch": 0.22743152785556178, + "grad_norm": 5.816477108146452, + "learning_rate": 1.902761564653998e-06, + "loss": 0.2179, + "step": 2132 + }, + { + "epoch": 0.22764487825692722, + "grad_norm": 7.195074237627301, + "learning_rate": 1.9024409423519918e-06, + "loss": -0.0609, + "step": 2134 + }, + { + "epoch": 0.22785822865829267, + "grad_norm": 4.115971600041351, + "learning_rate": 1.9021198194332486e-06, + "loss": 0.1863, + "step": 2136 + }, + { + "epoch": 0.2280715790596581, + "grad_norm": 7.8396642857766485, + "learning_rate": 1.9017981960759072e-06, + "loss": -0.035, + "step": 2138 + }, + { + "epoch": 0.22828492946102355, + "grad_norm": 8.093340509337287, + "learning_rate": 1.9014760724583843e-06, + "loss": -0.032, + "step": 2140 + }, + { + "epoch": 0.228498279862389, + "grad_norm": 5.739007014210951, + "learning_rate": 1.901153448759373e-06, + "loss": -1.3343, + "step": 2142 + }, + { + "epoch": 0.22871163026375443, + "grad_norm": 6.451272014147544, + "learning_rate": 1.9008303251578442e-06, + "loss": 0.271, + "step": 2144 + }, + { + "epoch": 0.22892498066511988, + "grad_norm": 14.93163679272063, + "learning_rate": 1.9005067018330466e-06, + "loss": -0.6068, + "step": 2146 + }, + { + "epoch": 0.22913833106648532, + "grad_norm": 6.146740059828187, + "learning_rate": 1.900182578964506e-06, + "loss": -0.5449, + "step": 2148 + }, + { + "epoch": 0.22935168146785076, + "grad_norm": 4.058446027597233, + "learning_rate": 1.899857956732025e-06, + "loss": 0.4319, + "step": 2150 + }, + { + "epoch": 0.2295650318692162, + "grad_norm": 8.03450930760207, + "learning_rate": 1.899532835315683e-06, + "loss": -2.0862, + "step": 2152 + }, + { + "epoch": 0.22977838227058164, + "grad_norm": 10.29416339638645, + "learning_rate": 1.8992072148958367e-06, + "loss": -0.5918, + "step": 2154 + }, + { + "epoch": 0.22999173267194709, + "grad_norm": 6.839940264702009, + "learning_rate": 1.8988810956531199e-06, + "loss": 0.802, + "step": 2156 + }, + { + "epoch": 0.23020508307331253, + "grad_norm": 3.3657321385518157, + "learning_rate": 1.8985544777684425e-06, + "loss": -0.4858, + "step": 2158 + }, + { + "epoch": 0.23041843347467797, + "grad_norm": 4.831622255484031, + "learning_rate": 1.8982273614229915e-06, + "loss": -1.2581, + "step": 2160 + }, + { + "epoch": 0.2306317838760434, + "grad_norm": 10.448841520364184, + "learning_rate": 1.89789974679823e-06, + "loss": 0.6303, + "step": 2162 + }, + { + "epoch": 0.23084513427740885, + "grad_norm": 10.907611514407419, + "learning_rate": 1.897571634075898e-06, + "loss": 0.0288, + "step": 2164 + }, + { + "epoch": 0.2310584846787743, + "grad_norm": 14.545172431998253, + "learning_rate": 1.8972430234380112e-06, + "loss": -0.5095, + "step": 2166 + }, + { + "epoch": 0.23127183508013974, + "grad_norm": 6.533910079035624, + "learning_rate": 1.8969139150668622e-06, + "loss": -1.2148, + "step": 2168 + }, + { + "epoch": 0.23148518548150518, + "grad_norm": 15.995777148237812, + "learning_rate": 1.8965843091450192e-06, + "loss": 0.5337, + "step": 2170 + }, + { + "epoch": 0.23169853588287062, + "grad_norm": 13.54176406092635, + "learning_rate": 1.896254205855326e-06, + "loss": 0.515, + "step": 2172 + }, + { + "epoch": 0.23191188628423606, + "grad_norm": 7.719544270592234, + "learning_rate": 1.8959236053809038e-06, + "loss": 0.0471, + "step": 2174 + }, + { + "epoch": 0.2321252366856015, + "grad_norm": 10.598168740855463, + "learning_rate": 1.8955925079051482e-06, + "loss": -1.0524, + "step": 2176 + }, + { + "epoch": 0.23233858708696695, + "grad_norm": 36.37614463964094, + "learning_rate": 1.895260913611731e-06, + "loss": -1.3743, + "step": 2178 + }, + { + "epoch": 0.2325519374883324, + "grad_norm": 6.571035963967562, + "learning_rate": 1.8949288226845996e-06, + "loss": -0.1517, + "step": 2180 + }, + { + "epoch": 0.23276528788969783, + "grad_norm": 8.071430658220358, + "learning_rate": 1.8945962353079772e-06, + "loss": -0.7766, + "step": 2182 + }, + { + "epoch": 0.23297863829106327, + "grad_norm": 7.25892306887894, + "learning_rate": 1.8942631516663617e-06, + "loss": 0.4177, + "step": 2184 + }, + { + "epoch": 0.23319198869242871, + "grad_norm": 6.365349446271159, + "learning_rate": 1.8939295719445266e-06, + "loss": 0.2848, + "step": 2186 + }, + { + "epoch": 0.23340533909379416, + "grad_norm": 9.638039270016808, + "learning_rate": 1.893595496327521e-06, + "loss": -0.913, + "step": 2188 + }, + { + "epoch": 0.23361868949515963, + "grad_norm": 10.439771015875486, + "learning_rate": 1.8932609250006685e-06, + "loss": 0.5524, + "step": 2190 + }, + { + "epoch": 0.23383203989652507, + "grad_norm": 13.808518153837964, + "learning_rate": 1.8929258581495683e-06, + "loss": -0.1752, + "step": 2192 + }, + { + "epoch": 0.2340453902978905, + "grad_norm": 12.522051166061008, + "learning_rate": 1.892590295960094e-06, + "loss": -0.5719, + "step": 2194 + }, + { + "epoch": 0.23425874069925595, + "grad_norm": 11.841442410136267, + "learning_rate": 1.8922542386183939e-06, + "loss": -0.6225, + "step": 2196 + }, + { + "epoch": 0.2344720911006214, + "grad_norm": 3.815044461918325, + "learning_rate": 1.8919176863108914e-06, + "loss": -0.887, + "step": 2198 + }, + { + "epoch": 0.23468544150198684, + "grad_norm": 4.49105706199299, + "learning_rate": 1.8915806392242844e-06, + "loss": 0.1367, + "step": 2200 + }, + { + "epoch": 0.23489879190335228, + "grad_norm": 16.345355639549688, + "learning_rate": 1.8912430975455446e-06, + "loss": -0.4096, + "step": 2202 + }, + { + "epoch": 0.23511214230471772, + "grad_norm": 8.407716846145457, + "learning_rate": 1.8909050614619195e-06, + "loss": -0.0712, + "step": 2204 + }, + { + "epoch": 0.23532549270608316, + "grad_norm": 14.366397845363286, + "learning_rate": 1.890566531160929e-06, + "loss": 0.5799, + "step": 2206 + }, + { + "epoch": 0.2355388431074486, + "grad_norm": 14.191723389550512, + "learning_rate": 1.890227506830369e-06, + "loss": -0.5409, + "step": 2208 + }, + { + "epoch": 0.23575219350881405, + "grad_norm": 7.135790668212273, + "learning_rate": 1.8898879886583078e-06, + "loss": 0.085, + "step": 2210 + }, + { + "epoch": 0.2359655439101795, + "grad_norm": 7.391101446479228, + "learning_rate": 1.8895479768330893e-06, + "loss": -0.2057, + "step": 2212 + }, + { + "epoch": 0.23617889431154493, + "grad_norm": 14.674680488502363, + "learning_rate": 1.8892074715433299e-06, + "loss": 0.3252, + "step": 2214 + }, + { + "epoch": 0.23639224471291037, + "grad_norm": 17.01256516370184, + "learning_rate": 1.8888664729779202e-06, + "loss": 0.2447, + "step": 2216 + }, + { + "epoch": 0.2366055951142758, + "grad_norm": 36.16909610954152, + "learning_rate": 1.8885249813260248e-06, + "loss": 0.5304, + "step": 2218 + }, + { + "epoch": 0.23681894551564125, + "grad_norm": 14.119709954785915, + "learning_rate": 1.8881829967770809e-06, + "loss": -1.1017, + "step": 2220 + }, + { + "epoch": 0.2370322959170067, + "grad_norm": 6.982776203277627, + "learning_rate": 1.8878405195208004e-06, + "loss": -0.3025, + "step": 2222 + }, + { + "epoch": 0.23724564631837214, + "grad_norm": 12.717983826344323, + "learning_rate": 1.8874975497471676e-06, + "loss": -0.3265, + "step": 2224 + }, + { + "epoch": 0.23745899671973758, + "grad_norm": 7.295940732692786, + "learning_rate": 1.8871540876464402e-06, + "loss": -1.1326, + "step": 2226 + }, + { + "epoch": 0.23767234712110302, + "grad_norm": 16.019023665374803, + "learning_rate": 1.8868101334091492e-06, + "loss": -0.0291, + "step": 2228 + }, + { + "epoch": 0.23788569752246846, + "grad_norm": 11.942061426924056, + "learning_rate": 1.8864656872260985e-06, + "loss": -0.91, + "step": 2230 + }, + { + "epoch": 0.2380990479238339, + "grad_norm": 7.779109824391155, + "learning_rate": 1.8861207492883648e-06, + "loss": -0.2579, + "step": 2232 + }, + { + "epoch": 0.23831239832519935, + "grad_norm": 32.924364769086296, + "learning_rate": 1.8857753197872978e-06, + "loss": -0.5007, + "step": 2234 + }, + { + "epoch": 0.2385257487265648, + "grad_norm": 7.122888553196807, + "learning_rate": 1.8854293989145198e-06, + "loss": 0.3607, + "step": 2236 + }, + { + "epoch": 0.23873909912793023, + "grad_norm": 6.541430984020485, + "learning_rate": 1.8850829868619256e-06, + "loss": -0.4923, + "step": 2238 + }, + { + "epoch": 0.23895244952929567, + "grad_norm": 12.255213691954728, + "learning_rate": 1.8847360838216824e-06, + "loss": -0.0115, + "step": 2240 + }, + { + "epoch": 0.23916579993066112, + "grad_norm": 24.17513986308764, + "learning_rate": 1.8843886899862302e-06, + "loss": 0.1068, + "step": 2242 + }, + { + "epoch": 0.23937915033202656, + "grad_norm": 6.427296279370608, + "learning_rate": 1.8840408055482806e-06, + "loss": -1.4625, + "step": 2244 + }, + { + "epoch": 0.239592500733392, + "grad_norm": 4.057332435817227, + "learning_rate": 1.883692430700818e-06, + "loss": -1.0755, + "step": 2246 + }, + { + "epoch": 0.23980585113475744, + "grad_norm": 9.280279614228682, + "learning_rate": 1.8833435656370984e-06, + "loss": 0.4493, + "step": 2248 + }, + { + "epoch": 0.24001920153612288, + "grad_norm": 15.08946963378993, + "learning_rate": 1.8829942105506502e-06, + "loss": -0.5134, + "step": 2250 + }, + { + "epoch": 0.24023255193748833, + "grad_norm": 8.285147921021803, + "learning_rate": 1.8826443656352729e-06, + "loss": -0.2185, + "step": 2252 + }, + { + "epoch": 0.24044590233885377, + "grad_norm": 5.186081541990516, + "learning_rate": 1.8822940310850383e-06, + "loss": -0.841, + "step": 2254 + }, + { + "epoch": 0.2406592527402192, + "grad_norm": 12.892860884719964, + "learning_rate": 1.8819432070942903e-06, + "loss": -0.1838, + "step": 2256 + }, + { + "epoch": 0.24087260314158465, + "grad_norm": 9.786430202215282, + "learning_rate": 1.8815918938576427e-06, + "loss": 0.2237, + "step": 2258 + }, + { + "epoch": 0.2410859535429501, + "grad_norm": 11.474072273615928, + "learning_rate": 1.8812400915699826e-06, + "loss": -0.1261, + "step": 2260 + }, + { + "epoch": 0.24129930394431554, + "grad_norm": 11.833383877747751, + "learning_rate": 1.8808878004264668e-06, + "loss": 0.4263, + "step": 2262 + }, + { + "epoch": 0.24151265434568098, + "grad_norm": 22.976330366798624, + "learning_rate": 1.8805350206225246e-06, + "loss": -1.0103, + "step": 2264 + }, + { + "epoch": 0.24172600474704642, + "grad_norm": 10.5386166784528, + "learning_rate": 1.880181752353855e-06, + "loss": -0.2876, + "step": 2266 + }, + { + "epoch": 0.24193935514841186, + "grad_norm": 8.413708416616572, + "learning_rate": 1.8798279958164294e-06, + "loss": 0.1234, + "step": 2268 + }, + { + "epoch": 0.2421527055497773, + "grad_norm": 2.9593915851857346, + "learning_rate": 1.8794737512064888e-06, + "loss": -0.5359, + "step": 2270 + }, + { + "epoch": 0.24236605595114274, + "grad_norm": 9.484434089050737, + "learning_rate": 1.8791190187205463e-06, + "loss": -0.1062, + "step": 2272 + }, + { + "epoch": 0.24257940635250821, + "grad_norm": 11.888830573780263, + "learning_rate": 1.8787637985553843e-06, + "loss": 0.2508, + "step": 2274 + }, + { + "epoch": 0.24279275675387366, + "grad_norm": 3.1868895736649017, + "learning_rate": 1.8784080909080566e-06, + "loss": -0.0724, + "step": 2276 + }, + { + "epoch": 0.2430061071552391, + "grad_norm": 13.600473840973399, + "learning_rate": 1.878051895975887e-06, + "loss": -0.2573, + "step": 2278 + }, + { + "epoch": 0.24321945755660454, + "grad_norm": 26.27419949986061, + "learning_rate": 1.8776952139564695e-06, + "loss": 0.1723, + "step": 2280 + }, + { + "epoch": 0.24343280795796998, + "grad_norm": 4.746369885066603, + "learning_rate": 1.877338045047669e-06, + "loss": -0.052, + "step": 2282 + }, + { + "epoch": 0.24364615835933542, + "grad_norm": 19.86372771706047, + "learning_rate": 1.87698038944762e-06, + "loss": -0.293, + "step": 2284 + }, + { + "epoch": 0.24385950876070087, + "grad_norm": 5.177844555605487, + "learning_rate": 1.8766222473547269e-06, + "loss": -0.1477, + "step": 2286 + }, + { + "epoch": 0.2440728591620663, + "grad_norm": 3.4158091451466683, + "learning_rate": 1.8762636189676639e-06, + "loss": -0.6331, + "step": 2288 + }, + { + "epoch": 0.24428620956343175, + "grad_norm": 15.453945464573792, + "learning_rate": 1.8759045044853756e-06, + "loss": -0.1638, + "step": 2290 + }, + { + "epoch": 0.2444995599647972, + "grad_norm": 10.596600020719738, + "learning_rate": 1.8755449041070757e-06, + "loss": -1.1852, + "step": 2292 + }, + { + "epoch": 0.24471291036616263, + "grad_norm": 14.990629186643753, + "learning_rate": 1.8751848180322474e-06, + "loss": -0.338, + "step": 2294 + }, + { + "epoch": 0.24492626076752808, + "grad_norm": 11.30656679550947, + "learning_rate": 1.8748242464606437e-06, + "loss": 0.8451, + "step": 2296 + }, + { + "epoch": 0.24513961116889352, + "grad_norm": 5.928347319065744, + "learning_rate": 1.8744631895922867e-06, + "loss": -0.713, + "step": 2298 + }, + { + "epoch": 0.24535296157025896, + "grad_norm": 6.444139189078738, + "learning_rate": 1.8741016476274675e-06, + "loss": -1.0358, + "step": 2300 + }, + { + "epoch": 0.2455663119716244, + "grad_norm": 7.9684465119036485, + "learning_rate": 1.873739620766747e-06, + "loss": -0.6313, + "step": 2302 + }, + { + "epoch": 0.24577966237298984, + "grad_norm": 5.1428053879279565, + "learning_rate": 1.8733771092109544e-06, + "loss": -0.3503, + "step": 2304 + }, + { + "epoch": 0.24599301277435529, + "grad_norm": 7.8338959307743945, + "learning_rate": 1.873014113161188e-06, + "loss": -0.8449, + "step": 2306 + }, + { + "epoch": 0.24620636317572073, + "grad_norm": 8.329030605753001, + "learning_rate": 1.872650632818815e-06, + "loss": -0.8559, + "step": 2308 + }, + { + "epoch": 0.24641971357708617, + "grad_norm": 6.004484154776549, + "learning_rate": 1.8722866683854707e-06, + "loss": -0.1221, + "step": 2310 + }, + { + "epoch": 0.2466330639784516, + "grad_norm": 18.041740979343107, + "learning_rate": 1.8719222200630603e-06, + "loss": -0.8495, + "step": 2312 + }, + { + "epoch": 0.24684641437981705, + "grad_norm": 13.8993422113462, + "learning_rate": 1.8715572880537553e-06, + "loss": -0.584, + "step": 2314 + }, + { + "epoch": 0.2470597647811825, + "grad_norm": 7.603286049786936, + "learning_rate": 1.8711918725599975e-06, + "loss": 0.409, + "step": 2316 + }, + { + "epoch": 0.24727311518254794, + "grad_norm": 12.383128947530018, + "learning_rate": 1.8708259737844963e-06, + "loss": -0.1898, + "step": 2318 + }, + { + "epoch": 0.24748646558391338, + "grad_norm": 14.117241969233739, + "learning_rate": 1.8704595919302283e-06, + "loss": 0.5188, + "step": 2320 + }, + { + "epoch": 0.24769981598527882, + "grad_norm": 5.893210917326283, + "learning_rate": 1.8700927272004394e-06, + "loss": -0.5056, + "step": 2322 + }, + { + "epoch": 0.24791316638664426, + "grad_norm": 18.31287282961217, + "learning_rate": 1.8697253797986427e-06, + "loss": 0.2288, + "step": 2324 + }, + { + "epoch": 0.2481265167880097, + "grad_norm": 11.08866098282751, + "learning_rate": 1.8693575499286189e-06, + "loss": -0.2383, + "step": 2326 + }, + { + "epoch": 0.24833986718937515, + "grad_norm": 11.779301185879273, + "learning_rate": 1.8689892377944168e-06, + "loss": 0.2183, + "step": 2328 + }, + { + "epoch": 0.2485532175907406, + "grad_norm": 10.166191617998152, + "learning_rate": 1.8686204436003523e-06, + "loss": -0.3263, + "step": 2330 + }, + { + "epoch": 0.24876656799210603, + "grad_norm": 4.784977748146536, + "learning_rate": 1.8682511675510089e-06, + "loss": -0.1976, + "step": 2332 + }, + { + "epoch": 0.24897991839347147, + "grad_norm": 3.0284113912137456, + "learning_rate": 1.8678814098512378e-06, + "loss": 0.0196, + "step": 2334 + }, + { + "epoch": 0.24919326879483691, + "grad_norm": 7.914949027770823, + "learning_rate": 1.8675111707061567e-06, + "loss": 0.669, + "step": 2336 + }, + { + "epoch": 0.24940661919620236, + "grad_norm": 19.649999873606166, + "learning_rate": 1.867140450321151e-06, + "loss": 0.4264, + "step": 2338 + }, + { + "epoch": 0.2496199695975678, + "grad_norm": 8.968509283269004, + "learning_rate": 1.866769248901872e-06, + "loss": -0.437, + "step": 2340 + }, + { + "epoch": 0.24983331999893324, + "grad_norm": 9.853570349845704, + "learning_rate": 1.8663975666542395e-06, + "loss": -0.3988, + "step": 2342 + }, + { + "epoch": 0.2500466704002987, + "grad_norm": 7.207985884633396, + "learning_rate": 1.8660254037844386e-06, + "loss": -0.0546, + "step": 2344 + }, + { + "epoch": 0.25026002080166415, + "grad_norm": 7.15372288874833, + "learning_rate": 1.8656527604989216e-06, + "loss": -0.319, + "step": 2346 + }, + { + "epoch": 0.2504733712030296, + "grad_norm": 5.494342553996051, + "learning_rate": 1.8652796370044074e-06, + "loss": -0.3355, + "step": 2348 + }, + { + "epoch": 0.25068672160439504, + "grad_norm": 25.58370443408704, + "learning_rate": 1.8649060335078813e-06, + "loss": -0.1819, + "step": 2350 + }, + { + "epoch": 0.2509000720057605, + "grad_norm": 3.5730622573588953, + "learning_rate": 1.8645319502165941e-06, + "loss": 0.9055, + "step": 2352 + }, + { + "epoch": 0.2511134224071259, + "grad_norm": 13.020477400763882, + "learning_rate": 1.8641573873380637e-06, + "loss": -0.6351, + "step": 2354 + }, + { + "epoch": 0.25132677280849136, + "grad_norm": 12.680924597312218, + "learning_rate": 1.8637823450800743e-06, + "loss": -1.0723, + "step": 2356 + }, + { + "epoch": 0.2515401232098568, + "grad_norm": 14.018213573049106, + "learning_rate": 1.8634068236506745e-06, + "loss": -0.6361, + "step": 2358 + }, + { + "epoch": 0.25175347361122224, + "grad_norm": 10.987083846791398, + "learning_rate": 1.8630308232581804e-06, + "loss": -1.6505, + "step": 2360 + }, + { + "epoch": 0.2519668240125877, + "grad_norm": 10.33719425881593, + "learning_rate": 1.8626543441111728e-06, + "loss": 0.0228, + "step": 2362 + }, + { + "epoch": 0.25218017441395313, + "grad_norm": 12.832494385336979, + "learning_rate": 1.862277386418498e-06, + "loss": 0.1092, + "step": 2364 + }, + { + "epoch": 0.25239352481531857, + "grad_norm": 11.929711951297579, + "learning_rate": 1.8618999503892688e-06, + "loss": 0.2937, + "step": 2366 + }, + { + "epoch": 0.252606875216684, + "grad_norm": 13.862139471610936, + "learning_rate": 1.8615220362328618e-06, + "loss": -0.9202, + "step": 2368 + }, + { + "epoch": 0.25282022561804945, + "grad_norm": 9.682976773402793, + "learning_rate": 1.8611436441589205e-06, + "loss": -0.617, + "step": 2370 + }, + { + "epoch": 0.2530335760194149, + "grad_norm": 5.9430895533585435, + "learning_rate": 1.8607647743773523e-06, + "loss": -0.4079, + "step": 2372 + }, + { + "epoch": 0.25324692642078034, + "grad_norm": 8.283939675455104, + "learning_rate": 1.86038542709833e-06, + "loss": -0.3549, + "step": 2374 + }, + { + "epoch": 0.2534602768221458, + "grad_norm": 4.86946504359841, + "learning_rate": 1.8600056025322914e-06, + "loss": 0.2803, + "step": 2376 + }, + { + "epoch": 0.2536736272235112, + "grad_norm": 5.169292281313504, + "learning_rate": 1.8596253008899392e-06, + "loss": -0.3912, + "step": 2378 + }, + { + "epoch": 0.25388697762487666, + "grad_norm": 11.973300956867748, + "learning_rate": 1.85924452238224e-06, + "loss": -0.7652, + "step": 2380 + }, + { + "epoch": 0.2541003280262421, + "grad_norm": 7.745942596898536, + "learning_rate": 1.858863267220426e-06, + "loss": -1.0226, + "step": 2382 + }, + { + "epoch": 0.25431367842760755, + "grad_norm": 5.509556435115394, + "learning_rate": 1.8584815356159932e-06, + "loss": 1.0754, + "step": 2384 + }, + { + "epoch": 0.254527028828973, + "grad_norm": 15.406258747843344, + "learning_rate": 1.8580993277807014e-06, + "loss": -0.0632, + "step": 2386 + }, + { + "epoch": 0.25474037923033843, + "grad_norm": 14.010785290828032, + "learning_rate": 1.8577166439265754e-06, + "loss": -0.4037, + "step": 2388 + }, + { + "epoch": 0.2549537296317039, + "grad_norm": 5.69097663391526, + "learning_rate": 1.8573334842659043e-06, + "loss": -0.3808, + "step": 2390 + }, + { + "epoch": 0.2551670800330693, + "grad_norm": 5.524540431896903, + "learning_rate": 1.8569498490112402e-06, + "loss": -0.7019, + "step": 2392 + }, + { + "epoch": 0.25538043043443476, + "grad_norm": 16.252506033736484, + "learning_rate": 1.8565657383753997e-06, + "loss": -0.203, + "step": 2394 + }, + { + "epoch": 0.2555937808358002, + "grad_norm": 16.390311743520662, + "learning_rate": 1.8561811525714628e-06, + "loss": -0.7718, + "step": 2396 + }, + { + "epoch": 0.25580713123716564, + "grad_norm": 9.509455098271982, + "learning_rate": 1.8557960918127732e-06, + "loss": -1.2194, + "step": 2398 + }, + { + "epoch": 0.2560204816385311, + "grad_norm": 10.362931031584816, + "learning_rate": 1.8554105563129382e-06, + "loss": -0.5091, + "step": 2400 + } + ], + "logging_steps": 2, + "max_steps": 9374, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 206064315039744.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}