diff --git "a/ARM-7B-Pruned/checkpoint-1200/trainer_state.json" "b/ARM-7B-Pruned/checkpoint-1200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/ARM-7B-Pruned/checkpoint-1200/trainer_state.json" @@ -0,0 +1,4233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.12801024081926554, + "eval_steps": 50000, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00021335040136544257, + "grad_norm": 42.684086970967954, + "learning_rate": 4.264392324093816e-09, + "loss": 0.5986, + "step": 2 + }, + { + "epoch": 0.00042670080273088514, + "grad_norm": 42.537918257537875, + "learning_rate": 8.528784648187632e-09, + "loss": 0.6957, + "step": 4 + }, + { + "epoch": 0.0006400512040963277, + "grad_norm": 58.765999906046574, + "learning_rate": 1.2793176972281448e-08, + "loss": 0.047, + "step": 6 + }, + { + "epoch": 0.0008534016054617703, + "grad_norm": 154.70890416675192, + "learning_rate": 1.7057569296375264e-08, + "loss": -0.6682, + "step": 8 + }, + { + "epoch": 0.0010667520068272129, + "grad_norm": 315.97286091458943, + "learning_rate": 2.1321961620469082e-08, + "loss": 0.0971, + "step": 10 + }, + { + "epoch": 0.0012801024081926554, + "grad_norm": 102.37154058019046, + "learning_rate": 2.5586353944562897e-08, + "loss": 1.0541, + "step": 12 + }, + { + "epoch": 0.001493452809558098, + "grad_norm": 105.127529829733, + "learning_rate": 2.9850746268656714e-08, + "loss": 0.4648, + "step": 14 + }, + { + "epoch": 0.0017068032109235406, + "grad_norm": 48.013196159706226, + "learning_rate": 3.411513859275053e-08, + "loss": -0.4529, + "step": 16 + }, + { + "epoch": 0.0019201536122889832, + "grad_norm": 55.69084727420936, + "learning_rate": 3.837953091684435e-08, + "loss": -0.1927, + "step": 18 + }, + { + "epoch": 0.0021335040136544257, + "grad_norm": 33.25875332287273, + "learning_rate": 4.2643923240938164e-08, + "loss": 0.2517, + "step": 20 + }, + { + "epoch": 0.002346854415019868, + "grad_norm": 245.94604658609376, + "learning_rate": 4.6908315565031985e-08, + "loss": -0.8646, + "step": 22 + }, + { + "epoch": 0.002560204816385311, + "grad_norm": 33.35694659794871, + "learning_rate": 5.117270788912579e-08, + "loss": 0.7383, + "step": 24 + }, + { + "epoch": 0.0027735552177507532, + "grad_norm": 193.28196705366508, + "learning_rate": 5.5437100213219614e-08, + "loss": -0.9655, + "step": 26 + }, + { + "epoch": 0.002986905619116196, + "grad_norm": 50.147398314622684, + "learning_rate": 5.970149253731343e-08, + "loss": -0.5218, + "step": 28 + }, + { + "epoch": 0.0032002560204816384, + "grad_norm": 75.21413198410397, + "learning_rate": 6.396588486140725e-08, + "loss": 0.1629, + "step": 30 + }, + { + "epoch": 0.003413606421847081, + "grad_norm": 103.01315028563378, + "learning_rate": 6.823027718550106e-08, + "loss": -0.7119, + "step": 32 + }, + { + "epoch": 0.0036269568232125235, + "grad_norm": 36.63545743331723, + "learning_rate": 7.249466950959488e-08, + "loss": 0.3538, + "step": 34 + }, + { + "epoch": 0.0038403072245779663, + "grad_norm": 39.002709650329486, + "learning_rate": 7.67590618336887e-08, + "loss": -0.6754, + "step": 36 + }, + { + "epoch": 0.004053657625943409, + "grad_norm": 23.502647009611, + "learning_rate": 8.102345415778252e-08, + "loss": 0.1865, + "step": 38 + }, + { + "epoch": 0.0042670080273088514, + "grad_norm": 26.139670413986497, + "learning_rate": 8.528784648187633e-08, + "loss": -0.3739, + "step": 40 + }, + { + "epoch": 0.004480358428674294, + "grad_norm": 30.02660527440867, + "learning_rate": 8.955223880597014e-08, + "loss": -0.0359, + "step": 42 + }, + { + "epoch": 0.004693708830039736, + "grad_norm": 13.980962042551367, + "learning_rate": 9.381663113006397e-08, + "loss": 0.4169, + "step": 44 + }, + { + "epoch": 0.004907059231405179, + "grad_norm": 94.42191623111091, + "learning_rate": 9.808102345415778e-08, + "loss": -1.4962, + "step": 46 + }, + { + "epoch": 0.005120409632770622, + "grad_norm": 36.146349198898875, + "learning_rate": 1.0234541577825159e-07, + "loss": 0.8652, + "step": 48 + }, + { + "epoch": 0.0053337600341360645, + "grad_norm": 34.112375662585286, + "learning_rate": 1.0660980810234541e-07, + "loss": 0.4556, + "step": 50 + }, + { + "epoch": 0.0055471104355015064, + "grad_norm": 54.41510979168854, + "learning_rate": 1.1087420042643923e-07, + "loss": 0.2452, + "step": 52 + }, + { + "epoch": 0.005760460836866949, + "grad_norm": 27.31961709504184, + "learning_rate": 1.1513859275053305e-07, + "loss": 0.3131, + "step": 54 + }, + { + "epoch": 0.005973811238232392, + "grad_norm": 77.00896363994629, + "learning_rate": 1.1940298507462686e-07, + "loss": -0.0289, + "step": 56 + }, + { + "epoch": 0.006187161639597835, + "grad_norm": 21.8781170195382, + "learning_rate": 1.2366737739872068e-07, + "loss": -0.6188, + "step": 58 + }, + { + "epoch": 0.006400512040963277, + "grad_norm": 16.368843761461356, + "learning_rate": 1.279317697228145e-07, + "loss": 0.2618, + "step": 60 + }, + { + "epoch": 0.0066138624423287195, + "grad_norm": 17.051752922162727, + "learning_rate": 1.3219616204690832e-07, + "loss": -0.1717, + "step": 62 + }, + { + "epoch": 0.006827212843694162, + "grad_norm": 99.13064855085295, + "learning_rate": 1.3646055437100212e-07, + "loss": -0.449, + "step": 64 + }, + { + "epoch": 0.007040563245059605, + "grad_norm": 136.3486847826033, + "learning_rate": 1.4072494669509594e-07, + "loss": 0.2984, + "step": 66 + }, + { + "epoch": 0.007253913646425047, + "grad_norm": 62.554463250965604, + "learning_rate": 1.4498933901918976e-07, + "loss": -0.2482, + "step": 68 + }, + { + "epoch": 0.00746726404779049, + "grad_norm": 51.515904581548206, + "learning_rate": 1.4925373134328355e-07, + "loss": -0.8223, + "step": 70 + }, + { + "epoch": 0.007680614449155933, + "grad_norm": 113.86732871533368, + "learning_rate": 1.535181236673774e-07, + "loss": -0.0383, + "step": 72 + }, + { + "epoch": 0.007893964850521375, + "grad_norm": 26.087446116741575, + "learning_rate": 1.5778251599147122e-07, + "loss": 0.7253, + "step": 74 + }, + { + "epoch": 0.008107315251886817, + "grad_norm": 24.131096654425452, + "learning_rate": 1.6204690831556504e-07, + "loss": -0.0834, + "step": 76 + }, + { + "epoch": 0.008320665653252261, + "grad_norm": 74.42536774731673, + "learning_rate": 1.6631130063965884e-07, + "loss": -0.2997, + "step": 78 + }, + { + "epoch": 0.008534016054617703, + "grad_norm": 71.52938681964571, + "learning_rate": 1.7057569296375266e-07, + "loss": 0.0944, + "step": 80 + }, + { + "epoch": 0.008747366455983145, + "grad_norm": 29.241243114282817, + "learning_rate": 1.7484008528784648e-07, + "loss": -0.7054, + "step": 82 + }, + { + "epoch": 0.008960716857348588, + "grad_norm": 134.58638764428474, + "learning_rate": 1.7910447761194027e-07, + "loss": -0.338, + "step": 84 + }, + { + "epoch": 0.00917406725871403, + "grad_norm": 109.66885361053902, + "learning_rate": 1.8336886993603412e-07, + "loss": -0.1817, + "step": 86 + }, + { + "epoch": 0.009387417660079472, + "grad_norm": 27.969535103995202, + "learning_rate": 1.8763326226012794e-07, + "loss": -0.9258, + "step": 88 + }, + { + "epoch": 0.009600768061444916, + "grad_norm": 27.49676445944368, + "learning_rate": 1.9189765458422174e-07, + "loss": 0.3309, + "step": 90 + }, + { + "epoch": 0.009814118462810358, + "grad_norm": 25.544238337228855, + "learning_rate": 1.9616204690831556e-07, + "loss": -0.0894, + "step": 92 + }, + { + "epoch": 0.010027468864175802, + "grad_norm": 26.09909186001115, + "learning_rate": 2.0042643923240938e-07, + "loss": -0.1209, + "step": 94 + }, + { + "epoch": 0.010240819265541243, + "grad_norm": 32.632466828944544, + "learning_rate": 2.0469083155650317e-07, + "loss": 0.4736, + "step": 96 + }, + { + "epoch": 0.010454169666906685, + "grad_norm": 50.68481961380247, + "learning_rate": 2.08955223880597e-07, + "loss": -0.6321, + "step": 98 + }, + { + "epoch": 0.010667520068272129, + "grad_norm": 25.00088670078768, + "learning_rate": 2.1321961620469082e-07, + "loss": -0.3787, + "step": 100 + }, + { + "epoch": 0.010880870469637571, + "grad_norm": 21.25939551515176, + "learning_rate": 2.1748400852878466e-07, + "loss": 0.1969, + "step": 102 + }, + { + "epoch": 0.011094220871003013, + "grad_norm": 23.873098687717384, + "learning_rate": 2.2174840085287846e-07, + "loss": -0.0885, + "step": 104 + }, + { + "epoch": 0.011307571272368457, + "grad_norm": 17.01811754764777, + "learning_rate": 2.2601279317697228e-07, + "loss": 0.0203, + "step": 106 + }, + { + "epoch": 0.011520921673733898, + "grad_norm": 31.009663505522784, + "learning_rate": 2.302771855010661e-07, + "loss": -0.817, + "step": 108 + }, + { + "epoch": 0.011734272075099342, + "grad_norm": 33.200887615423774, + "learning_rate": 2.345415778251599e-07, + "loss": 0.0471, + "step": 110 + }, + { + "epoch": 0.011947622476464784, + "grad_norm": 32.63042973424626, + "learning_rate": 2.388059701492537e-07, + "loss": -1.5996, + "step": 112 + }, + { + "epoch": 0.012160972877830226, + "grad_norm": 30.22918533639733, + "learning_rate": 2.4307036247334754e-07, + "loss": -0.2466, + "step": 114 + }, + { + "epoch": 0.01237432327919567, + "grad_norm": 27.145400816572458, + "learning_rate": 2.4733475479744136e-07, + "loss": -0.3294, + "step": 116 + }, + { + "epoch": 0.012587673680561112, + "grad_norm": 19.75203303790041, + "learning_rate": 2.515991471215352e-07, + "loss": -0.7427, + "step": 118 + }, + { + "epoch": 0.012801024081926553, + "grad_norm": 16.441388594403527, + "learning_rate": 2.55863539445629e-07, + "loss": -0.2637, + "step": 120 + }, + { + "epoch": 0.013014374483291997, + "grad_norm": 23.81603976262189, + "learning_rate": 2.601279317697228e-07, + "loss": -0.07, + "step": 122 + }, + { + "epoch": 0.013227724884657439, + "grad_norm": 80.42145460011395, + "learning_rate": 2.6439232409381664e-07, + "loss": -0.1647, + "step": 124 + }, + { + "epoch": 0.013441075286022881, + "grad_norm": 28.328954293896295, + "learning_rate": 2.686567164179104e-07, + "loss": -0.1786, + "step": 126 + }, + { + "epoch": 0.013654425687388325, + "grad_norm": 44.29342497352258, + "learning_rate": 2.7292110874200423e-07, + "loss": 0.6337, + "step": 128 + }, + { + "epoch": 0.013867776088753767, + "grad_norm": 14.247744370026695, + "learning_rate": 2.7718550106609805e-07, + "loss": -0.7468, + "step": 130 + }, + { + "epoch": 0.01408112649011921, + "grad_norm": 30.909935252567514, + "learning_rate": 2.8144989339019187e-07, + "loss": 0.2444, + "step": 132 + }, + { + "epoch": 0.014294476891484652, + "grad_norm": 19.951382058960338, + "learning_rate": 2.857142857142857e-07, + "loss": -0.0888, + "step": 134 + }, + { + "epoch": 0.014507827292850094, + "grad_norm": 19.451243521855947, + "learning_rate": 2.899786780383795e-07, + "loss": 0.0622, + "step": 136 + }, + { + "epoch": 0.014721177694215538, + "grad_norm": 44.543818203511506, + "learning_rate": 2.9424307036247334e-07, + "loss": -0.5024, + "step": 138 + }, + { + "epoch": 0.01493452809558098, + "grad_norm": 33.99708212815377, + "learning_rate": 2.985074626865671e-07, + "loss": 0.3096, + "step": 140 + }, + { + "epoch": 0.015147878496946422, + "grad_norm": 23.05447566893894, + "learning_rate": 3.02771855010661e-07, + "loss": 0.3329, + "step": 142 + }, + { + "epoch": 0.015361228898311865, + "grad_norm": 16.88015370164898, + "learning_rate": 3.070362473347548e-07, + "loss": -0.1371, + "step": 144 + }, + { + "epoch": 0.015574579299677307, + "grad_norm": 26.13573562942401, + "learning_rate": 3.113006396588486e-07, + "loss": -0.4183, + "step": 146 + }, + { + "epoch": 0.01578792970104275, + "grad_norm": 22.309803740611596, + "learning_rate": 3.1556503198294244e-07, + "loss": -0.3246, + "step": 148 + }, + { + "epoch": 0.016001280102408193, + "grad_norm": 22.52244155087573, + "learning_rate": 3.1982942430703626e-07, + "loss": -0.5349, + "step": 150 + }, + { + "epoch": 0.016214630503773635, + "grad_norm": 18.473624086858514, + "learning_rate": 3.240938166311301e-07, + "loss": 0.2352, + "step": 152 + }, + { + "epoch": 0.016427980905139077, + "grad_norm": 7.7748886163241355, + "learning_rate": 3.2835820895522385e-07, + "loss": 0.2151, + "step": 154 + }, + { + "epoch": 0.016641331306504522, + "grad_norm": 10.802467734897878, + "learning_rate": 3.3262260127931767e-07, + "loss": 0.2848, + "step": 156 + }, + { + "epoch": 0.016854681707869964, + "grad_norm": 18.162628483420374, + "learning_rate": 3.368869936034115e-07, + "loss": -0.168, + "step": 158 + }, + { + "epoch": 0.017068032109235406, + "grad_norm": 40.528567461071695, + "learning_rate": 3.411513859275053e-07, + "loss": -0.4126, + "step": 160 + }, + { + "epoch": 0.017281382510600848, + "grad_norm": 35.36550572570942, + "learning_rate": 3.4541577825159914e-07, + "loss": 0.0287, + "step": 162 + }, + { + "epoch": 0.01749473291196629, + "grad_norm": 10.892875403462355, + "learning_rate": 3.4968017057569296e-07, + "loss": -0.613, + "step": 164 + }, + { + "epoch": 0.01770808331333173, + "grad_norm": 27.141686915753116, + "learning_rate": 3.539445628997867e-07, + "loss": -0.9305, + "step": 166 + }, + { + "epoch": 0.017921433714697177, + "grad_norm": 37.37206055456951, + "learning_rate": 3.5820895522388055e-07, + "loss": -0.3178, + "step": 168 + }, + { + "epoch": 0.01813478411606262, + "grad_norm": 52.89469341474386, + "learning_rate": 3.6247334754797437e-07, + "loss": -0.6419, + "step": 170 + }, + { + "epoch": 0.01834813451742806, + "grad_norm": 23.215765487324806, + "learning_rate": 3.6673773987206824e-07, + "loss": -0.0533, + "step": 172 + }, + { + "epoch": 0.018561484918793503, + "grad_norm": 21.171847082083467, + "learning_rate": 3.7100213219616206e-07, + "loss": -1.0652, + "step": 174 + }, + { + "epoch": 0.018774835320158945, + "grad_norm": 17.335722663927694, + "learning_rate": 3.752665245202559e-07, + "loss": -0.169, + "step": 176 + }, + { + "epoch": 0.01898818572152439, + "grad_norm": 13.15039858318736, + "learning_rate": 3.795309168443497e-07, + "loss": 0.4414, + "step": 178 + }, + { + "epoch": 0.019201536122889832, + "grad_norm": 9.293181092571949, + "learning_rate": 3.8379530916844347e-07, + "loss": 0.1714, + "step": 180 + }, + { + "epoch": 0.019414886524255274, + "grad_norm": 27.253514499053658, + "learning_rate": 3.880597014925373e-07, + "loss": -0.3719, + "step": 182 + }, + { + "epoch": 0.019628236925620716, + "grad_norm": 11.179212679495947, + "learning_rate": 3.923240938166311e-07, + "loss": -1.5775, + "step": 184 + }, + { + "epoch": 0.019841587326986158, + "grad_norm": 28.002238164167814, + "learning_rate": 3.9658848614072494e-07, + "loss": -0.9116, + "step": 186 + }, + { + "epoch": 0.020054937728351603, + "grad_norm": 20.304811169854457, + "learning_rate": 4.0085287846481876e-07, + "loss": -0.6315, + "step": 188 + }, + { + "epoch": 0.020268288129717045, + "grad_norm": 43.26972545321108, + "learning_rate": 4.051172707889126e-07, + "loss": 0.0818, + "step": 190 + }, + { + "epoch": 0.020481638531082487, + "grad_norm": 23.280256775268967, + "learning_rate": 4.0938166311300635e-07, + "loss": -0.2754, + "step": 192 + }, + { + "epoch": 0.02069498893244793, + "grad_norm": 25.086595962784372, + "learning_rate": 4.1364605543710017e-07, + "loss": 0.1783, + "step": 194 + }, + { + "epoch": 0.02090833933381337, + "grad_norm": 17.929854339567687, + "learning_rate": 4.17910447761194e-07, + "loss": -0.3967, + "step": 196 + }, + { + "epoch": 0.021121689735178813, + "grad_norm": 26.684280905377257, + "learning_rate": 4.221748400852878e-07, + "loss": -0.2432, + "step": 198 + }, + { + "epoch": 0.021335040136544258, + "grad_norm": 20.811875320249584, + "learning_rate": 4.2643923240938163e-07, + "loss": -0.6011, + "step": 200 + }, + { + "epoch": 0.0215483905379097, + "grad_norm": 28.776367397287462, + "learning_rate": 4.3070362473347545e-07, + "loss": -1.6853, + "step": 202 + }, + { + "epoch": 0.021761740939275142, + "grad_norm": 22.23424675819631, + "learning_rate": 4.349680170575693e-07, + "loss": 0.2222, + "step": 204 + }, + { + "epoch": 0.021975091340640584, + "grad_norm": 15.408003573608147, + "learning_rate": 4.392324093816631e-07, + "loss": -0.0164, + "step": 206 + }, + { + "epoch": 0.022188441742006026, + "grad_norm": 8.60164597442568, + "learning_rate": 4.434968017057569e-07, + "loss": -0.5671, + "step": 208 + }, + { + "epoch": 0.02240179214337147, + "grad_norm": 10.343966416597475, + "learning_rate": 4.4776119402985074e-07, + "loss": -0.815, + "step": 210 + }, + { + "epoch": 0.022615142544736913, + "grad_norm": 15.741330211675757, + "learning_rate": 4.5202558635394456e-07, + "loss": -1.0932, + "step": 212 + }, + { + "epoch": 0.022828492946102355, + "grad_norm": 16.39054521840376, + "learning_rate": 4.562899786780384e-07, + "loss": -1.7177, + "step": 214 + }, + { + "epoch": 0.023041843347467797, + "grad_norm": 11.781875373813582, + "learning_rate": 4.605543710021322e-07, + "loss": 0.3606, + "step": 216 + }, + { + "epoch": 0.02325519374883324, + "grad_norm": 10.097449683599512, + "learning_rate": 4.64818763326226e-07, + "loss": -0.7045, + "step": 218 + }, + { + "epoch": 0.023468544150198684, + "grad_norm": 15.715275415653137, + "learning_rate": 4.690831556503198e-07, + "loss": -0.0658, + "step": 220 + }, + { + "epoch": 0.023681894551564126, + "grad_norm": 26.933594441832767, + "learning_rate": 4.733475479744136e-07, + "loss": -0.1298, + "step": 222 + }, + { + "epoch": 0.023895244952929568, + "grad_norm": 8.712912536861461, + "learning_rate": 4.776119402985074e-07, + "loss": 0.095, + "step": 224 + }, + { + "epoch": 0.02410859535429501, + "grad_norm": 21.120479987680937, + "learning_rate": 4.818763326226012e-07, + "loss": -0.0818, + "step": 226 + }, + { + "epoch": 0.024321945755660452, + "grad_norm": 10.071445207074044, + "learning_rate": 4.861407249466951e-07, + "loss": -0.1419, + "step": 228 + }, + { + "epoch": 0.024535296157025894, + "grad_norm": 10.801668877306875, + "learning_rate": 4.904051172707888e-07, + "loss": -0.3432, + "step": 230 + }, + { + "epoch": 0.02474864655839134, + "grad_norm": 13.692305863205236, + "learning_rate": 4.946695095948827e-07, + "loss": -0.1519, + "step": 232 + }, + { + "epoch": 0.02496199695975678, + "grad_norm": 22.11353305341118, + "learning_rate": 4.989339019189765e-07, + "loss": -0.0445, + "step": 234 + }, + { + "epoch": 0.025175347361122223, + "grad_norm": 25.883156385523233, + "learning_rate": 5.031982942430704e-07, + "loss": -0.4949, + "step": 236 + }, + { + "epoch": 0.025388697762487665, + "grad_norm": 61.080118987133744, + "learning_rate": 5.074626865671642e-07, + "loss": -1.1921, + "step": 238 + }, + { + "epoch": 0.025602048163853107, + "grad_norm": 9.282902236974593, + "learning_rate": 5.11727078891258e-07, + "loss": 0.1989, + "step": 240 + }, + { + "epoch": 0.025815398565218552, + "grad_norm": 9.781179043931736, + "learning_rate": 5.159914712153518e-07, + "loss": 0.9967, + "step": 242 + }, + { + "epoch": 0.026028748966583994, + "grad_norm": 12.216217874660604, + "learning_rate": 5.202558635394456e-07, + "loss": -0.3518, + "step": 244 + }, + { + "epoch": 0.026242099367949436, + "grad_norm": 24.42587736751593, + "learning_rate": 5.245202558635394e-07, + "loss": -1.0814, + "step": 246 + }, + { + "epoch": 0.026455449769314878, + "grad_norm": 38.12352896449889, + "learning_rate": 5.287846481876333e-07, + "loss": 0.0811, + "step": 248 + }, + { + "epoch": 0.02666880017068032, + "grad_norm": 24.52163737825959, + "learning_rate": 5.33049040511727e-07, + "loss": -0.8824, + "step": 250 + }, + { + "epoch": 0.026882150572045762, + "grad_norm": 19.231993611076483, + "learning_rate": 5.373134328358208e-07, + "loss": 0.5571, + "step": 252 + }, + { + "epoch": 0.027095500973411207, + "grad_norm": 17.129505372417224, + "learning_rate": 5.415778251599147e-07, + "loss": -0.6865, + "step": 254 + }, + { + "epoch": 0.02730885137477665, + "grad_norm": 19.406038647540843, + "learning_rate": 5.458422174840085e-07, + "loss": -0.4991, + "step": 256 + }, + { + "epoch": 0.02752220177614209, + "grad_norm": 6.847722005977987, + "learning_rate": 5.501066098081023e-07, + "loss": 0.1979, + "step": 258 + }, + { + "epoch": 0.027735552177507533, + "grad_norm": 13.405288361475698, + "learning_rate": 5.543710021321961e-07, + "loss": -0.3812, + "step": 260 + }, + { + "epoch": 0.027948902578872975, + "grad_norm": 21.529785897303153, + "learning_rate": 5.5863539445629e-07, + "loss": -0.6946, + "step": 262 + }, + { + "epoch": 0.02816225298023842, + "grad_norm": 14.20190258955381, + "learning_rate": 5.628997867803837e-07, + "loss": -0.7182, + "step": 264 + }, + { + "epoch": 0.028375603381603862, + "grad_norm": 20.09792579726273, + "learning_rate": 5.671641791044775e-07, + "loss": -0.2583, + "step": 266 + }, + { + "epoch": 0.028588953782969304, + "grad_norm": 17.213177542585473, + "learning_rate": 5.714285714285714e-07, + "loss": 0.3218, + "step": 268 + }, + { + "epoch": 0.028802304184334746, + "grad_norm": 11.185587231889192, + "learning_rate": 5.756929637526652e-07, + "loss": -0.0663, + "step": 270 + }, + { + "epoch": 0.029015654585700188, + "grad_norm": 27.722334316990437, + "learning_rate": 5.79957356076759e-07, + "loss": 0.1254, + "step": 272 + }, + { + "epoch": 0.029229004987065633, + "grad_norm": 20.49167387185228, + "learning_rate": 5.842217484008528e-07, + "loss": -0.3077, + "step": 274 + }, + { + "epoch": 0.029442355388431075, + "grad_norm": 17.084103781873182, + "learning_rate": 5.884861407249467e-07, + "loss": -1.0397, + "step": 276 + }, + { + "epoch": 0.029655705789796517, + "grad_norm": 23.870809301137577, + "learning_rate": 5.927505330490404e-07, + "loss": -0.401, + "step": 278 + }, + { + "epoch": 0.02986905619116196, + "grad_norm": 11.313699213427757, + "learning_rate": 5.970149253731342e-07, + "loss": -1.1555, + "step": 280 + }, + { + "epoch": 0.0300824065925274, + "grad_norm": 9.331131563295791, + "learning_rate": 6.012793176972282e-07, + "loss": -0.5128, + "step": 282 + }, + { + "epoch": 0.030295756993892843, + "grad_norm": 10.71153796479133, + "learning_rate": 6.05543710021322e-07, + "loss": 0.239, + "step": 284 + }, + { + "epoch": 0.03050910739525829, + "grad_norm": 28.31484795145744, + "learning_rate": 6.098081023454158e-07, + "loss": -0.2084, + "step": 286 + }, + { + "epoch": 0.03072245779662373, + "grad_norm": 11.917796878106765, + "learning_rate": 6.140724946695096e-07, + "loss": -0.2701, + "step": 288 + }, + { + "epoch": 0.030935808197989172, + "grad_norm": 15.31345988143714, + "learning_rate": 6.183368869936035e-07, + "loss": -0.9096, + "step": 290 + }, + { + "epoch": 0.031149158599354614, + "grad_norm": 49.77491988047747, + "learning_rate": 6.226012793176972e-07, + "loss": 0.1465, + "step": 292 + }, + { + "epoch": 0.03136250900072006, + "grad_norm": 12.884205486410131, + "learning_rate": 6.26865671641791e-07, + "loss": 0.5899, + "step": 294 + }, + { + "epoch": 0.0315758594020855, + "grad_norm": 22.77696725291766, + "learning_rate": 6.311300639658849e-07, + "loss": -0.3255, + "step": 296 + }, + { + "epoch": 0.031789209803450943, + "grad_norm": 14.760034845954518, + "learning_rate": 6.353944562899787e-07, + "loss": 0.2191, + "step": 298 + }, + { + "epoch": 0.032002560204816385, + "grad_norm": 16.04873208614678, + "learning_rate": 6.396588486140725e-07, + "loss": 0.9425, + "step": 300 + }, + { + "epoch": 0.03221591060618183, + "grad_norm": 21.948692800323336, + "learning_rate": 6.439232409381663e-07, + "loss": 0.4721, + "step": 302 + }, + { + "epoch": 0.03242926100754727, + "grad_norm": 17.475770533963207, + "learning_rate": 6.481876332622602e-07, + "loss": -0.8516, + "step": 304 + }, + { + "epoch": 0.03264261140891271, + "grad_norm": 21.938554689368157, + "learning_rate": 6.524520255863539e-07, + "loss": 0.1675, + "step": 306 + }, + { + "epoch": 0.03285596181027815, + "grad_norm": 46.95903237109459, + "learning_rate": 6.567164179104477e-07, + "loss": 0.7013, + "step": 308 + }, + { + "epoch": 0.033069312211643595, + "grad_norm": 21.7738424183671, + "learning_rate": 6.609808102345416e-07, + "loss": -0.8641, + "step": 310 + }, + { + "epoch": 0.033282662613009044, + "grad_norm": 20.818736283324778, + "learning_rate": 6.652452025586353e-07, + "loss": -0.2203, + "step": 312 + }, + { + "epoch": 0.033496013014374486, + "grad_norm": 12.191027612540129, + "learning_rate": 6.695095948827292e-07, + "loss": -0.2385, + "step": 314 + }, + { + "epoch": 0.03370936341573993, + "grad_norm": 17.485900290878302, + "learning_rate": 6.73773987206823e-07, + "loss": -0.6482, + "step": 316 + }, + { + "epoch": 0.03392271381710537, + "grad_norm": 12.365985112237825, + "learning_rate": 6.780383795309168e-07, + "loss": -0.0618, + "step": 318 + }, + { + "epoch": 0.03413606421847081, + "grad_norm": 19.889441682727426, + "learning_rate": 6.823027718550106e-07, + "loss": 0.255, + "step": 320 + }, + { + "epoch": 0.034349414619836253, + "grad_norm": 19.5015710993416, + "learning_rate": 6.865671641791044e-07, + "loss": -0.6494, + "step": 322 + }, + { + "epoch": 0.034562765021201695, + "grad_norm": 21.45614945706602, + "learning_rate": 6.908315565031983e-07, + "loss": -0.4695, + "step": 324 + }, + { + "epoch": 0.03477611542256714, + "grad_norm": 11.714892632525226, + "learning_rate": 6.95095948827292e-07, + "loss": -0.6134, + "step": 326 + }, + { + "epoch": 0.03498946582393258, + "grad_norm": 17.08341230963109, + "learning_rate": 6.993603411513859e-07, + "loss": -0.9212, + "step": 328 + }, + { + "epoch": 0.03520281622529802, + "grad_norm": 18.71372293407707, + "learning_rate": 7.036247334754797e-07, + "loss": -0.1783, + "step": 330 + }, + { + "epoch": 0.03541616662666346, + "grad_norm": 40.394000763247476, + "learning_rate": 7.078891257995734e-07, + "loss": -0.1696, + "step": 332 + }, + { + "epoch": 0.03562951702802891, + "grad_norm": 16.851717713795054, + "learning_rate": 7.121535181236673e-07, + "loss": -0.168, + "step": 334 + }, + { + "epoch": 0.035842867429394354, + "grad_norm": 20.980121165459167, + "learning_rate": 7.164179104477611e-07, + "loss": -0.1898, + "step": 336 + }, + { + "epoch": 0.036056217830759796, + "grad_norm": 5.128039309618372, + "learning_rate": 7.20682302771855e-07, + "loss": -0.3406, + "step": 338 + }, + { + "epoch": 0.03626956823212524, + "grad_norm": 17.130506401349365, + "learning_rate": 7.249466950959487e-07, + "loss": -0.2751, + "step": 340 + }, + { + "epoch": 0.03648291863349068, + "grad_norm": 17.383816800029777, + "learning_rate": 7.292110874200426e-07, + "loss": -0.0005, + "step": 342 + }, + { + "epoch": 0.03669626903485612, + "grad_norm": 43.198248806456, + "learning_rate": 7.334754797441365e-07, + "loss": -0.2181, + "step": 344 + }, + { + "epoch": 0.036909619436221563, + "grad_norm": 16.46060851996884, + "learning_rate": 7.377398720682303e-07, + "loss": 0.2075, + "step": 346 + }, + { + "epoch": 0.037122969837587005, + "grad_norm": 12.986350336767204, + "learning_rate": 7.420042643923241e-07, + "loss": -0.5395, + "step": 348 + }, + { + "epoch": 0.03733632023895245, + "grad_norm": 19.883036986347154, + "learning_rate": 7.462686567164179e-07, + "loss": -0.047, + "step": 350 + }, + { + "epoch": 0.03754967064031789, + "grad_norm": 8.289068924690984, + "learning_rate": 7.505330490405118e-07, + "loss": 0.9832, + "step": 352 + }, + { + "epoch": 0.03776302104168334, + "grad_norm": 10.173680365988295, + "learning_rate": 7.547974413646055e-07, + "loss": -0.4718, + "step": 354 + }, + { + "epoch": 0.03797637144304878, + "grad_norm": 29.406476571062164, + "learning_rate": 7.590618336886994e-07, + "loss": -0.3167, + "step": 356 + }, + { + "epoch": 0.03818972184441422, + "grad_norm": 27.934726009197963, + "learning_rate": 7.633262260127932e-07, + "loss": -0.1972, + "step": 358 + }, + { + "epoch": 0.038403072245779664, + "grad_norm": 25.076744001467002, + "learning_rate": 7.675906183368869e-07, + "loss": 0.2433, + "step": 360 + }, + { + "epoch": 0.038616422647145106, + "grad_norm": 17.034753146666695, + "learning_rate": 7.718550106609808e-07, + "loss": -0.6862, + "step": 362 + }, + { + "epoch": 0.03882977304851055, + "grad_norm": 30.28266067136412, + "learning_rate": 7.761194029850746e-07, + "loss": 0.0555, + "step": 364 + }, + { + "epoch": 0.03904312344987599, + "grad_norm": 15.433190582955902, + "learning_rate": 7.803837953091685e-07, + "loss": -0.6999, + "step": 366 + }, + { + "epoch": 0.03925647385124143, + "grad_norm": 16.705763291209895, + "learning_rate": 7.846481876332622e-07, + "loss": 0.145, + "step": 368 + }, + { + "epoch": 0.039469824252606873, + "grad_norm": 7.207700811021795, + "learning_rate": 7.889125799573561e-07, + "loss": 0.4911, + "step": 370 + }, + { + "epoch": 0.039683174653972315, + "grad_norm": 12.58245126026889, + "learning_rate": 7.931769722814499e-07, + "loss": -0.4105, + "step": 372 + }, + { + "epoch": 0.03989652505533776, + "grad_norm": 9.727684047283205, + "learning_rate": 7.974413646055436e-07, + "loss": -0.4456, + "step": 374 + }, + { + "epoch": 0.040109875456703206, + "grad_norm": 37.82883818815438, + "learning_rate": 8.017057569296375e-07, + "loss": -1.003, + "step": 376 + }, + { + "epoch": 0.04032322585806865, + "grad_norm": 9.205831543930358, + "learning_rate": 8.059701492537313e-07, + "loss": -0.9034, + "step": 378 + }, + { + "epoch": 0.04053657625943409, + "grad_norm": 17.5439627015, + "learning_rate": 8.102345415778252e-07, + "loss": -0.3568, + "step": 380 + }, + { + "epoch": 0.04074992666079953, + "grad_norm": 15.109693571978122, + "learning_rate": 8.144989339019189e-07, + "loss": -0.3776, + "step": 382 + }, + { + "epoch": 0.040963277062164974, + "grad_norm": 10.552742457862886, + "learning_rate": 8.187633262260127e-07, + "loss": 0.2015, + "step": 384 + }, + { + "epoch": 0.041176627463530416, + "grad_norm": 19.08793342231466, + "learning_rate": 8.230277185501066e-07, + "loss": -1.4421, + "step": 386 + }, + { + "epoch": 0.04138997786489586, + "grad_norm": 34.056443898662884, + "learning_rate": 8.272921108742003e-07, + "loss": -0.197, + "step": 388 + }, + { + "epoch": 0.0416033282662613, + "grad_norm": 7.384540107176898, + "learning_rate": 8.315565031982942e-07, + "loss": 0.7544, + "step": 390 + }, + { + "epoch": 0.04181667866762674, + "grad_norm": 23.683836639585067, + "learning_rate": 8.35820895522388e-07, + "loss": -0.3735, + "step": 392 + }, + { + "epoch": 0.042030029068992183, + "grad_norm": 19.629725497894157, + "learning_rate": 8.400852878464819e-07, + "loss": -0.6826, + "step": 394 + }, + { + "epoch": 0.042243379470357625, + "grad_norm": 13.85103939830437, + "learning_rate": 8.443496801705756e-07, + "loss": -0.2708, + "step": 396 + }, + { + "epoch": 0.042456729871723074, + "grad_norm": 19.710815160612455, + "learning_rate": 8.486140724946694e-07, + "loss": 0.0877, + "step": 398 + }, + { + "epoch": 0.042670080273088516, + "grad_norm": 4.706110486614041, + "learning_rate": 8.528784648187633e-07, + "loss": 0.1916, + "step": 400 + }, + { + "epoch": 0.04288343067445396, + "grad_norm": 12.864617887211866, + "learning_rate": 8.57142857142857e-07, + "loss": 0.0179, + "step": 402 + }, + { + "epoch": 0.0430967810758194, + "grad_norm": 9.560466147390757, + "learning_rate": 8.614072494669509e-07, + "loss": -0.0813, + "step": 404 + }, + { + "epoch": 0.04331013147718484, + "grad_norm": 21.130991357317583, + "learning_rate": 8.656716417910447e-07, + "loss": -1.1384, + "step": 406 + }, + { + "epoch": 0.043523481878550284, + "grad_norm": 19.392285709372985, + "learning_rate": 8.699360341151387e-07, + "loss": 0.1794, + "step": 408 + }, + { + "epoch": 0.043736832279915726, + "grad_norm": 9.266238784116867, + "learning_rate": 8.742004264392324e-07, + "loss": -1.768, + "step": 410 + }, + { + "epoch": 0.04395018268128117, + "grad_norm": 40.16756332202219, + "learning_rate": 8.784648187633262e-07, + "loss": -0.8232, + "step": 412 + }, + { + "epoch": 0.04416353308264661, + "grad_norm": 39.24330802424692, + "learning_rate": 8.827292110874201e-07, + "loss": -1.6178, + "step": 414 + }, + { + "epoch": 0.04437688348401205, + "grad_norm": 11.88082478788392, + "learning_rate": 8.869936034115138e-07, + "loss": -0.3053, + "step": 416 + }, + { + "epoch": 0.044590233885377493, + "grad_norm": 6.357526673930185, + "learning_rate": 8.912579957356077e-07, + "loss": -1.0471, + "step": 418 + }, + { + "epoch": 0.04480358428674294, + "grad_norm": 16.303453044925423, + "learning_rate": 8.955223880597015e-07, + "loss": -0.1262, + "step": 420 + }, + { + "epoch": 0.045016934688108384, + "grad_norm": 14.6518934802552, + "learning_rate": 8.997867803837953e-07, + "loss": -0.2323, + "step": 422 + }, + { + "epoch": 0.045230285089473826, + "grad_norm": 8.02837999022306, + "learning_rate": 9.040511727078891e-07, + "loss": 0.1657, + "step": 424 + }, + { + "epoch": 0.04544363549083927, + "grad_norm": 15.64582877736265, + "learning_rate": 9.083155650319829e-07, + "loss": -0.1882, + "step": 426 + }, + { + "epoch": 0.04565698589220471, + "grad_norm": 16.363233536467853, + "learning_rate": 9.125799573560768e-07, + "loss": -0.4399, + "step": 428 + }, + { + "epoch": 0.04587033629357015, + "grad_norm": 15.548489913061752, + "learning_rate": 9.168443496801705e-07, + "loss": 0.3602, + "step": 430 + }, + { + "epoch": 0.046083686694935594, + "grad_norm": 7.007708536573836, + "learning_rate": 9.211087420042644e-07, + "loss": -0.2646, + "step": 432 + }, + { + "epoch": 0.046297037096301036, + "grad_norm": 13.687256516704583, + "learning_rate": 9.253731343283582e-07, + "loss": -1.0053, + "step": 434 + }, + { + "epoch": 0.04651038749766648, + "grad_norm": 30.306488264414572, + "learning_rate": 9.29637526652452e-07, + "loss": -0.3203, + "step": 436 + }, + { + "epoch": 0.04672373789903192, + "grad_norm": 26.094659474249983, + "learning_rate": 9.339019189765458e-07, + "loss": -0.6359, + "step": 438 + }, + { + "epoch": 0.04693708830039737, + "grad_norm": 13.796812579064204, + "learning_rate": 9.381663113006396e-07, + "loss": -0.3471, + "step": 440 + }, + { + "epoch": 0.04715043870176281, + "grad_norm": 14.69389520446514, + "learning_rate": 9.424307036247334e-07, + "loss": -0.8145, + "step": 442 + }, + { + "epoch": 0.04736378910312825, + "grad_norm": 16.918575972701632, + "learning_rate": 9.466950959488272e-07, + "loss": 0.2185, + "step": 444 + }, + { + "epoch": 0.047577139504493694, + "grad_norm": 10.422480971977198, + "learning_rate": 9.509594882729211e-07, + "loss": -0.4649, + "step": 446 + }, + { + "epoch": 0.047790489905859136, + "grad_norm": 7.03443725132983, + "learning_rate": 9.552238805970149e-07, + "loss": -0.3627, + "step": 448 + }, + { + "epoch": 0.04800384030722458, + "grad_norm": 18.359610314742397, + "learning_rate": 9.594882729211086e-07, + "loss": -0.9404, + "step": 450 + }, + { + "epoch": 0.04821719070859002, + "grad_norm": 8.277072501759134, + "learning_rate": 9.637526652452024e-07, + "loss": 0.2409, + "step": 452 + }, + { + "epoch": 0.04843054110995546, + "grad_norm": 13.071634752922243, + "learning_rate": 9.680170575692964e-07, + "loss": 0.3506, + "step": 454 + }, + { + "epoch": 0.048643891511320904, + "grad_norm": 36.801112890968085, + "learning_rate": 9.722814498933901e-07, + "loss": -0.3961, + "step": 456 + }, + { + "epoch": 0.048857241912686346, + "grad_norm": 40.38991240984631, + "learning_rate": 9.76545842217484e-07, + "loss": -0.3497, + "step": 458 + }, + { + "epoch": 0.04907059231405179, + "grad_norm": 8.414591188760289, + "learning_rate": 9.808102345415777e-07, + "loss": -0.5775, + "step": 460 + }, + { + "epoch": 0.04928394271541724, + "grad_norm": 8.76108225719733, + "learning_rate": 9.850746268656714e-07, + "loss": 0.0839, + "step": 462 + }, + { + "epoch": 0.04949729311678268, + "grad_norm": 9.275399140799998, + "learning_rate": 9.893390191897654e-07, + "loss": 0.0137, + "step": 464 + }, + { + "epoch": 0.04971064351814812, + "grad_norm": 9.832501914571974, + "learning_rate": 9.936034115138592e-07, + "loss": 0.1014, + "step": 466 + }, + { + "epoch": 0.04992399391951356, + "grad_norm": 19.204369086700787, + "learning_rate": 9.97867803837953e-07, + "loss": -0.31, + "step": 468 + }, + { + "epoch": 0.050137344320879004, + "grad_norm": 6.109980991190154, + "learning_rate": 1.0021321961620467e-06, + "loss": -0.26, + "step": 470 + }, + { + "epoch": 0.050350694722244446, + "grad_norm": 31.865612558858906, + "learning_rate": 1.0063965884861407e-06, + "loss": -0.1208, + "step": 472 + }, + { + "epoch": 0.05056404512360989, + "grad_norm": 10.044184014650897, + "learning_rate": 1.0106609808102345e-06, + "loss": -0.2205, + "step": 474 + }, + { + "epoch": 0.05077739552497533, + "grad_norm": 21.875745094199374, + "learning_rate": 1.0149253731343285e-06, + "loss": -0.0812, + "step": 476 + }, + { + "epoch": 0.05099074592634077, + "grad_norm": 11.995975513229988, + "learning_rate": 1.019189765458422e-06, + "loss": -1.7783, + "step": 478 + }, + { + "epoch": 0.051204096327706214, + "grad_norm": 7.696707350691998, + "learning_rate": 1.023454157782516e-06, + "loss": 0.2012, + "step": 480 + }, + { + "epoch": 0.051417446729071656, + "grad_norm": 72.33916217166278, + "learning_rate": 1.0277185501066098e-06, + "loss": 0.0427, + "step": 482 + }, + { + "epoch": 0.051630797130437105, + "grad_norm": 13.569652394718384, + "learning_rate": 1.0319829424307035e-06, + "loss": 0.0953, + "step": 484 + }, + { + "epoch": 0.05184414753180255, + "grad_norm": 6.640484086255635, + "learning_rate": 1.0362473347547973e-06, + "loss": -1.0771, + "step": 486 + }, + { + "epoch": 0.05205749793316799, + "grad_norm": 16.284392321889953, + "learning_rate": 1.0405117270788913e-06, + "loss": -0.661, + "step": 488 + }, + { + "epoch": 0.05227084833453343, + "grad_norm": 10.396345506952937, + "learning_rate": 1.0447761194029848e-06, + "loss": -0.3095, + "step": 490 + }, + { + "epoch": 0.05248419873589887, + "grad_norm": 12.867700984841619, + "learning_rate": 1.0490405117270788e-06, + "loss": 0.4289, + "step": 492 + }, + { + "epoch": 0.052697549137264314, + "grad_norm": 18.31825168601856, + "learning_rate": 1.0533049040511726e-06, + "loss": 0.1729, + "step": 494 + }, + { + "epoch": 0.052910899538629756, + "grad_norm": 17.248690968213506, + "learning_rate": 1.0575692963752666e-06, + "loss": -1.1607, + "step": 496 + }, + { + "epoch": 0.0531242499399952, + "grad_norm": 8.51928074224459, + "learning_rate": 1.0618336886993601e-06, + "loss": -0.3654, + "step": 498 + }, + { + "epoch": 0.05333760034136064, + "grad_norm": 20.639043194792887, + "learning_rate": 1.066098081023454e-06, + "loss": -0.4893, + "step": 500 + }, + { + "epoch": 0.05355095074272608, + "grad_norm": 14.990753207547238, + "learning_rate": 1.070362473347548e-06, + "loss": -0.468, + "step": 502 + }, + { + "epoch": 0.053764301144091524, + "grad_norm": 24.695774003905925, + "learning_rate": 1.0746268656716416e-06, + "loss": -1.2465, + "step": 504 + }, + { + "epoch": 0.05397765154545697, + "grad_norm": 17.81220746000941, + "learning_rate": 1.0788912579957356e-06, + "loss": -0.232, + "step": 506 + }, + { + "epoch": 0.054191001946822415, + "grad_norm": 7.6840011548352924, + "learning_rate": 1.0831556503198294e-06, + "loss": 0.1394, + "step": 508 + }, + { + "epoch": 0.05440435234818786, + "grad_norm": 17.135280175028917, + "learning_rate": 1.0874200426439234e-06, + "loss": -0.1113, + "step": 510 + }, + { + "epoch": 0.0546177027495533, + "grad_norm": 21.577778290948245, + "learning_rate": 1.091684434968017e-06, + "loss": 0.1389, + "step": 512 + }, + { + "epoch": 0.05483105315091874, + "grad_norm": 26.22719409585662, + "learning_rate": 1.095948827292111e-06, + "loss": -0.1216, + "step": 514 + }, + { + "epoch": 0.05504440355228418, + "grad_norm": 8.763214398721058, + "learning_rate": 1.1002132196162047e-06, + "loss": -0.6157, + "step": 516 + }, + { + "epoch": 0.055257753953649624, + "grad_norm": 14.925448860683707, + "learning_rate": 1.1044776119402984e-06, + "loss": 0.0356, + "step": 518 + }, + { + "epoch": 0.055471104355015066, + "grad_norm": 14.365407282055516, + "learning_rate": 1.1087420042643922e-06, + "loss": -0.0624, + "step": 520 + }, + { + "epoch": 0.05568445475638051, + "grad_norm": 8.802366004297667, + "learning_rate": 1.1130063965884862e-06, + "loss": 0.5658, + "step": 522 + }, + { + "epoch": 0.05589780515774595, + "grad_norm": 24.251203850442998, + "learning_rate": 1.11727078891258e-06, + "loss": -0.0644, + "step": 524 + }, + { + "epoch": 0.0561111555591114, + "grad_norm": 16.632568797956132, + "learning_rate": 1.1215351812366737e-06, + "loss": -1.0315, + "step": 526 + }, + { + "epoch": 0.05632450596047684, + "grad_norm": 15.777059103835, + "learning_rate": 1.1257995735607675e-06, + "loss": 0.6824, + "step": 528 + }, + { + "epoch": 0.05653785636184228, + "grad_norm": 6.521931225844872, + "learning_rate": 1.1300639658848615e-06, + "loss": -0.2547, + "step": 530 + }, + { + "epoch": 0.056751206763207725, + "grad_norm": 16.18907262330339, + "learning_rate": 1.134328358208955e-06, + "loss": -0.714, + "step": 532 + }, + { + "epoch": 0.05696455716457317, + "grad_norm": 17.05716262320914, + "learning_rate": 1.138592750533049e-06, + "loss": -1.1937, + "step": 534 + }, + { + "epoch": 0.05717790756593861, + "grad_norm": 8.560189061728243, + "learning_rate": 1.1428571428571428e-06, + "loss": -0.3703, + "step": 536 + }, + { + "epoch": 0.05739125796730405, + "grad_norm": 11.56228500281614, + "learning_rate": 1.1471215351812368e-06, + "loss": -0.8073, + "step": 538 + }, + { + "epoch": 0.05760460836866949, + "grad_norm": 13.24672543046646, + "learning_rate": 1.1513859275053303e-06, + "loss": -0.3222, + "step": 540 + }, + { + "epoch": 0.057817958770034934, + "grad_norm": 14.035481073008786, + "learning_rate": 1.1556503198294243e-06, + "loss": 0.286, + "step": 542 + }, + { + "epoch": 0.058031309171400376, + "grad_norm": 28.95516126438781, + "learning_rate": 1.159914712153518e-06, + "loss": -1.189, + "step": 544 + }, + { + "epoch": 0.05824465957276582, + "grad_norm": 20.850060598425877, + "learning_rate": 1.1641791044776118e-06, + "loss": -0.6204, + "step": 546 + }, + { + "epoch": 0.05845800997413127, + "grad_norm": 21.211499139014034, + "learning_rate": 1.1684434968017056e-06, + "loss": -0.5428, + "step": 548 + }, + { + "epoch": 0.05867136037549671, + "grad_norm": 16.101447224508938, + "learning_rate": 1.1727078891257996e-06, + "loss": -0.7822, + "step": 550 + }, + { + "epoch": 0.05888471077686215, + "grad_norm": 60.24925771229807, + "learning_rate": 1.1769722814498933e-06, + "loss": -0.5338, + "step": 552 + }, + { + "epoch": 0.05909806117822759, + "grad_norm": 10.911944544305852, + "learning_rate": 1.1812366737739871e-06, + "loss": -0.1546, + "step": 554 + }, + { + "epoch": 0.059311411579593035, + "grad_norm": 20.123496073532923, + "learning_rate": 1.1855010660980809e-06, + "loss": 0.1078, + "step": 556 + }, + { + "epoch": 0.05952476198095848, + "grad_norm": 8.077431875506283, + "learning_rate": 1.1897654584221749e-06, + "loss": 0.6775, + "step": 558 + }, + { + "epoch": 0.05973811238232392, + "grad_norm": 20.69430693950377, + "learning_rate": 1.1940298507462684e-06, + "loss": -0.1531, + "step": 560 + }, + { + "epoch": 0.05995146278368936, + "grad_norm": 9.149229771649699, + "learning_rate": 1.1982942430703624e-06, + "loss": 0.2026, + "step": 562 + }, + { + "epoch": 0.0601648131850548, + "grad_norm": 35.409517307149535, + "learning_rate": 1.2025586353944564e-06, + "loss": 0.2081, + "step": 564 + }, + { + "epoch": 0.060378163586420244, + "grad_norm": 18.732961050530974, + "learning_rate": 1.2068230277185501e-06, + "loss": -0.4564, + "step": 566 + }, + { + "epoch": 0.060591513987785686, + "grad_norm": 18.17521906003365, + "learning_rate": 1.211087420042644e-06, + "loss": -0.4694, + "step": 568 + }, + { + "epoch": 0.060804864389151135, + "grad_norm": 15.950240307469345, + "learning_rate": 1.2153518123667377e-06, + "loss": -0.7607, + "step": 570 + }, + { + "epoch": 0.06101821479051658, + "grad_norm": 27.667171128676177, + "learning_rate": 1.2196162046908317e-06, + "loss": -0.4962, + "step": 572 + }, + { + "epoch": 0.06123156519188202, + "grad_norm": 6.875981558346762, + "learning_rate": 1.2238805970149252e-06, + "loss": 0.1071, + "step": 574 + }, + { + "epoch": 0.06144491559324746, + "grad_norm": 23.330801092034456, + "learning_rate": 1.2281449893390192e-06, + "loss": -1.1081, + "step": 576 + }, + { + "epoch": 0.0616582659946129, + "grad_norm": 18.097087985455573, + "learning_rate": 1.232409381663113e-06, + "loss": 0.292, + "step": 578 + }, + { + "epoch": 0.061871616395978345, + "grad_norm": 44.96075507661794, + "learning_rate": 1.236673773987207e-06, + "loss": -1.8344, + "step": 580 + }, + { + "epoch": 0.06208496679734379, + "grad_norm": 11.33849063045387, + "learning_rate": 1.2409381663113005e-06, + "loss": -0.5339, + "step": 582 + }, + { + "epoch": 0.06229831719870923, + "grad_norm": 25.655929914012017, + "learning_rate": 1.2452025586353945e-06, + "loss": 0.2773, + "step": 584 + }, + { + "epoch": 0.06251166760007468, + "grad_norm": 19.518509371933693, + "learning_rate": 1.2494669509594882e-06, + "loss": -0.3635, + "step": 586 + }, + { + "epoch": 0.06272501800144012, + "grad_norm": 19.337178836658445, + "learning_rate": 1.253731343283582e-06, + "loss": -0.4397, + "step": 588 + }, + { + "epoch": 0.06293836840280556, + "grad_norm": 15.629253302757428, + "learning_rate": 1.2579957356076758e-06, + "loss": -0.3098, + "step": 590 + }, + { + "epoch": 0.063151718804171, + "grad_norm": 35.95259160255183, + "learning_rate": 1.2622601279317698e-06, + "loss": -0.9708, + "step": 592 + }, + { + "epoch": 0.06336506920553645, + "grad_norm": 27.574715730226895, + "learning_rate": 1.2665245202558633e-06, + "loss": -0.5131, + "step": 594 + }, + { + "epoch": 0.06357841960690189, + "grad_norm": 47.74900017442824, + "learning_rate": 1.2707889125799573e-06, + "loss": 0.3332, + "step": 596 + }, + { + "epoch": 0.06379177000826733, + "grad_norm": 20.309365479498744, + "learning_rate": 1.275053304904051e-06, + "loss": -1.2309, + "step": 598 + }, + { + "epoch": 0.06400512040963277, + "grad_norm": 11.40208623029672, + "learning_rate": 1.279317697228145e-06, + "loss": -0.5857, + "step": 600 + }, + { + "epoch": 0.06421847081099821, + "grad_norm": 11.402181219311933, + "learning_rate": 1.2835820895522386e-06, + "loss": -0.3416, + "step": 602 + }, + { + "epoch": 0.06443182121236365, + "grad_norm": 15.010371839635921, + "learning_rate": 1.2878464818763326e-06, + "loss": -0.4071, + "step": 604 + }, + { + "epoch": 0.0646451716137291, + "grad_norm": 9.99633951597125, + "learning_rate": 1.2921108742004264e-06, + "loss": -1.1453, + "step": 606 + }, + { + "epoch": 0.06485852201509454, + "grad_norm": 21.31739921019411, + "learning_rate": 1.2963752665245203e-06, + "loss": 0.3107, + "step": 608 + }, + { + "epoch": 0.06507187241645998, + "grad_norm": 16.7015124046411, + "learning_rate": 1.3006396588486139e-06, + "loss": -0.9792, + "step": 610 + }, + { + "epoch": 0.06528522281782542, + "grad_norm": 17.983069493288237, + "learning_rate": 1.3049040511727079e-06, + "loss": -0.3921, + "step": 612 + }, + { + "epoch": 0.06549857321919086, + "grad_norm": 10.553594812872817, + "learning_rate": 1.3091684434968016e-06, + "loss": -1.2914, + "step": 614 + }, + { + "epoch": 0.0657119236205563, + "grad_norm": 14.041836178013272, + "learning_rate": 1.3134328358208954e-06, + "loss": -0.3353, + "step": 616 + }, + { + "epoch": 0.06592527402192175, + "grad_norm": 15.639430537257736, + "learning_rate": 1.3176972281449892e-06, + "loss": -0.4705, + "step": 618 + }, + { + "epoch": 0.06613862442328719, + "grad_norm": 12.786525687977138, + "learning_rate": 1.3219616204690832e-06, + "loss": -0.0215, + "step": 620 + }, + { + "epoch": 0.06635197482465263, + "grad_norm": 9.934313047441512, + "learning_rate": 1.3262260127931767e-06, + "loss": 0.6287, + "step": 622 + }, + { + "epoch": 0.06656532522601809, + "grad_norm": 9.779454896556423, + "learning_rate": 1.3304904051172707e-06, + "loss": -0.0751, + "step": 624 + }, + { + "epoch": 0.06677867562738353, + "grad_norm": 9.494698563936842, + "learning_rate": 1.3347547974413647e-06, + "loss": 0.1564, + "step": 626 + }, + { + "epoch": 0.06699202602874897, + "grad_norm": 10.740511386568414, + "learning_rate": 1.3390191897654584e-06, + "loss": -0.7072, + "step": 628 + }, + { + "epoch": 0.06720537643011441, + "grad_norm": 5.704581178120186, + "learning_rate": 1.3432835820895522e-06, + "loss": -0.6728, + "step": 630 + }, + { + "epoch": 0.06741872683147986, + "grad_norm": 22.298716006252704, + "learning_rate": 1.347547974413646e-06, + "loss": -0.7213, + "step": 632 + }, + { + "epoch": 0.0676320772328453, + "grad_norm": 15.330297898109906, + "learning_rate": 1.35181236673774e-06, + "loss": 0.2685, + "step": 634 + }, + { + "epoch": 0.06784542763421074, + "grad_norm": 22.69580065155436, + "learning_rate": 1.3560767590618335e-06, + "loss": 0.3992, + "step": 636 + }, + { + "epoch": 0.06805877803557618, + "grad_norm": 28.833859562335263, + "learning_rate": 1.3603411513859275e-06, + "loss": -0.7339, + "step": 638 + }, + { + "epoch": 0.06827212843694162, + "grad_norm": 11.283562143626995, + "learning_rate": 1.3646055437100213e-06, + "loss": -0.7638, + "step": 640 + }, + { + "epoch": 0.06848547883830707, + "grad_norm": 10.725377082827144, + "learning_rate": 1.3688699360341152e-06, + "loss": -0.8662, + "step": 642 + }, + { + "epoch": 0.06869882923967251, + "grad_norm": 26.31029696416489, + "learning_rate": 1.3731343283582088e-06, + "loss": -0.7486, + "step": 644 + }, + { + "epoch": 0.06891217964103795, + "grad_norm": 18.040887121015924, + "learning_rate": 1.3773987206823028e-06, + "loss": 0.3118, + "step": 646 + }, + { + "epoch": 0.06912553004240339, + "grad_norm": 10.96637271492357, + "learning_rate": 1.3816631130063965e-06, + "loss": -0.5209, + "step": 648 + }, + { + "epoch": 0.06933888044376883, + "grad_norm": 8.06597559224624, + "learning_rate": 1.3859275053304903e-06, + "loss": -0.4205, + "step": 650 + }, + { + "epoch": 0.06955223084513427, + "grad_norm": 17.471203906083133, + "learning_rate": 1.390191897654584e-06, + "loss": -0.5424, + "step": 652 + }, + { + "epoch": 0.06976558124649972, + "grad_norm": 11.059775225202081, + "learning_rate": 1.394456289978678e-06, + "loss": -1.2152, + "step": 654 + }, + { + "epoch": 0.06997893164786516, + "grad_norm": 8.31714286189907, + "learning_rate": 1.3987206823027718e-06, + "loss": -0.811, + "step": 656 + }, + { + "epoch": 0.0701922820492306, + "grad_norm": 28.554761860008615, + "learning_rate": 1.4029850746268656e-06, + "loss": -1.0846, + "step": 658 + }, + { + "epoch": 0.07040563245059604, + "grad_norm": 16.176945715631096, + "learning_rate": 1.4072494669509594e-06, + "loss": -0.5155, + "step": 660 + }, + { + "epoch": 0.07061898285196148, + "grad_norm": 21.625404385138676, + "learning_rate": 1.4115138592750533e-06, + "loss": -0.9076, + "step": 662 + }, + { + "epoch": 0.07083233325332693, + "grad_norm": 5.4202622935954405, + "learning_rate": 1.415778251599147e-06, + "loss": 0.2342, + "step": 664 + }, + { + "epoch": 0.07104568365469238, + "grad_norm": 17.194951603060776, + "learning_rate": 1.4200426439232409e-06, + "loss": 0.0548, + "step": 666 + }, + { + "epoch": 0.07125903405605782, + "grad_norm": 11.512096288311646, + "learning_rate": 1.4243070362473346e-06, + "loss": -0.886, + "step": 668 + }, + { + "epoch": 0.07147238445742327, + "grad_norm": 16.05900514880833, + "learning_rate": 1.4285714285714286e-06, + "loss": -0.1307, + "step": 670 + }, + { + "epoch": 0.07168573485878871, + "grad_norm": 9.185127296516999, + "learning_rate": 1.4328358208955222e-06, + "loss": -0.1077, + "step": 672 + }, + { + "epoch": 0.07189908526015415, + "grad_norm": 7.334484358389449, + "learning_rate": 1.4371002132196162e-06, + "loss": 0.1615, + "step": 674 + }, + { + "epoch": 0.07211243566151959, + "grad_norm": 18.964205902532676, + "learning_rate": 1.44136460554371e-06, + "loss": -1.4788, + "step": 676 + }, + { + "epoch": 0.07232578606288503, + "grad_norm": 17.11189372092431, + "learning_rate": 1.4456289978678037e-06, + "loss": 0.0577, + "step": 678 + }, + { + "epoch": 0.07253913646425048, + "grad_norm": 10.275590248100219, + "learning_rate": 1.4498933901918975e-06, + "loss": -0.0749, + "step": 680 + }, + { + "epoch": 0.07275248686561592, + "grad_norm": 25.814533145099567, + "learning_rate": 1.4541577825159914e-06, + "loss": 0.4326, + "step": 682 + }, + { + "epoch": 0.07296583726698136, + "grad_norm": 6.539318927578008, + "learning_rate": 1.4584221748400852e-06, + "loss": 0.0608, + "step": 684 + }, + { + "epoch": 0.0731791876683468, + "grad_norm": 27.253684296985305, + "learning_rate": 1.462686567164179e-06, + "loss": 0.1022, + "step": 686 + }, + { + "epoch": 0.07339253806971224, + "grad_norm": 12.415981292555502, + "learning_rate": 1.466950959488273e-06, + "loss": -0.3008, + "step": 688 + }, + { + "epoch": 0.07360588847107769, + "grad_norm": 21.077155825145116, + "learning_rate": 1.4712153518123667e-06, + "loss": -0.4936, + "step": 690 + }, + { + "epoch": 0.07381923887244313, + "grad_norm": 14.246036421919664, + "learning_rate": 1.4754797441364605e-06, + "loss": 0.0596, + "step": 692 + }, + { + "epoch": 0.07403258927380857, + "grad_norm": 15.479313497531585, + "learning_rate": 1.4797441364605543e-06, + "loss": -0.0542, + "step": 694 + }, + { + "epoch": 0.07424593967517401, + "grad_norm": 21.800805681217618, + "learning_rate": 1.4840085287846482e-06, + "loss": 0.8143, + "step": 696 + }, + { + "epoch": 0.07445929007653945, + "grad_norm": 12.447341265262597, + "learning_rate": 1.488272921108742e-06, + "loss": 0.2514, + "step": 698 + }, + { + "epoch": 0.0746726404779049, + "grad_norm": 15.457895001240969, + "learning_rate": 1.4925373134328358e-06, + "loss": -0.1873, + "step": 700 + }, + { + "epoch": 0.07488599087927034, + "grad_norm": 9.103159220217133, + "learning_rate": 1.4968017057569296e-06, + "loss": -0.1545, + "step": 702 + }, + { + "epoch": 0.07509934128063578, + "grad_norm": 13.892293596081212, + "learning_rate": 1.5010660980810235e-06, + "loss": -1.0031, + "step": 704 + }, + { + "epoch": 0.07531269168200122, + "grad_norm": 4.018531204967449, + "learning_rate": 1.505330490405117e-06, + "loss": -0.5123, + "step": 706 + }, + { + "epoch": 0.07552604208336668, + "grad_norm": 9.26403570634237, + "learning_rate": 1.509594882729211e-06, + "loss": -0.009, + "step": 708 + }, + { + "epoch": 0.07573939248473212, + "grad_norm": 23.108096955629904, + "learning_rate": 1.5138592750533048e-06, + "loss": 0.0175, + "step": 710 + }, + { + "epoch": 0.07595274288609756, + "grad_norm": 28.54965038619342, + "learning_rate": 1.5181236673773988e-06, + "loss": -0.4034, + "step": 712 + }, + { + "epoch": 0.076166093287463, + "grad_norm": 36.93993798487583, + "learning_rate": 1.5223880597014924e-06, + "loss": -1.3652, + "step": 714 + }, + { + "epoch": 0.07637944368882844, + "grad_norm": 9.796656002782315, + "learning_rate": 1.5266524520255864e-06, + "loss": 0.5496, + "step": 716 + }, + { + "epoch": 0.07659279409019389, + "grad_norm": 10.912135520925375, + "learning_rate": 1.5309168443496801e-06, + "loss": -0.3247, + "step": 718 + }, + { + "epoch": 0.07680614449155933, + "grad_norm": 14.84305599240473, + "learning_rate": 1.5351812366737739e-06, + "loss": -1.3197, + "step": 720 + }, + { + "epoch": 0.07701949489292477, + "grad_norm": 25.835978612302938, + "learning_rate": 1.5394456289978677e-06, + "loss": -0.2446, + "step": 722 + }, + { + "epoch": 0.07723284529429021, + "grad_norm": 7.4900190590260545, + "learning_rate": 1.5437100213219616e-06, + "loss": -0.6921, + "step": 724 + }, + { + "epoch": 0.07744619569565565, + "grad_norm": 12.73580854140001, + "learning_rate": 1.5479744136460552e-06, + "loss": 0.1706, + "step": 726 + }, + { + "epoch": 0.0776595460970211, + "grad_norm": 10.90211860054164, + "learning_rate": 1.5522388059701492e-06, + "loss": 0.0817, + "step": 728 + }, + { + "epoch": 0.07787289649838654, + "grad_norm": 6.002849805432761, + "learning_rate": 1.556503198294243e-06, + "loss": 0.0037, + "step": 730 + }, + { + "epoch": 0.07808624689975198, + "grad_norm": 15.988913842632046, + "learning_rate": 1.560767590618337e-06, + "loss": 0.3994, + "step": 732 + }, + { + "epoch": 0.07829959730111742, + "grad_norm": 21.834364945681322, + "learning_rate": 1.5650319829424305e-06, + "loss": -0.4599, + "step": 734 + }, + { + "epoch": 0.07851294770248286, + "grad_norm": 7.432963689652445, + "learning_rate": 1.5692963752665245e-06, + "loss": -0.3263, + "step": 736 + }, + { + "epoch": 0.0787262981038483, + "grad_norm": 16.20259948956077, + "learning_rate": 1.5735607675906182e-06, + "loss": -0.1557, + "step": 738 + }, + { + "epoch": 0.07893964850521375, + "grad_norm": 30.282091888880974, + "learning_rate": 1.5778251599147122e-06, + "loss": -1.2297, + "step": 740 + }, + { + "epoch": 0.07915299890657919, + "grad_norm": 18.86228833994148, + "learning_rate": 1.5820895522388058e-06, + "loss": -0.1715, + "step": 742 + }, + { + "epoch": 0.07936634930794463, + "grad_norm": 26.911000632499032, + "learning_rate": 1.5863539445628997e-06, + "loss": -0.2761, + "step": 744 + }, + { + "epoch": 0.07957969970931007, + "grad_norm": 55.67332241793552, + "learning_rate": 1.5906183368869935e-06, + "loss": -0.1245, + "step": 746 + }, + { + "epoch": 0.07979305011067551, + "grad_norm": 11.64128268726896, + "learning_rate": 1.5948827292110873e-06, + "loss": 0.2172, + "step": 748 + }, + { + "epoch": 0.08000640051204096, + "grad_norm": 7.141781123388319, + "learning_rate": 1.599147121535181e-06, + "loss": -1.0433, + "step": 750 + }, + { + "epoch": 0.08021975091340641, + "grad_norm": 14.355586850326564, + "learning_rate": 1.603411513859275e-06, + "loss": 0.7587, + "step": 752 + }, + { + "epoch": 0.08043310131477185, + "grad_norm": 21.229967868951906, + "learning_rate": 1.607675906183369e-06, + "loss": -1.1211, + "step": 754 + }, + { + "epoch": 0.0806464517161373, + "grad_norm": 11.519887266492887, + "learning_rate": 1.6119402985074626e-06, + "loss": 0.589, + "step": 756 + }, + { + "epoch": 0.08085980211750274, + "grad_norm": 14.339130440951164, + "learning_rate": 1.6162046908315565e-06, + "loss": -0.7255, + "step": 758 + }, + { + "epoch": 0.08107315251886818, + "grad_norm": 14.27705846557268, + "learning_rate": 1.6204690831556503e-06, + "loss": -1.539, + "step": 760 + }, + { + "epoch": 0.08128650292023362, + "grad_norm": 17.0155387877924, + "learning_rate": 1.624733475479744e-06, + "loss": -0.366, + "step": 762 + }, + { + "epoch": 0.08149985332159906, + "grad_norm": 7.661991050372059, + "learning_rate": 1.6289978678038378e-06, + "loss": -0.0189, + "step": 764 + }, + { + "epoch": 0.0817132037229645, + "grad_norm": 8.979485378676086, + "learning_rate": 1.6332622601279318e-06, + "loss": -1.96, + "step": 766 + }, + { + "epoch": 0.08192655412432995, + "grad_norm": 5.161703747401364, + "learning_rate": 1.6375266524520254e-06, + "loss": -0.5528, + "step": 768 + }, + { + "epoch": 0.08213990452569539, + "grad_norm": 8.76501482963132, + "learning_rate": 1.6417910447761194e-06, + "loss": -0.0766, + "step": 770 + }, + { + "epoch": 0.08235325492706083, + "grad_norm": 16.69504549029163, + "learning_rate": 1.6460554371002131e-06, + "loss": 0.3121, + "step": 772 + }, + { + "epoch": 0.08256660532842627, + "grad_norm": 17.43852691547062, + "learning_rate": 1.6503198294243071e-06, + "loss": -0.6262, + "step": 774 + }, + { + "epoch": 0.08277995572979172, + "grad_norm": 7.8082474899976075, + "learning_rate": 1.6545842217484007e-06, + "loss": 0.4352, + "step": 776 + }, + { + "epoch": 0.08299330613115716, + "grad_norm": 23.61906025471836, + "learning_rate": 1.6588486140724946e-06, + "loss": -0.586, + "step": 778 + }, + { + "epoch": 0.0832066565325226, + "grad_norm": 16.906426462089843, + "learning_rate": 1.6631130063965884e-06, + "loss": 0.1607, + "step": 780 + }, + { + "epoch": 0.08342000693388804, + "grad_norm": 12.849146673239158, + "learning_rate": 1.6673773987206822e-06, + "loss": -0.4005, + "step": 782 + }, + { + "epoch": 0.08363335733525348, + "grad_norm": 15.52998781884974, + "learning_rate": 1.671641791044776e-06, + "loss": -0.2504, + "step": 784 + }, + { + "epoch": 0.08384670773661893, + "grad_norm": 7.223884938731002, + "learning_rate": 1.67590618336887e-06, + "loss": -0.9925, + "step": 786 + }, + { + "epoch": 0.08406005813798437, + "grad_norm": 4.8718682784669145, + "learning_rate": 1.6801705756929637e-06, + "loss": -0.3866, + "step": 788 + }, + { + "epoch": 0.08427340853934981, + "grad_norm": 17.60760255975411, + "learning_rate": 1.6844349680170575e-06, + "loss": -0.9417, + "step": 790 + }, + { + "epoch": 0.08448675894071525, + "grad_norm": 27.714848425193637, + "learning_rate": 1.6886993603411512e-06, + "loss": 0.3963, + "step": 792 + }, + { + "epoch": 0.0847001093420807, + "grad_norm": 35.493014709513936, + "learning_rate": 1.6929637526652452e-06, + "loss": -0.4515, + "step": 794 + }, + { + "epoch": 0.08491345974344615, + "grad_norm": 21.400152124030786, + "learning_rate": 1.6972281449893388e-06, + "loss": -0.67, + "step": 796 + }, + { + "epoch": 0.08512681014481159, + "grad_norm": 16.911674814364677, + "learning_rate": 1.7014925373134328e-06, + "loss": -0.1268, + "step": 798 + }, + { + "epoch": 0.08534016054617703, + "grad_norm": 13.009852458798768, + "learning_rate": 1.7057569296375265e-06, + "loss": -0.5053, + "step": 800 + }, + { + "epoch": 0.08555351094754247, + "grad_norm": 16.266588077557774, + "learning_rate": 1.7100213219616205e-06, + "loss": -0.7199, + "step": 802 + }, + { + "epoch": 0.08576686134890792, + "grad_norm": 19.16342878443152, + "learning_rate": 1.714285714285714e-06, + "loss": 0.0409, + "step": 804 + }, + { + "epoch": 0.08598021175027336, + "grad_norm": 19.235257083323877, + "learning_rate": 1.718550106609808e-06, + "loss": -0.3201, + "step": 806 + }, + { + "epoch": 0.0861935621516388, + "grad_norm": 13.755965707142774, + "learning_rate": 1.7228144989339018e-06, + "loss": -0.7414, + "step": 808 + }, + { + "epoch": 0.08640691255300424, + "grad_norm": 9.2611346408095, + "learning_rate": 1.7270788912579956e-06, + "loss": 0.543, + "step": 810 + }, + { + "epoch": 0.08662026295436968, + "grad_norm": 11.293068548525245, + "learning_rate": 1.7313432835820893e-06, + "loss": 0.5024, + "step": 812 + }, + { + "epoch": 0.08683361335573513, + "grad_norm": 8.43885851672634, + "learning_rate": 1.7356076759061833e-06, + "loss": -0.8492, + "step": 814 + }, + { + "epoch": 0.08704696375710057, + "grad_norm": 10.34717338583048, + "learning_rate": 1.7398720682302773e-06, + "loss": -0.8168, + "step": 816 + }, + { + "epoch": 0.08726031415846601, + "grad_norm": 4.346106915743504, + "learning_rate": 1.7441364605543709e-06, + "loss": -0.781, + "step": 818 + }, + { + "epoch": 0.08747366455983145, + "grad_norm": 16.261736385850632, + "learning_rate": 1.7484008528784648e-06, + "loss": 0.3626, + "step": 820 + }, + { + "epoch": 0.0876870149611969, + "grad_norm": 6.451053388102568, + "learning_rate": 1.7526652452025586e-06, + "loss": 0.1613, + "step": 822 + }, + { + "epoch": 0.08790036536256234, + "grad_norm": 13.971393729868783, + "learning_rate": 1.7569296375266524e-06, + "loss": 0.1887, + "step": 824 + }, + { + "epoch": 0.08811371576392778, + "grad_norm": 4.60426711294295, + "learning_rate": 1.7611940298507461e-06, + "loss": -0.0714, + "step": 826 + }, + { + "epoch": 0.08832706616529322, + "grad_norm": 7.788765445569996, + "learning_rate": 1.7654584221748401e-06, + "loss": 0.1057, + "step": 828 + }, + { + "epoch": 0.08854041656665866, + "grad_norm": 16.242118293664294, + "learning_rate": 1.7697228144989339e-06, + "loss": 0.2466, + "step": 830 + }, + { + "epoch": 0.0887537669680241, + "grad_norm": 14.723029688197625, + "learning_rate": 1.7739872068230277e-06, + "loss": -0.6136, + "step": 832 + }, + { + "epoch": 0.08896711736938955, + "grad_norm": 20.958772423479118, + "learning_rate": 1.7782515991471214e-06, + "loss": -1.257, + "step": 834 + }, + { + "epoch": 0.08918046777075499, + "grad_norm": 12.999192030716888, + "learning_rate": 1.7825159914712154e-06, + "loss": 0.7867, + "step": 836 + }, + { + "epoch": 0.08939381817212044, + "grad_norm": 8.749254011513045, + "learning_rate": 1.786780383795309e-06, + "loss": -0.1854, + "step": 838 + }, + { + "epoch": 0.08960716857348588, + "grad_norm": 28.375671216859047, + "learning_rate": 1.791044776119403e-06, + "loss": -1.1224, + "step": 840 + }, + { + "epoch": 0.08982051897485133, + "grad_norm": 11.982995716340508, + "learning_rate": 1.7953091684434967e-06, + "loss": 0.1255, + "step": 842 + }, + { + "epoch": 0.09003386937621677, + "grad_norm": 13.991706133756487, + "learning_rate": 1.7995735607675907e-06, + "loss": -0.8058, + "step": 844 + }, + { + "epoch": 0.09024721977758221, + "grad_norm": 6.919587436813207, + "learning_rate": 1.8038379530916842e-06, + "loss": -0.514, + "step": 846 + }, + { + "epoch": 0.09046057017894765, + "grad_norm": 8.081461324634596, + "learning_rate": 1.8081023454157782e-06, + "loss": -0.4794, + "step": 848 + }, + { + "epoch": 0.0906739205803131, + "grad_norm": 21.839476666247815, + "learning_rate": 1.812366737739872e-06, + "loss": -0.1143, + "step": 850 + }, + { + "epoch": 0.09088727098167854, + "grad_norm": 4.526543841561235, + "learning_rate": 1.8166311300639658e-06, + "loss": -0.2663, + "step": 852 + }, + { + "epoch": 0.09110062138304398, + "grad_norm": 4.910992516997152, + "learning_rate": 1.8208955223880595e-06, + "loss": -0.79, + "step": 854 + }, + { + "epoch": 0.09131397178440942, + "grad_norm": 13.643412575777333, + "learning_rate": 1.8251599147121535e-06, + "loss": 0.3456, + "step": 856 + }, + { + "epoch": 0.09152732218577486, + "grad_norm": 12.207406995564389, + "learning_rate": 1.829424307036247e-06, + "loss": -0.1668, + "step": 858 + }, + { + "epoch": 0.0917406725871403, + "grad_norm": 10.619915846442005, + "learning_rate": 1.833688699360341e-06, + "loss": 0.8485, + "step": 860 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 7.23403783091102, + "learning_rate": 1.8379530916844348e-06, + "loss": 0.0591, + "step": 862 + }, + { + "epoch": 0.09216737338987119, + "grad_norm": 8.49916268019599, + "learning_rate": 1.8422174840085288e-06, + "loss": 0.3622, + "step": 864 + }, + { + "epoch": 0.09238072379123663, + "grad_norm": 17.881493738990475, + "learning_rate": 1.8464818763326224e-06, + "loss": 0.5555, + "step": 866 + }, + { + "epoch": 0.09259407419260207, + "grad_norm": 12.811241804624386, + "learning_rate": 1.8507462686567163e-06, + "loss": 0.1441, + "step": 868 + }, + { + "epoch": 0.09280742459396751, + "grad_norm": 17.08876136574821, + "learning_rate": 1.85501066098081e-06, + "loss": -0.2161, + "step": 870 + }, + { + "epoch": 0.09302077499533296, + "grad_norm": 18.87972131413687, + "learning_rate": 1.859275053304904e-06, + "loss": -0.643, + "step": 872 + }, + { + "epoch": 0.0932341253966984, + "grad_norm": 19.214085243462733, + "learning_rate": 1.8635394456289976e-06, + "loss": -0.3228, + "step": 874 + }, + { + "epoch": 0.09344747579806384, + "grad_norm": 13.107944155833772, + "learning_rate": 1.8678038379530916e-06, + "loss": -0.0718, + "step": 876 + }, + { + "epoch": 0.09366082619942928, + "grad_norm": 11.902725378153237, + "learning_rate": 1.8720682302771856e-06, + "loss": 0.1609, + "step": 878 + }, + { + "epoch": 0.09387417660079474, + "grad_norm": 12.04227842087194, + "learning_rate": 1.8763326226012792e-06, + "loss": 0.542, + "step": 880 + }, + { + "epoch": 0.09408752700216018, + "grad_norm": 10.083129476715843, + "learning_rate": 1.8805970149253731e-06, + "loss": -0.2667, + "step": 882 + }, + { + "epoch": 0.09430087740352562, + "grad_norm": 13.758222232767348, + "learning_rate": 1.884861407249467e-06, + "loss": 0.8839, + "step": 884 + }, + { + "epoch": 0.09451422780489106, + "grad_norm": 26.46812082435916, + "learning_rate": 1.8891257995735609e-06, + "loss": -0.808, + "step": 886 + }, + { + "epoch": 0.0947275782062565, + "grad_norm": 22.4696755994172, + "learning_rate": 1.8933901918976544e-06, + "loss": 0.206, + "step": 888 + }, + { + "epoch": 0.09494092860762195, + "grad_norm": 13.650968128133929, + "learning_rate": 1.8976545842217484e-06, + "loss": -0.2053, + "step": 890 + }, + { + "epoch": 0.09515427900898739, + "grad_norm": 25.295891533832265, + "learning_rate": 1.9019189765458422e-06, + "loss": 0.5393, + "step": 892 + }, + { + "epoch": 0.09536762941035283, + "grad_norm": 4.717180645046515, + "learning_rate": 1.906183368869936e-06, + "loss": -0.1717, + "step": 894 + }, + { + "epoch": 0.09558097981171827, + "grad_norm": 3.980783546053527, + "learning_rate": 1.9104477611940297e-06, + "loss": -0.3342, + "step": 896 + }, + { + "epoch": 0.09579433021308371, + "grad_norm": 17.53966519035405, + "learning_rate": 1.9147121535181237e-06, + "loss": 0.1547, + "step": 898 + }, + { + "epoch": 0.09600768061444916, + "grad_norm": 37.64965817072852, + "learning_rate": 1.9189765458422173e-06, + "loss": -0.1405, + "step": 900 + }, + { + "epoch": 0.0962210310158146, + "grad_norm": 13.214607736783556, + "learning_rate": 1.9232409381663112e-06, + "loss": -0.6144, + "step": 902 + }, + { + "epoch": 0.09643438141718004, + "grad_norm": 13.755458504610731, + "learning_rate": 1.927505330490405e-06, + "loss": -0.5176, + "step": 904 + }, + { + "epoch": 0.09664773181854548, + "grad_norm": 13.160266215659087, + "learning_rate": 1.9317697228144988e-06, + "loss": -0.3805, + "step": 906 + }, + { + "epoch": 0.09686108221991092, + "grad_norm": 8.56427462579027, + "learning_rate": 1.9360341151385928e-06, + "loss": -0.3108, + "step": 908 + }, + { + "epoch": 0.09707443262127637, + "grad_norm": 7.732136866046191, + "learning_rate": 1.9402985074626867e-06, + "loss": -0.7029, + "step": 910 + }, + { + "epoch": 0.09728778302264181, + "grad_norm": 3.674972386186435, + "learning_rate": 1.9445628997867803e-06, + "loss": -0.171, + "step": 912 + }, + { + "epoch": 0.09750113342400725, + "grad_norm": 5.033223647050098, + "learning_rate": 1.9488272921108743e-06, + "loss": -0.3051, + "step": 914 + }, + { + "epoch": 0.09771448382537269, + "grad_norm": 6.185985978437299, + "learning_rate": 1.953091684434968e-06, + "loss": -0.2443, + "step": 916 + }, + { + "epoch": 0.09792783422673813, + "grad_norm": 12.390573810874344, + "learning_rate": 1.957356076759062e-06, + "loss": -0.9419, + "step": 918 + }, + { + "epoch": 0.09814118462810358, + "grad_norm": 33.36024010674656, + "learning_rate": 1.9616204690831554e-06, + "loss": -1.5106, + "step": 920 + }, + { + "epoch": 0.09835453502946902, + "grad_norm": 8.028565562422017, + "learning_rate": 1.9658848614072493e-06, + "loss": -0.0367, + "step": 922 + }, + { + "epoch": 0.09856788543083447, + "grad_norm": 11.244074568741834, + "learning_rate": 1.970149253731343e-06, + "loss": -0.198, + "step": 924 + }, + { + "epoch": 0.09878123583219992, + "grad_norm": 8.155488460294738, + "learning_rate": 1.974413646055437e-06, + "loss": -0.4533, + "step": 926 + }, + { + "epoch": 0.09899458623356536, + "grad_norm": 8.455296272398023, + "learning_rate": 1.978678038379531e-06, + "loss": -0.2075, + "step": 928 + }, + { + "epoch": 0.0992079366349308, + "grad_norm": 19.66763752484763, + "learning_rate": 1.982942430703625e-06, + "loss": -0.8438, + "step": 930 + }, + { + "epoch": 0.09942128703629624, + "grad_norm": 16.941346284552324, + "learning_rate": 1.9872068230277184e-06, + "loss": 0.2668, + "step": 932 + }, + { + "epoch": 0.09963463743766168, + "grad_norm": 30.19526185636887, + "learning_rate": 1.9914712153518124e-06, + "loss": 0.7882, + "step": 934 + }, + { + "epoch": 0.09984798783902712, + "grad_norm": 7.7652033417557185, + "learning_rate": 1.995735607675906e-06, + "loss": -0.5121, + "step": 936 + }, + { + "epoch": 0.10006133824039257, + "grad_norm": 10.002945250026167, + "learning_rate": 2e-06, + "loss": -0.4891, + "step": 938 + }, + { + "epoch": 0.10027468864175801, + "grad_norm": 9.562548994737613, + "learning_rate": 1.999999722631857e-06, + "loss": -0.1357, + "step": 940 + }, + { + "epoch": 0.10048803904312345, + "grad_norm": 9.18438104835198, + "learning_rate": 1.999998890527582e-06, + "loss": -1.313, + "step": 942 + }, + { + "epoch": 0.10070138944448889, + "grad_norm": 19.038858970547153, + "learning_rate": 1.9999975036876364e-06, + "loss": 0.5632, + "step": 944 + }, + { + "epoch": 0.10091473984585433, + "grad_norm": 8.880460982287538, + "learning_rate": 1.9999955621127898e-06, + "loss": -0.1845, + "step": 946 + }, + { + "epoch": 0.10112809024721978, + "grad_norm": 27.328430670871857, + "learning_rate": 1.999993065804119e-06, + "loss": -1.1547, + "step": 948 + }, + { + "epoch": 0.10134144064858522, + "grad_norm": 3.7529838788448946, + "learning_rate": 1.9999900147630093e-06, + "loss": 0.4223, + "step": 950 + }, + { + "epoch": 0.10155479104995066, + "grad_norm": 20.065464957241378, + "learning_rate": 1.9999864089911524e-06, + "loss": -0.6613, + "step": 952 + }, + { + "epoch": 0.1017681414513161, + "grad_norm": 6.44845480619213, + "learning_rate": 1.9999822484905493e-06, + "loss": -0.1856, + "step": 954 + }, + { + "epoch": 0.10198149185268154, + "grad_norm": 15.587441612646101, + "learning_rate": 1.9999775332635073e-06, + "loss": -0.659, + "step": 956 + }, + { + "epoch": 0.10219484225404699, + "grad_norm": 10.95597370533827, + "learning_rate": 1.9999722633126426e-06, + "loss": 0.1653, + "step": 958 + }, + { + "epoch": 0.10240819265541243, + "grad_norm": 8.043054511297433, + "learning_rate": 1.9999664386408786e-06, + "loss": -0.5782, + "step": 960 + }, + { + "epoch": 0.10262154305677787, + "grad_norm": 19.158675843982078, + "learning_rate": 1.9999600592514464e-06, + "loss": -0.2572, + "step": 962 + }, + { + "epoch": 0.10283489345814331, + "grad_norm": 6.444667549256743, + "learning_rate": 1.9999531251478848e-06, + "loss": -0.3405, + "step": 964 + }, + { + "epoch": 0.10304824385950877, + "grad_norm": 59.77200968326433, + "learning_rate": 1.9999456363340406e-06, + "loss": -0.3214, + "step": 966 + }, + { + "epoch": 0.10326159426087421, + "grad_norm": 6.48727133868538, + "learning_rate": 1.999937592814068e-06, + "loss": 0.193, + "step": 968 + }, + { + "epoch": 0.10347494466223965, + "grad_norm": 9.899885529844648, + "learning_rate": 1.999928994592429e-06, + "loss": -0.8417, + "step": 970 + }, + { + "epoch": 0.1036882950636051, + "grad_norm": 10.671301533657493, + "learning_rate": 1.999919841673893e-06, + "loss": 0.6551, + "step": 972 + }, + { + "epoch": 0.10390164546497054, + "grad_norm": 18.54534340015214, + "learning_rate": 1.999910134063538e-06, + "loss": -0.2989, + "step": 974 + }, + { + "epoch": 0.10411499586633598, + "grad_norm": 4.6709829991723835, + "learning_rate": 1.999899871766749e-06, + "loss": 0.0672, + "step": 976 + }, + { + "epoch": 0.10432834626770142, + "grad_norm": 5.48735260106571, + "learning_rate": 1.9998890547892183e-06, + "loss": -0.6261, + "step": 978 + }, + { + "epoch": 0.10454169666906686, + "grad_norm": 16.54236866901039, + "learning_rate": 1.9998776831369476e-06, + "loss": -0.2955, + "step": 980 + }, + { + "epoch": 0.1047550470704323, + "grad_norm": 18.85187898982959, + "learning_rate": 1.9998657568162446e-06, + "loss": 0.1206, + "step": 982 + }, + { + "epoch": 0.10496839747179774, + "grad_norm": 12.054223122838927, + "learning_rate": 1.999853275833725e-06, + "loss": 0.4311, + "step": 984 + }, + { + "epoch": 0.10518174787316319, + "grad_norm": 12.198982090572391, + "learning_rate": 1.9998402401963128e-06, + "loss": -0.442, + "step": 986 + }, + { + "epoch": 0.10539509827452863, + "grad_norm": 37.98649241207374, + "learning_rate": 1.999826649911239e-06, + "loss": -1.3668, + "step": 988 + }, + { + "epoch": 0.10560844867589407, + "grad_norm": 11.038918775818233, + "learning_rate": 1.9998125049860433e-06, + "loss": 0.6653, + "step": 990 + }, + { + "epoch": 0.10582179907725951, + "grad_norm": 6.476149411837476, + "learning_rate": 1.999797805428572e-06, + "loss": 0.2109, + "step": 992 + }, + { + "epoch": 0.10603514947862495, + "grad_norm": 15.406789152771664, + "learning_rate": 1.999782551246979e-06, + "loss": -0.5677, + "step": 994 + }, + { + "epoch": 0.1062484998799904, + "grad_norm": 29.873738526715083, + "learning_rate": 1.999766742449727e-06, + "loss": -1.3612, + "step": 996 + }, + { + "epoch": 0.10646185028135584, + "grad_norm": 12.480796329993506, + "learning_rate": 1.999750379045585e-06, + "loss": -0.3667, + "step": 998 + }, + { + "epoch": 0.10667520068272128, + "grad_norm": 13.76196266074063, + "learning_rate": 1.9997334610436318e-06, + "loss": 0.1408, + "step": 1000 + }, + { + "epoch": 0.10688855108408672, + "grad_norm": 7.342086872809186, + "learning_rate": 1.999715988453251e-06, + "loss": -0.1217, + "step": 1002 + }, + { + "epoch": 0.10710190148545216, + "grad_norm": 24.793407128549426, + "learning_rate": 1.9996979612841357e-06, + "loss": -0.2125, + "step": 1004 + }, + { + "epoch": 0.1073152518868176, + "grad_norm": 12.462548496218666, + "learning_rate": 1.9996793795462862e-06, + "loss": -0.9109, + "step": 1006 + }, + { + "epoch": 0.10752860228818305, + "grad_norm": 22.456483713157958, + "learning_rate": 1.9996602432500107e-06, + "loss": -0.8032, + "step": 1008 + }, + { + "epoch": 0.1077419526895485, + "grad_norm": 8.279610471346707, + "learning_rate": 1.999640552405925e-06, + "loss": 0.5294, + "step": 1010 + }, + { + "epoch": 0.10795530309091395, + "grad_norm": 8.492474727142435, + "learning_rate": 1.9996203070249514e-06, + "loss": 0.345, + "step": 1012 + }, + { + "epoch": 0.10816865349227939, + "grad_norm": 10.855775876716889, + "learning_rate": 1.9995995071183215e-06, + "loss": -0.0323, + "step": 1014 + }, + { + "epoch": 0.10838200389364483, + "grad_norm": 13.044679276302682, + "learning_rate": 1.999578152697574e-06, + "loss": -0.9284, + "step": 1016 + }, + { + "epoch": 0.10859535429501027, + "grad_norm": 13.041568910636132, + "learning_rate": 1.999556243774554e-06, + "loss": 0.0727, + "step": 1018 + }, + { + "epoch": 0.10880870469637571, + "grad_norm": 8.840285871574821, + "learning_rate": 1.9995337803614165e-06, + "loss": -0.8054, + "step": 1020 + }, + { + "epoch": 0.10902205509774116, + "grad_norm": 11.045434971938723, + "learning_rate": 1.999510762470621e-06, + "loss": -0.4123, + "step": 1022 + }, + { + "epoch": 0.1092354054991066, + "grad_norm": 9.987536744093788, + "learning_rate": 1.999487190114938e-06, + "loss": -0.276, + "step": 1024 + }, + { + "epoch": 0.10944875590047204, + "grad_norm": 21.660050733355092, + "learning_rate": 1.9994630633074433e-06, + "loss": 0.0614, + "step": 1026 + }, + { + "epoch": 0.10966210630183748, + "grad_norm": 13.372785558415854, + "learning_rate": 1.9994383820615212e-06, + "loss": -0.5344, + "step": 1028 + }, + { + "epoch": 0.10987545670320292, + "grad_norm": 12.356702814123958, + "learning_rate": 1.9994131463908624e-06, + "loss": -0.1296, + "step": 1030 + }, + { + "epoch": 0.11008880710456836, + "grad_norm": 4.739222690282173, + "learning_rate": 1.999387356309467e-06, + "loss": -0.0715, + "step": 1032 + }, + { + "epoch": 0.1103021575059338, + "grad_norm": 2.353063878699358, + "learning_rate": 1.9993610118316415e-06, + "loss": -0.0108, + "step": 1034 + }, + { + "epoch": 0.11051550790729925, + "grad_norm": 9.018922760973295, + "learning_rate": 1.9993341129719997e-06, + "loss": 0.1682, + "step": 1036 + }, + { + "epoch": 0.11072885830866469, + "grad_norm": 11.347020997940431, + "learning_rate": 1.9993066597454637e-06, + "loss": 0.503, + "step": 1038 + }, + { + "epoch": 0.11094220871003013, + "grad_norm": 6.599406732799808, + "learning_rate": 1.9992786521672633e-06, + "loss": -1.0294, + "step": 1040 + }, + { + "epoch": 0.11115555911139557, + "grad_norm": 10.664927908662923, + "learning_rate": 1.999250090252934e-06, + "loss": -0.0274, + "step": 1042 + }, + { + "epoch": 0.11136890951276102, + "grad_norm": 12.59123280862704, + "learning_rate": 1.9992209740183212e-06, + "loss": -1.3216, + "step": 1044 + }, + { + "epoch": 0.11158225991412646, + "grad_norm": 12.964624626643758, + "learning_rate": 1.9991913034795767e-06, + "loss": 0.4125, + "step": 1046 + }, + { + "epoch": 0.1117956103154919, + "grad_norm": 4.273338653928415, + "learning_rate": 1.9991610786531593e-06, + "loss": -0.4966, + "step": 1048 + }, + { + "epoch": 0.11200896071685734, + "grad_norm": 5.088476515803206, + "learning_rate": 1.999130299555836e-06, + "loss": 0.2971, + "step": 1050 + }, + { + "epoch": 0.1122223111182228, + "grad_norm": 6.974108763676147, + "learning_rate": 1.9990989662046816e-06, + "loss": -0.7651, + "step": 1052 + }, + { + "epoch": 0.11243566151958824, + "grad_norm": 5.249430049582833, + "learning_rate": 1.999067078617077e-06, + "loss": -0.5608, + "step": 1054 + }, + { + "epoch": 0.11264901192095368, + "grad_norm": 15.65538195259475, + "learning_rate": 1.999034636810712e-06, + "loss": -0.5415, + "step": 1056 + }, + { + "epoch": 0.11286236232231912, + "grad_norm": 5.252233548968885, + "learning_rate": 1.999001640803583e-06, + "loss": -0.7343, + "step": 1058 + }, + { + "epoch": 0.11307571272368457, + "grad_norm": 8.691749940228224, + "learning_rate": 1.998968090613994e-06, + "loss": -0.418, + "step": 1060 + }, + { + "epoch": 0.11328906312505001, + "grad_norm": 13.48795251406841, + "learning_rate": 1.998933986260557e-06, + "loss": -0.7516, + "step": 1062 + }, + { + "epoch": 0.11350241352641545, + "grad_norm": 19.397334567221574, + "learning_rate": 1.99889932776219e-06, + "loss": 0.3052, + "step": 1064 + }, + { + "epoch": 0.11371576392778089, + "grad_norm": 13.172606242995984, + "learning_rate": 1.99886411513812e-06, + "loss": -0.4304, + "step": 1066 + }, + { + "epoch": 0.11392911432914633, + "grad_norm": 28.550101024674554, + "learning_rate": 1.9988283484078813e-06, + "loss": 0.2051, + "step": 1068 + }, + { + "epoch": 0.11414246473051178, + "grad_norm": 11.786086088715077, + "learning_rate": 1.9987920275913135e-06, + "loss": 0.5976, + "step": 1070 + }, + { + "epoch": 0.11435581513187722, + "grad_norm": 20.47705125595093, + "learning_rate": 1.9987551527085665e-06, + "loss": -0.6878, + "step": 1072 + }, + { + "epoch": 0.11456916553324266, + "grad_norm": 17.256084624740367, + "learning_rate": 1.9987177237800954e-06, + "loss": -1.1181, + "step": 1074 + }, + { + "epoch": 0.1147825159346081, + "grad_norm": 20.121126793429465, + "learning_rate": 1.9986797408266633e-06, + "loss": -1.1092, + "step": 1076 + }, + { + "epoch": 0.11499586633597354, + "grad_norm": 9.787270565538346, + "learning_rate": 1.998641203869341e-06, + "loss": -0.0358, + "step": 1078 + }, + { + "epoch": 0.11520921673733898, + "grad_norm": 10.072654902116856, + "learning_rate": 1.9986021129295067e-06, + "loss": -1.121, + "step": 1080 + }, + { + "epoch": 0.11542256713870443, + "grad_norm": 26.902121573368476, + "learning_rate": 1.9985624680288445e-06, + "loss": -0.1283, + "step": 1082 + }, + { + "epoch": 0.11563591754006987, + "grad_norm": 6.9266710792537936, + "learning_rate": 1.998522269189348e-06, + "loss": -0.1049, + "step": 1084 + }, + { + "epoch": 0.11584926794143531, + "grad_norm": 11.110711462850878, + "learning_rate": 1.998481516433316e-06, + "loss": -0.51, + "step": 1086 + }, + { + "epoch": 0.11606261834280075, + "grad_norm": 12.887226914453029, + "learning_rate": 1.9984402097833563e-06, + "loss": -0.1358, + "step": 1088 + }, + { + "epoch": 0.1162759687441662, + "grad_norm": 11.597183300597381, + "learning_rate": 1.998398349262383e-06, + "loss": -1.1465, + "step": 1090 + }, + { + "epoch": 0.11648931914553164, + "grad_norm": 9.612007934917326, + "learning_rate": 1.9983559348936175e-06, + "loss": -0.4243, + "step": 1092 + }, + { + "epoch": 0.11670266954689708, + "grad_norm": 7.480442846976291, + "learning_rate": 1.9983129667005884e-06, + "loss": -0.8296, + "step": 1094 + }, + { + "epoch": 0.11691601994826253, + "grad_norm": 11.901507316301505, + "learning_rate": 1.998269444707132e-06, + "loss": 0.2059, + "step": 1096 + }, + { + "epoch": 0.11712937034962798, + "grad_norm": 11.84934594902733, + "learning_rate": 1.9982253689373918e-06, + "loss": -0.7015, + "step": 1098 + }, + { + "epoch": 0.11734272075099342, + "grad_norm": 18.98160723910663, + "learning_rate": 1.9981807394158177e-06, + "loss": 0.3484, + "step": 1100 + }, + { + "epoch": 0.11755607115235886, + "grad_norm": 9.999100561355233, + "learning_rate": 1.9981355561671677e-06, + "loss": -0.0919, + "step": 1102 + }, + { + "epoch": 0.1177694215537243, + "grad_norm": 8.253006466198785, + "learning_rate": 1.9980898192165063e-06, + "loss": -0.3401, + "step": 1104 + }, + { + "epoch": 0.11798277195508974, + "grad_norm": 21.38855116207188, + "learning_rate": 1.998043528589205e-06, + "loss": -0.2649, + "step": 1106 + }, + { + "epoch": 0.11819612235645519, + "grad_norm": 17.581276449720193, + "learning_rate": 1.9979966843109445e-06, + "loss": -0.7882, + "step": 1108 + }, + { + "epoch": 0.11840947275782063, + "grad_norm": 17.881994136606902, + "learning_rate": 1.9979492864077094e-06, + "loss": -0.9395, + "step": 1110 + }, + { + "epoch": 0.11862282315918607, + "grad_norm": 27.450698807590868, + "learning_rate": 1.9979013349057932e-06, + "loss": -1.1812, + "step": 1112 + }, + { + "epoch": 0.11883617356055151, + "grad_norm": 11.85016744188475, + "learning_rate": 1.997852829831797e-06, + "loss": -0.2322, + "step": 1114 + }, + { + "epoch": 0.11904952396191695, + "grad_norm": 19.57162822354579, + "learning_rate": 1.997803771212629e-06, + "loss": -0.2706, + "step": 1116 + }, + { + "epoch": 0.1192628743632824, + "grad_norm": 10.830735208685383, + "learning_rate": 1.997754159075502e-06, + "loss": -1.1194, + "step": 1118 + }, + { + "epoch": 0.11947622476464784, + "grad_norm": 4.730754320481347, + "learning_rate": 1.9977039934479385e-06, + "loss": -0.7898, + "step": 1120 + }, + { + "epoch": 0.11968957516601328, + "grad_norm": 8.745657757788235, + "learning_rate": 1.9976532743577673e-06, + "loss": 0.0379, + "step": 1122 + }, + { + "epoch": 0.11990292556737872, + "grad_norm": 28.813446860024147, + "learning_rate": 1.9976020018331243e-06, + "loss": 0.3645, + "step": 1124 + }, + { + "epoch": 0.12011627596874416, + "grad_norm": 10.440814030501423, + "learning_rate": 1.9975501759024517e-06, + "loss": -0.0548, + "step": 1126 + }, + { + "epoch": 0.1203296263701096, + "grad_norm": 11.416071472081667, + "learning_rate": 1.9974977965945e-06, + "loss": 0.2929, + "step": 1128 + }, + { + "epoch": 0.12054297677147505, + "grad_norm": 8.441152040615334, + "learning_rate": 1.9974448639383244e-06, + "loss": -0.6188, + "step": 1130 + }, + { + "epoch": 0.12075632717284049, + "grad_norm": 5.663303106994784, + "learning_rate": 1.9973913779632904e-06, + "loss": -0.4657, + "step": 1132 + }, + { + "epoch": 0.12096967757420593, + "grad_norm": 18.35892778992275, + "learning_rate": 1.9973373386990674e-06, + "loss": 0.2044, + "step": 1134 + }, + { + "epoch": 0.12118302797557137, + "grad_norm": 23.684412769651576, + "learning_rate": 1.9972827461756334e-06, + "loss": -0.4693, + "step": 1136 + }, + { + "epoch": 0.12139637837693683, + "grad_norm": 14.171496402604255, + "learning_rate": 1.997227600423273e-06, + "loss": -0.6943, + "step": 1138 + }, + { + "epoch": 0.12160972877830227, + "grad_norm": 13.922460920458215, + "learning_rate": 1.9971719014725768e-06, + "loss": 0.8172, + "step": 1140 + }, + { + "epoch": 0.12182307917966771, + "grad_norm": 21.72636134013763, + "learning_rate": 1.9971156493544437e-06, + "loss": -0.4408, + "step": 1142 + }, + { + "epoch": 0.12203642958103315, + "grad_norm": 20.82069946728396, + "learning_rate": 1.9970588441000787e-06, + "loss": -0.3291, + "step": 1144 + }, + { + "epoch": 0.1222497799823986, + "grad_norm": 15.957243371369229, + "learning_rate": 1.9970014857409936e-06, + "loss": -0.3534, + "step": 1146 + }, + { + "epoch": 0.12246313038376404, + "grad_norm": 13.228394989772614, + "learning_rate": 1.9969435743090076e-06, + "loss": -0.4047, + "step": 1148 + }, + { + "epoch": 0.12267648078512948, + "grad_norm": 9.336630176087915, + "learning_rate": 1.9968851098362455e-06, + "loss": -0.2281, + "step": 1150 + }, + { + "epoch": 0.12288983118649492, + "grad_norm": 21.372600065820446, + "learning_rate": 1.9968260923551405e-06, + "loss": -0.1527, + "step": 1152 + }, + { + "epoch": 0.12310318158786036, + "grad_norm": 10.49080827854095, + "learning_rate": 1.9967665218984306e-06, + "loss": 0.2273, + "step": 1154 + }, + { + "epoch": 0.1233165319892258, + "grad_norm": 26.54581332312223, + "learning_rate": 1.996706398499163e-06, + "loss": -0.0176, + "step": 1156 + }, + { + "epoch": 0.12352988239059125, + "grad_norm": 10.60684954766727, + "learning_rate": 1.9966457221906893e-06, + "loss": -0.7239, + "step": 1158 + }, + { + "epoch": 0.12374323279195669, + "grad_norm": 18.45210289211234, + "learning_rate": 1.9965844930066696e-06, + "loss": -0.4982, + "step": 1160 + }, + { + "epoch": 0.12395658319332213, + "grad_norm": 20.290379711344247, + "learning_rate": 1.9965227109810694e-06, + "loss": -0.1957, + "step": 1162 + }, + { + "epoch": 0.12416993359468757, + "grad_norm": 10.365692652900515, + "learning_rate": 1.9964603761481623e-06, + "loss": 0.3669, + "step": 1164 + }, + { + "epoch": 0.12438328399605302, + "grad_norm": 13.125311380390214, + "learning_rate": 1.996397488542526e-06, + "loss": -0.3221, + "step": 1166 + }, + { + "epoch": 0.12459663439741846, + "grad_norm": 14.361058641237294, + "learning_rate": 1.9963340481990486e-06, + "loss": 0.1227, + "step": 1168 + }, + { + "epoch": 0.1248099847987839, + "grad_norm": 7.056112272117484, + "learning_rate": 1.9962700551529214e-06, + "loss": 0.0827, + "step": 1170 + }, + { + "epoch": 0.12502333520014935, + "grad_norm": 20.11483934762056, + "learning_rate": 1.996205509439644e-06, + "loss": -1.1759, + "step": 1172 + }, + { + "epoch": 0.1252366856015148, + "grad_norm": 9.441979966620615, + "learning_rate": 1.996140411095022e-06, + "loss": -0.2787, + "step": 1174 + }, + { + "epoch": 0.12545003600288024, + "grad_norm": 6.0147967546848395, + "learning_rate": 1.9960747601551686e-06, + "loss": -0.791, + "step": 1176 + }, + { + "epoch": 0.12566338640424568, + "grad_norm": 27.231381943588612, + "learning_rate": 1.9960085566565015e-06, + "loss": -0.3671, + "step": 1178 + }, + { + "epoch": 0.12587673680561112, + "grad_norm": 9.81504324283188, + "learning_rate": 1.9959418006357476e-06, + "loss": 0.398, + "step": 1180 + }, + { + "epoch": 0.12609008720697656, + "grad_norm": 9.244327583663429, + "learning_rate": 1.995874492129938e-06, + "loss": -0.4957, + "step": 1182 + }, + { + "epoch": 0.126303437608342, + "grad_norm": 12.948378749756142, + "learning_rate": 1.995806631176411e-06, + "loss": -0.6401, + "step": 1184 + }, + { + "epoch": 0.12651678800970745, + "grad_norm": 14.684805388151638, + "learning_rate": 1.9957382178128122e-06, + "loss": -0.7437, + "step": 1186 + }, + { + "epoch": 0.1267301384110729, + "grad_norm": 12.31300809828398, + "learning_rate": 1.995669252077093e-06, + "loss": 0.1044, + "step": 1188 + }, + { + "epoch": 0.12694348881243833, + "grad_norm": 11.76491596226188, + "learning_rate": 1.9955997340075107e-06, + "loss": -0.4917, + "step": 1190 + }, + { + "epoch": 0.12715683921380377, + "grad_norm": 10.89131589218926, + "learning_rate": 1.9955296636426294e-06, + "loss": -0.0883, + "step": 1192 + }, + { + "epoch": 0.12737018961516922, + "grad_norm": 7.320449762607573, + "learning_rate": 1.9954590410213204e-06, + "loss": -0.4345, + "step": 1194 + }, + { + "epoch": 0.12758354001653466, + "grad_norm": 7.738748707298395, + "learning_rate": 1.9953878661827603e-06, + "loss": -0.1405, + "step": 1196 + }, + { + "epoch": 0.1277968904179001, + "grad_norm": 29.041058801442883, + "learning_rate": 1.9953161391664314e-06, + "loss": -0.0113, + "step": 1198 + }, + { + "epoch": 0.12801024081926554, + "grad_norm": 18.64878794716449, + "learning_rate": 1.9952438600121247e-06, + "loss": -0.3338, + "step": 1200 + } + ], + "logging_steps": 2, + "max_steps": 9374, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 600, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 102272174161920.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}