{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 389, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002570694087403599, "grad_norm": 0.0, "learning_rate": 1.6666666666666667e-06, "loss": 1.7749, "step": 1 }, { "epoch": 0.005141388174807198, "grad_norm": 0.0, "learning_rate": 3.3333333333333333e-06, "loss": 1.7549, "step": 2 }, { "epoch": 0.007712082262210797, "grad_norm": 0.0, "learning_rate": 5e-06, "loss": 1.7659, "step": 3 }, { "epoch": 0.010282776349614395, "grad_norm": 0.0, "learning_rate": 6.666666666666667e-06, "loss": 1.7842, "step": 4 }, { "epoch": 0.012853470437017995, "grad_norm": 0.0, "learning_rate": 8.333333333333334e-06, "loss": 1.8015, "step": 5 }, { "epoch": 0.015424164524421594, "grad_norm": 0.0, "learning_rate": 1e-05, "loss": 1.7534, "step": 6 }, { "epoch": 0.017994858611825194, "grad_norm": 0.0, "learning_rate": 1.1666666666666668e-05, "loss": 1.7747, "step": 7 }, { "epoch": 0.02056555269922879, "grad_norm": 0.0, "learning_rate": 1.3333333333333333e-05, "loss": 1.7783, "step": 8 }, { "epoch": 0.02313624678663239, "grad_norm": 0.0, "learning_rate": 1.5000000000000002e-05, "loss": 1.803, "step": 9 }, { "epoch": 0.02570694087403599, "grad_norm": 0.0, "learning_rate": 1.6666666666666667e-05, "loss": 1.7859, "step": 10 }, { "epoch": 0.028277634961439587, "grad_norm": 0.0, "learning_rate": 1.8333333333333333e-05, "loss": 1.7856, "step": 11 }, { "epoch": 0.030848329048843187, "grad_norm": 0.0, "learning_rate": 2e-05, "loss": 1.7825, "step": 12 }, { "epoch": 0.033419023136246784, "grad_norm": 0.0, "learning_rate": 1.9999652796146877e-05, "loss": 1.8127, "step": 13 }, { "epoch": 0.03598971722365039, "grad_norm": 0.0, "learning_rate": 1.9998611208697607e-05, "loss": 1.8115, "step": 14 }, { "epoch": 0.038560411311053984, "grad_norm": 0.0, "learning_rate": 1.9996875309980824e-05, "loss": 1.7544, "step": 15 }, { "epoch": 0.04113110539845758, "grad_norm": 0.0, "learning_rate": 1.9994445220538678e-05, "loss": 1.7886, "step": 16 }, { "epoch": 0.043701799485861184, "grad_norm": 0.0, "learning_rate": 1.999132110911845e-05, "loss": 1.7866, "step": 17 }, { "epoch": 0.04627249357326478, "grad_norm": 0.0, "learning_rate": 1.9987503192660842e-05, "loss": 1.7612, "step": 18 }, { "epoch": 0.04884318766066838, "grad_norm": 0.0, "learning_rate": 1.9982991736284914e-05, "loss": 1.7944, "step": 19 }, { "epoch": 0.05141388174807198, "grad_norm": 0.0, "learning_rate": 1.997778705326968e-05, "loss": 1.7656, "step": 20 }, { "epoch": 0.05398457583547558, "grad_norm": 0.0, "learning_rate": 1.9971889505032337e-05, "loss": 1.7554, "step": 21 }, { "epoch": 0.056555269922879174, "grad_norm": 0.0, "learning_rate": 1.9965299501103178e-05, "loss": 1.7637, "step": 22 }, { "epoch": 0.05912596401028278, "grad_norm": 0.0, "learning_rate": 1.995801749909715e-05, "loss": 1.7803, "step": 23 }, { "epoch": 0.061696658097686374, "grad_norm": 0.0, "learning_rate": 1.995004400468209e-05, "loss": 1.7402, "step": 24 }, { "epoch": 0.06426735218508997, "grad_norm": 0.0, "learning_rate": 1.9941379571543597e-05, "loss": 1.7017, "step": 25 }, { "epoch": 0.06683804627249357, "grad_norm": 0.0, "learning_rate": 1.9932024801346583e-05, "loss": 1.7671, "step": 26 }, { "epoch": 0.06940874035989718, "grad_norm": 0.0, "learning_rate": 1.992198034369349e-05, "loss": 1.7014, "step": 27 }, { "epoch": 0.07197943444730077, "grad_norm": 0.0, "learning_rate": 1.991124689607921e-05, "loss": 1.7532, "step": 28 }, { "epoch": 0.07455012853470437, "grad_norm": 0.0, "learning_rate": 1.9899825203842613e-05, "loss": 1.7129, "step": 29 }, { "epoch": 0.07712082262210797, "grad_norm": 0.0, "learning_rate": 1.988771606011481e-05, "loss": 1.7126, "step": 30 }, { "epoch": 0.07969151670951156, "grad_norm": 0.0, "learning_rate": 1.987492030576407e-05, "loss": 1.7393, "step": 31 }, { "epoch": 0.08226221079691516, "grad_norm": 0.0, "learning_rate": 1.986143882933744e-05, "loss": 1.7742, "step": 32 }, { "epoch": 0.08483290488431877, "grad_norm": 0.0, "learning_rate": 1.9847272566999026e-05, "loss": 1.7483, "step": 33 }, { "epoch": 0.08740359897172237, "grad_norm": 0.0, "learning_rate": 1.9832422502465013e-05, "loss": 1.707, "step": 34 }, { "epoch": 0.08997429305912596, "grad_norm": 0.0, "learning_rate": 1.9816889666935318e-05, "loss": 1.7507, "step": 35 }, { "epoch": 0.09254498714652956, "grad_norm": 0.0, "learning_rate": 1.9800675139022006e-05, "loss": 1.7339, "step": 36 }, { "epoch": 0.09511568123393316, "grad_norm": 0.0, "learning_rate": 1.9783780044674402e-05, "loss": 1.748, "step": 37 }, { "epoch": 0.09768637532133675, "grad_norm": 0.0, "learning_rate": 1.976620555710087e-05, "loss": 1.686, "step": 38 }, { "epoch": 0.10025706940874037, "grad_norm": 0.0, "learning_rate": 1.974795289668737e-05, "loss": 1.7043, "step": 39 }, { "epoch": 0.10282776349614396, "grad_norm": 0.0, "learning_rate": 1.972902333091271e-05, "loss": 1.7646, "step": 40 }, { "epoch": 0.10539845758354756, "grad_norm": 0.0, "learning_rate": 1.9709418174260523e-05, "loss": 1.6802, "step": 41 }, { "epoch": 0.10796915167095116, "grad_norm": 0.0, "learning_rate": 1.9689138788127994e-05, "loss": 1.6775, "step": 42 }, { "epoch": 0.11053984575835475, "grad_norm": 0.0, "learning_rate": 1.966818658073133e-05, "loss": 1.6633, "step": 43 }, { "epoch": 0.11311053984575835, "grad_norm": 0.0, "learning_rate": 1.9646563007007952e-05, "loss": 1.7637, "step": 44 }, { "epoch": 0.11568123393316196, "grad_norm": 0.0, "learning_rate": 1.9624269568515486e-05, "loss": 1.7087, "step": 45 }, { "epoch": 0.11825192802056556, "grad_norm": 0.0, "learning_rate": 1.960130781332748e-05, "loss": 1.6562, "step": 46 }, { "epoch": 0.12082262210796915, "grad_norm": 0.0, "learning_rate": 1.957767933592591e-05, "loss": 1.698, "step": 47 }, { "epoch": 0.12339331619537275, "grad_norm": 0.0, "learning_rate": 1.955338577709046e-05, "loss": 1.7444, "step": 48 }, { "epoch": 0.12596401028277635, "grad_norm": 0.0, "learning_rate": 1.9528428823784567e-05, "loss": 1.6743, "step": 49 }, { "epoch": 0.12853470437017994, "grad_norm": 0.0, "learning_rate": 1.9502810209038302e-05, "loss": 1.6741, "step": 50 }, { "epoch": 0.13110539845758354, "grad_norm": 0.0, "learning_rate": 1.9476531711828027e-05, "loss": 1.708, "step": 51 }, { "epoch": 0.13367609254498714, "grad_norm": 0.0, "learning_rate": 1.9449595156952827e-05, "loss": 1.6587, "step": 52 }, { "epoch": 0.13624678663239073, "grad_norm": 0.0, "learning_rate": 1.9422002414907837e-05, "loss": 1.6887, "step": 53 }, { "epoch": 0.13881748071979436, "grad_norm": 0.0, "learning_rate": 1.9393755401754324e-05, "loss": 1.6714, "step": 54 }, { "epoch": 0.14138817480719795, "grad_norm": 0.0, "learning_rate": 1.936485607898665e-05, "loss": 1.7432, "step": 55 }, { "epoch": 0.14395886889460155, "grad_norm": 0.0, "learning_rate": 1.9335306453396066e-05, "loss": 1.6675, "step": 56 }, { "epoch": 0.14652956298200515, "grad_norm": 0.0, "learning_rate": 1.9305108576931336e-05, "loss": 1.6436, "step": 57 }, { "epoch": 0.14910025706940874, "grad_norm": 0.0, "learning_rate": 1.927426454655627e-05, "loss": 1.6853, "step": 58 }, { "epoch": 0.15167095115681234, "grad_norm": 0.0, "learning_rate": 1.924277650410412e-05, "loss": 1.6641, "step": 59 }, { "epoch": 0.15424164524421594, "grad_norm": 0.0, "learning_rate": 1.9210646636128805e-05, "loss": 1.7385, "step": 60 }, { "epoch": 0.15681233933161953, "grad_norm": 0.0, "learning_rate": 1.9177877173753127e-05, "loss": 1.7178, "step": 61 }, { "epoch": 0.15938303341902313, "grad_norm": 0.0, "learning_rate": 1.91444703925138e-05, "loss": 1.6785, "step": 62 }, { "epoch": 0.16195372750642673, "grad_norm": 0.0, "learning_rate": 1.9110428612203463e-05, "loss": 1.6799, "step": 63 }, { "epoch": 0.16452442159383032, "grad_norm": 0.0, "learning_rate": 1.9075754196709574e-05, "loss": 1.7075, "step": 64 }, { "epoch": 0.16709511568123395, "grad_norm": 0.0, "learning_rate": 1.904044955385026e-05, "loss": 1.6621, "step": 65 }, { "epoch": 0.16966580976863754, "grad_norm": 0.0, "learning_rate": 1.9004517135207127e-05, "loss": 1.6492, "step": 66 }, { "epoch": 0.17223650385604114, "grad_norm": 0.0, "learning_rate": 1.8967959435955027e-05, "loss": 1.7297, "step": 67 }, { "epoch": 0.17480719794344474, "grad_norm": 0.0, "learning_rate": 1.893077899468876e-05, "loss": 1.6882, "step": 68 }, { "epoch": 0.17737789203084833, "grad_norm": 0.0, "learning_rate": 1.889297839324682e-05, "loss": 1.6714, "step": 69 }, { "epoch": 0.17994858611825193, "grad_norm": 0.0, "learning_rate": 1.8854560256532098e-05, "loss": 1.6489, "step": 70 }, { "epoch": 0.18251928020565553, "grad_norm": 0.0, "learning_rate": 1.8815527252329624e-05, "loss": 1.6721, "step": 71 }, { "epoch": 0.18508997429305912, "grad_norm": 0.0, "learning_rate": 1.8775882091121282e-05, "loss": 1.6533, "step": 72 }, { "epoch": 0.18766066838046272, "grad_norm": 0.0, "learning_rate": 1.8735627525897618e-05, "loss": 1.6443, "step": 73 }, { "epoch": 0.19023136246786632, "grad_norm": 0.0, "learning_rate": 1.8694766351966665e-05, "loss": 1.6631, "step": 74 }, { "epoch": 0.1928020565552699, "grad_norm": 0.0, "learning_rate": 1.8653301406759827e-05, "loss": 1.6873, "step": 75 }, { "epoch": 0.1953727506426735, "grad_norm": 0.0, "learning_rate": 1.8611235569634852e-05, "loss": 1.7046, "step": 76 }, { "epoch": 0.19794344473007713, "grad_norm": 0.0, "learning_rate": 1.8568571761675893e-05, "loss": 1.7002, "step": 77 }, { "epoch": 0.20051413881748073, "grad_norm": 0.0, "learning_rate": 1.8525312945490647e-05, "loss": 1.698, "step": 78 }, { "epoch": 0.20308483290488433, "grad_norm": 0.0, "learning_rate": 1.8481462125004647e-05, "loss": 1.6765, "step": 79 }, { "epoch": 0.20565552699228792, "grad_norm": 0.0, "learning_rate": 1.8437022345252666e-05, "loss": 1.7185, "step": 80 }, { "epoch": 0.20822622107969152, "grad_norm": 0.0, "learning_rate": 1.8391996692167242e-05, "loss": 1.6653, "step": 81 }, { "epoch": 0.21079691516709512, "grad_norm": 0.0, "learning_rate": 1.8346388292364438e-05, "loss": 1.7129, "step": 82 }, { "epoch": 0.2133676092544987, "grad_norm": 0.0, "learning_rate": 1.8300200312926674e-05, "loss": 1.6709, "step": 83 }, { "epoch": 0.2159383033419023, "grad_norm": 0.0, "learning_rate": 1.8253435961182844e-05, "loss": 1.6597, "step": 84 }, { "epoch": 0.2185089974293059, "grad_norm": 0.0, "learning_rate": 1.8206098484485563e-05, "loss": 1.6812, "step": 85 }, { "epoch": 0.2210796915167095, "grad_norm": 0.0, "learning_rate": 1.8158191169985696e-05, "loss": 1.6792, "step": 86 }, { "epoch": 0.2236503856041131, "grad_norm": 0.0, "learning_rate": 1.810971734440408e-05, "loss": 1.6404, "step": 87 }, { "epoch": 0.2262210796915167, "grad_norm": 0.0, "learning_rate": 1.806068037380052e-05, "loss": 1.6528, "step": 88 }, { "epoch": 0.22879177377892032, "grad_norm": 0.0, "learning_rate": 1.801108366334004e-05, "loss": 1.6775, "step": 89 }, { "epoch": 0.23136246786632392, "grad_norm": 0.0, "learning_rate": 1.796093065705644e-05, "loss": 1.679, "step": 90 }, { "epoch": 0.23393316195372751, "grad_norm": 0.0, "learning_rate": 1.791022483761312e-05, "loss": 1.658, "step": 91 }, { "epoch": 0.2365038560411311, "grad_norm": 0.0, "learning_rate": 1.7858969726061262e-05, "loss": 1.6277, "step": 92 }, { "epoch": 0.2390745501285347, "grad_norm": 0.0, "learning_rate": 1.7807168881595304e-05, "loss": 1.6602, "step": 93 }, { "epoch": 0.2416452442159383, "grad_norm": 0.0, "learning_rate": 1.7754825901305814e-05, "loss": 1.6758, "step": 94 }, { "epoch": 0.2442159383033419, "grad_norm": 0.0, "learning_rate": 1.7701944419929673e-05, "loss": 1.6353, "step": 95 }, { "epoch": 0.2467866323907455, "grad_norm": 0.0, "learning_rate": 1.7648528109597704e-05, "loss": 1.6602, "step": 96 }, { "epoch": 0.2493573264781491, "grad_norm": 0.0, "learning_rate": 1.7594580679579654e-05, "loss": 1.6404, "step": 97 }, { "epoch": 0.2519280205655527, "grad_norm": 0.0, "learning_rate": 1.7540105876026647e-05, "loss": 1.6365, "step": 98 }, { "epoch": 0.2544987146529563, "grad_norm": 0.0, "learning_rate": 1.7485107481711014e-05, "loss": 1.6353, "step": 99 }, { "epoch": 0.2570694087403599, "grad_norm": 0.0, "learning_rate": 1.7429589315763637e-05, "loss": 1.6541, "step": 100 }, { "epoch": 0.2596401028277635, "grad_norm": 0.0, "learning_rate": 1.737355523340875e-05, "loss": 1.6133, "step": 101 }, { "epoch": 0.2622107969151671, "grad_norm": 0.0, "learning_rate": 1.7317009125696208e-05, "loss": 1.6687, "step": 102 }, { "epoch": 0.2647814910025707, "grad_norm": 0.0, "learning_rate": 1.725995491923131e-05, "loss": 1.636, "step": 103 }, { "epoch": 0.26735218508997427, "grad_norm": 0.0, "learning_rate": 1.7202396575902118e-05, "loss": 1.6497, "step": 104 }, { "epoch": 0.2699228791773779, "grad_norm": 0.0, "learning_rate": 1.714433809260435e-05, "loss": 1.6458, "step": 105 }, { "epoch": 0.27249357326478146, "grad_norm": 0.0, "learning_rate": 1.7085783500963825e-05, "loss": 1.624, "step": 106 }, { "epoch": 0.2750642673521851, "grad_norm": 0.0, "learning_rate": 1.702673686705651e-05, "loss": 1.6353, "step": 107 }, { "epoch": 0.2776349614395887, "grad_norm": 0.0, "learning_rate": 1.6967202291126174e-05, "loss": 1.6406, "step": 108 }, { "epoch": 0.2802056555269923, "grad_norm": 0.0, "learning_rate": 1.690718390729964e-05, "loss": 1.6323, "step": 109 }, { "epoch": 0.2827763496143959, "grad_norm": 0.0, "learning_rate": 1.684668588329973e-05, "loss": 1.665, "step": 110 }, { "epoch": 0.2853470437017995, "grad_norm": 0.0, "learning_rate": 1.6785712420155864e-05, "loss": 1.635, "step": 111 }, { "epoch": 0.2879177377892031, "grad_norm": 0.0, "learning_rate": 1.67242677519123e-05, "loss": 1.6335, "step": 112 }, { "epoch": 0.29048843187660667, "grad_norm": 0.0, "learning_rate": 1.6662356145334158e-05, "loss": 1.6846, "step": 113 }, { "epoch": 0.2930591259640103, "grad_norm": 0.0, "learning_rate": 1.6599981899611103e-05, "loss": 1.6353, "step": 114 }, { "epoch": 0.29562982005141386, "grad_norm": 0.0, "learning_rate": 1.653714934605883e-05, "loss": 1.6189, "step": 115 }, { "epoch": 0.2982005141388175, "grad_norm": 0.0, "learning_rate": 1.647386284781828e-05, "loss": 1.7021, "step": 116 }, { "epoch": 0.30077120822622105, "grad_norm": 0.0, "learning_rate": 1.6410126799552653e-05, "loss": 1.6777, "step": 117 }, { "epoch": 0.3033419023136247, "grad_norm": 0.0, "learning_rate": 1.6345945627142264e-05, "loss": 1.6377, "step": 118 }, { "epoch": 0.3059125964010283, "grad_norm": 0.0, "learning_rate": 1.628132378737718e-05, "loss": 1.6616, "step": 119 }, { "epoch": 0.30848329048843187, "grad_norm": 0.0, "learning_rate": 1.6216265767647756e-05, "loss": 1.616, "step": 120 }, { "epoch": 0.3110539845758355, "grad_norm": 0.0, "learning_rate": 1.615077608563302e-05, "loss": 1.6816, "step": 121 }, { "epoch": 0.31362467866323906, "grad_norm": 0.0, "learning_rate": 1.6084859288986957e-05, "loss": 1.6099, "step": 122 }, { "epoch": 0.3161953727506427, "grad_norm": 0.0, "learning_rate": 1.601851995502272e-05, "loss": 1.6274, "step": 123 }, { "epoch": 0.31876606683804626, "grad_norm": 0.0, "learning_rate": 1.5951762690394788e-05, "loss": 1.6663, "step": 124 }, { "epoch": 0.3213367609254499, "grad_norm": 0.0, "learning_rate": 1.5884592130779056e-05, "loss": 1.6494, "step": 125 }, { "epoch": 0.32390745501285345, "grad_norm": 0.0, "learning_rate": 1.581701294055095e-05, "loss": 1.614, "step": 126 }, { "epoch": 0.3264781491002571, "grad_norm": 0.0, "learning_rate": 1.5749029812461515e-05, "loss": 1.6265, "step": 127 }, { "epoch": 0.32904884318766064, "grad_norm": 0.0, "learning_rate": 1.568064746731156e-05, "loss": 1.5913, "step": 128 }, { "epoch": 0.33161953727506427, "grad_norm": 0.0, "learning_rate": 1.5611870653623826e-05, "loss": 1.5984, "step": 129 }, { "epoch": 0.3341902313624679, "grad_norm": 0.0, "learning_rate": 1.5542704147313257e-05, "loss": 1.6343, "step": 130 }, { "epoch": 0.33676092544987146, "grad_norm": 0.0, "learning_rate": 1.5473152751355353e-05, "loss": 1.6355, "step": 131 }, { "epoch": 0.3393316195372751, "grad_norm": 0.0, "learning_rate": 1.5403221295452647e-05, "loss": 1.647, "step": 132 }, { "epoch": 0.34190231362467866, "grad_norm": 0.0, "learning_rate": 1.5332914635699327e-05, "loss": 1.6191, "step": 133 }, { "epoch": 0.3444730077120823, "grad_norm": 0.0, "learning_rate": 1.5262237654244026e-05, "loss": 1.624, "step": 134 }, { "epoch": 0.34704370179948585, "grad_norm": 0.0, "learning_rate": 1.5191195258950804e-05, "loss": 1.6055, "step": 135 }, { "epoch": 0.3496143958868895, "grad_norm": 0.0, "learning_rate": 1.5119792383058338e-05, "loss": 1.6492, "step": 136 }, { "epoch": 0.35218508997429304, "grad_norm": 0.0, "learning_rate": 1.5048033984837352e-05, "loss": 1.6155, "step": 137 }, { "epoch": 0.35475578406169667, "grad_norm": 0.0, "learning_rate": 1.4975925047246319e-05, "loss": 1.6042, "step": 138 }, { "epoch": 0.35732647814910024, "grad_norm": 0.0, "learning_rate": 1.4903470577585433e-05, "loss": 1.6367, "step": 139 }, { "epoch": 0.35989717223650386, "grad_norm": 0.0, "learning_rate": 1.4830675607148899e-05, "loss": 1.5928, "step": 140 }, { "epoch": 0.36246786632390743, "grad_norm": 0.0, "learning_rate": 1.475754519087557e-05, "loss": 1.6526, "step": 141 }, { "epoch": 0.36503856041131105, "grad_norm": 0.0, "learning_rate": 1.4684084406997903e-05, "loss": 1.6362, "step": 142 }, { "epoch": 0.3676092544987147, "grad_norm": 0.0, "learning_rate": 1.4610298356689341e-05, "loss": 1.6201, "step": 143 }, { "epoch": 0.37017994858611825, "grad_norm": 0.0, "learning_rate": 1.453619216371008e-05, "loss": 1.6162, "step": 144 }, { "epoch": 0.37275064267352187, "grad_norm": 0.0, "learning_rate": 1.446177097405127e-05, "loss": 1.6172, "step": 145 }, { "epoch": 0.37532133676092544, "grad_norm": 0.0, "learning_rate": 1.4387039955577668e-05, "loss": 1.6301, "step": 146 }, { "epoch": 0.37789203084832906, "grad_norm": 0.0, "learning_rate": 1.4312004297668791e-05, "loss": 1.6096, "step": 147 }, { "epoch": 0.38046272493573263, "grad_norm": 0.0, "learning_rate": 1.4236669210858544e-05, "loss": 1.6152, "step": 148 }, { "epoch": 0.38303341902313626, "grad_norm": 0.0, "learning_rate": 1.4161039926473412e-05, "loss": 1.6321, "step": 149 }, { "epoch": 0.3856041131105398, "grad_norm": 0.0, "learning_rate": 1.4085121696269185e-05, "loss": 1.5957, "step": 150 }, { "epoch": 0.38817480719794345, "grad_norm": 0.0, "learning_rate": 1.4008919792066273e-05, "loss": 1.6421, "step": 151 }, { "epoch": 0.390745501285347, "grad_norm": 0.0, "learning_rate": 1.3932439505383628e-05, "loss": 1.6189, "step": 152 }, { "epoch": 0.39331619537275064, "grad_norm": 0.0, "learning_rate": 1.385568614707129e-05, "loss": 1.6106, "step": 153 }, { "epoch": 0.39588688946015427, "grad_norm": 0.0, "learning_rate": 1.3778665046941616e-05, "loss": 1.6321, "step": 154 }, { "epoch": 0.39845758354755784, "grad_norm": 0.0, "learning_rate": 1.3701381553399147e-05, "loss": 1.5796, "step": 155 }, { "epoch": 0.40102827763496146, "grad_norm": 0.0, "learning_rate": 1.3623841033069232e-05, "loss": 1.6555, "step": 156 }, { "epoch": 0.40359897172236503, "grad_norm": 0.0, "learning_rate": 1.3546048870425356e-05, "loss": 1.6028, "step": 157 }, { "epoch": 0.40616966580976865, "grad_norm": 0.0, "learning_rate": 1.3468010467415248e-05, "loss": 1.5969, "step": 158 }, { "epoch": 0.4087403598971722, "grad_norm": 0.0, "learning_rate": 1.3389731243085747e-05, "loss": 1.6077, "step": 159 }, { "epoch": 0.41131105398457585, "grad_norm": 0.0, "learning_rate": 1.3311216633206514e-05, "loss": 1.5762, "step": 160 }, { "epoch": 0.4138817480719794, "grad_norm": 0.0, "learning_rate": 1.3232472089892567e-05, "loss": 1.6079, "step": 161 }, { "epoch": 0.41645244215938304, "grad_norm": 0.0, "learning_rate": 1.315350308122567e-05, "loss": 1.5994, "step": 162 }, { "epoch": 0.4190231362467866, "grad_norm": 0.0, "learning_rate": 1.3074315090874652e-05, "loss": 1.5732, "step": 163 }, { "epoch": 0.42159383033419023, "grad_norm": 0.0, "learning_rate": 1.2994913617714573e-05, "loss": 1.5901, "step": 164 }, { "epoch": 0.4241645244215938, "grad_norm": 0.0, "learning_rate": 1.2915304175444929e-05, "loss": 1.6138, "step": 165 }, { "epoch": 0.4267352185089974, "grad_norm": 0.0, "learning_rate": 1.2835492292206735e-05, "loss": 1.5945, "step": 166 }, { "epoch": 0.42930591259640105, "grad_norm": 0.0, "learning_rate": 1.2755483510198668e-05, "loss": 1.6067, "step": 167 }, { "epoch": 0.4318766066838046, "grad_norm": 0.0, "learning_rate": 1.2675283385292212e-05, "loss": 1.5957, "step": 168 }, { "epoch": 0.43444730077120824, "grad_norm": 0.0, "learning_rate": 1.2594897486645836e-05, "loss": 1.6089, "step": 169 }, { "epoch": 0.4370179948586118, "grad_norm": 0.0, "learning_rate": 1.2514331396318298e-05, "loss": 1.6335, "step": 170 }, { "epoch": 0.43958868894601544, "grad_norm": 0.0, "learning_rate": 1.2433590708880991e-05, "loss": 1.6406, "step": 171 }, { "epoch": 0.442159383033419, "grad_norm": 0.0, "learning_rate": 1.2352681031029476e-05, "loss": 1.5759, "step": 172 }, { "epoch": 0.44473007712082263, "grad_norm": 0.0, "learning_rate": 1.2271607981194132e-05, "loss": 1.5955, "step": 173 }, { "epoch": 0.4473007712082262, "grad_norm": 0.0, "learning_rate": 1.2190377189150016e-05, "loss": 1.6069, "step": 174 }, { "epoch": 0.4498714652956298, "grad_norm": 0.0, "learning_rate": 1.2108994295625924e-05, "loss": 1.5796, "step": 175 }, { "epoch": 0.4524421593830334, "grad_norm": 0.0, "learning_rate": 1.2027464951912703e-05, "loss": 1.5952, "step": 176 }, { "epoch": 0.455012853470437, "grad_norm": 0.0, "learning_rate": 1.1945794819470805e-05, "loss": 1.6213, "step": 177 }, { "epoch": 0.45758354755784064, "grad_norm": 0.0, "learning_rate": 1.1863989569537165e-05, "loss": 1.5974, "step": 178 }, { "epoch": 0.4601542416452442, "grad_norm": 0.0, "learning_rate": 1.1782054882731377e-05, "loss": 1.5188, "step": 179 }, { "epoch": 0.46272493573264784, "grad_norm": 0.0, "learning_rate": 1.1699996448661242e-05, "loss": 1.5964, "step": 180 }, { "epoch": 0.4652956298200514, "grad_norm": 0.0, "learning_rate": 1.161781996552765e-05, "loss": 1.5681, "step": 181 }, { "epoch": 0.46786632390745503, "grad_norm": 0.0, "learning_rate": 1.1535531139728918e-05, "loss": 1.5938, "step": 182 }, { "epoch": 0.4704370179948586, "grad_norm": 0.0, "learning_rate": 1.1453135685464524e-05, "loss": 1.574, "step": 183 }, { "epoch": 0.4730077120822622, "grad_norm": 0.0, "learning_rate": 1.1370639324338313e-05, "loss": 1.5872, "step": 184 }, { "epoch": 0.4755784061696658, "grad_norm": 0.0, "learning_rate": 1.1288047784961166e-05, "loss": 1.5806, "step": 185 }, { "epoch": 0.4781491002570694, "grad_norm": 0.0, "learning_rate": 1.1205366802553231e-05, "loss": 1.5542, "step": 186 }, { "epoch": 0.480719794344473, "grad_norm": 0.0, "learning_rate": 1.1122602118545642e-05, "loss": 1.5723, "step": 187 }, { "epoch": 0.4832904884318766, "grad_norm": 0.0, "learning_rate": 1.1039759480181836e-05, "loss": 1.5645, "step": 188 }, { "epoch": 0.48586118251928023, "grad_norm": 0.0, "learning_rate": 1.0956844640118462e-05, "loss": 1.5884, "step": 189 }, { "epoch": 0.4884318766066838, "grad_norm": 0.0, "learning_rate": 1.0873863356025911e-05, "loss": 1.5559, "step": 190 }, { "epoch": 0.4910025706940874, "grad_norm": 0.0, "learning_rate": 1.0790821390188493e-05, "loss": 1.5623, "step": 191 }, { "epoch": 0.493573264781491, "grad_norm": 0.0, "learning_rate": 1.0707724509104318e-05, "loss": 1.5916, "step": 192 }, { "epoch": 0.4961439588688946, "grad_norm": 0.0, "learning_rate": 1.062457848308484e-05, "loss": 1.5696, "step": 193 }, { "epoch": 0.4987146529562982, "grad_norm": 0.0, "learning_rate": 1.0541389085854177e-05, "loss": 1.5913, "step": 194 }, { "epoch": 0.5012853470437018, "grad_norm": 0.0, "learning_rate": 1.0458162094148185e-05, "loss": 1.5439, "step": 195 }, { "epoch": 0.5038560411311054, "grad_norm": 0.0, "learning_rate": 1.0374903287313307e-05, "loss": 1.6013, "step": 196 }, { "epoch": 0.506426735218509, "grad_norm": 0.0, "learning_rate": 1.029161844690525e-05, "loss": 1.5813, "step": 197 }, { "epoch": 0.5089974293059126, "grad_norm": 0.0, "learning_rate": 1.0208313356287505e-05, "loss": 1.5757, "step": 198 }, { "epoch": 0.5115681233933161, "grad_norm": 0.0, "learning_rate": 1.0124993800229774e-05, "loss": 1.5508, "step": 199 }, { "epoch": 0.5141388174807198, "grad_norm": 0.0, "learning_rate": 1.004166556450623e-05, "loss": 1.5774, "step": 200 }, { "epoch": 0.5167095115681234, "grad_norm": 0.0, "learning_rate": 9.958334435493776e-06, "loss": 1.594, "step": 201 }, { "epoch": 0.519280205655527, "grad_norm": 0.0, "learning_rate": 9.87500619977023e-06, "loss": 1.5977, "step": 202 }, { "epoch": 0.5218508997429306, "grad_norm": 0.0, "learning_rate": 9.791686643712498e-06, "loss": 1.5938, "step": 203 }, { "epoch": 0.5244215938303342, "grad_norm": 0.0, "learning_rate": 9.708381553094754e-06, "loss": 1.5371, "step": 204 }, { "epoch": 0.5269922879177378, "grad_norm": 0.0, "learning_rate": 9.625096712686694e-06, "loss": 1.5315, "step": 205 }, { "epoch": 0.5295629820051414, "grad_norm": 0.0, "learning_rate": 9.541837905851817e-06, "loss": 1.5708, "step": 206 }, { "epoch": 0.532133676092545, "grad_norm": 0.0, "learning_rate": 9.458610914145826e-06, "loss": 1.5691, "step": 207 }, { "epoch": 0.5347043701799485, "grad_norm": 0.0, "learning_rate": 9.375421516915165e-06, "loss": 1.5881, "step": 208 }, { "epoch": 0.5372750642673522, "grad_norm": 0.0, "learning_rate": 9.292275490895685e-06, "loss": 1.5732, "step": 209 }, { "epoch": 0.5398457583547558, "grad_norm": 0.0, "learning_rate": 9.209178609811509e-06, "loss": 1.5562, "step": 210 }, { "epoch": 0.5424164524421594, "grad_norm": 0.0, "learning_rate": 9.126136643974094e-06, "loss": 1.5603, "step": 211 }, { "epoch": 0.5449871465295629, "grad_norm": 0.0, "learning_rate": 9.043155359881538e-06, "loss": 1.5352, "step": 212 }, { "epoch": 0.5475578406169666, "grad_norm": 0.0, "learning_rate": 8.960240519818167e-06, "loss": 1.5647, "step": 213 }, { "epoch": 0.5501285347043702, "grad_norm": 0.0, "learning_rate": 8.877397881454358e-06, "loss": 1.5747, "step": 214 }, { "epoch": 0.5526992287917738, "grad_norm": 0.0, "learning_rate": 8.79463319744677e-06, "loss": 1.5586, "step": 215 }, { "epoch": 0.5552699228791774, "grad_norm": 0.0, "learning_rate": 8.711952215038837e-06, "loss": 1.5527, "step": 216 }, { "epoch": 0.5578406169665809, "grad_norm": 0.0, "learning_rate": 8.629360675661693e-06, "loss": 1.5374, "step": 217 }, { "epoch": 0.5604113110539846, "grad_norm": 0.0, "learning_rate": 8.546864314535478e-06, "loss": 1.5647, "step": 218 }, { "epoch": 0.5629820051413882, "grad_norm": 0.0, "learning_rate": 8.464468860271084e-06, "loss": 1.5356, "step": 219 }, { "epoch": 0.5655526992287918, "grad_norm": 0.0, "learning_rate": 8.382180034472353e-06, "loss": 1.5483, "step": 220 }, { "epoch": 0.5681233933161953, "grad_norm": 0.0, "learning_rate": 8.30000355133876e-06, "loss": 1.5386, "step": 221 }, { "epoch": 0.570694087403599, "grad_norm": 0.0, "learning_rate": 8.217945117268624e-06, "loss": 1.5552, "step": 222 }, { "epoch": 0.5732647814910026, "grad_norm": 0.0, "learning_rate": 8.136010430462837e-06, "loss": 1.5635, "step": 223 }, { "epoch": 0.5758354755784062, "grad_norm": 0.0, "learning_rate": 8.0542051805292e-06, "loss": 1.5657, "step": 224 }, { "epoch": 0.5784061696658098, "grad_norm": 0.0, "learning_rate": 7.9725350480873e-06, "loss": 1.5386, "step": 225 }, { "epoch": 0.5809768637532133, "grad_norm": 0.0, "learning_rate": 7.89100570437408e-06, "loss": 1.6018, "step": 226 }, { "epoch": 0.583547557840617, "grad_norm": 0.0, "learning_rate": 7.809622810849986e-06, "loss": 1.5396, "step": 227 }, { "epoch": 0.5861182519280206, "grad_norm": 0.0, "learning_rate": 7.72839201880587e-06, "loss": 1.5474, "step": 228 }, { "epoch": 0.5886889460154242, "grad_norm": 0.0, "learning_rate": 7.647318968970528e-06, "loss": 1.5654, "step": 229 }, { "epoch": 0.5912596401028277, "grad_norm": 0.0, "learning_rate": 7.566409291119008e-06, "loss": 1.5732, "step": 230 }, { "epoch": 0.5938303341902313, "grad_norm": 0.0, "learning_rate": 7.485668603681706e-06, "loss": 1.5779, "step": 231 }, { "epoch": 0.596401028277635, "grad_norm": 0.0, "learning_rate": 7.405102513354166e-06, "loss": 1.5449, "step": 232 }, { "epoch": 0.5989717223650386, "grad_norm": 0.0, "learning_rate": 7.324716614707794e-06, "loss": 1.5408, "step": 233 }, { "epoch": 0.6015424164524421, "grad_norm": 0.0, "learning_rate": 7.2445164898013345e-06, "loss": 1.5403, "step": 234 }, { "epoch": 0.6041131105398457, "grad_norm": 0.0, "learning_rate": 7.1645077077932666e-06, "loss": 1.5159, "step": 235 }, { "epoch": 0.6066838046272494, "grad_norm": 0.0, "learning_rate": 7.084695824555074e-06, "loss": 1.5557, "step": 236 }, { "epoch": 0.609254498714653, "grad_norm": 0.0, "learning_rate": 7.005086382285426e-06, "loss": 1.5625, "step": 237 }, { "epoch": 0.6118251928020566, "grad_norm": 0.0, "learning_rate": 6.925684909125354e-06, "loss": 1.552, "step": 238 }, { "epoch": 0.6143958868894601, "grad_norm": 0.0, "learning_rate": 6.84649691877433e-06, "loss": 1.5488, "step": 239 }, { "epoch": 0.6169665809768637, "grad_norm": 0.0, "learning_rate": 6.767527910107437e-06, "loss": 1.5181, "step": 240 }, { "epoch": 0.6195372750642674, "grad_norm": 0.0, "learning_rate": 6.688783366793488e-06, "loss": 1.5403, "step": 241 }, { "epoch": 0.622107969151671, "grad_norm": 0.0, "learning_rate": 6.610268756914254e-06, "loss": 1.5662, "step": 242 }, { "epoch": 0.6246786632390745, "grad_norm": 0.0, "learning_rate": 6.5319895325847535e-06, "loss": 1.5222, "step": 243 }, { "epoch": 0.6272493573264781, "grad_norm": 0.0, "learning_rate": 6.453951129574644e-06, "loss": 1.5439, "step": 244 }, { "epoch": 0.6298200514138818, "grad_norm": 0.0, "learning_rate": 6.3761589669307745e-06, "loss": 1.5312, "step": 245 }, { "epoch": 0.6323907455012854, "grad_norm": 0.0, "learning_rate": 6.298618446600856e-06, "loss": 1.5383, "step": 246 }, { "epoch": 0.6349614395886889, "grad_norm": 0.0, "learning_rate": 6.221334953058389e-06, "loss": 1.5393, "step": 247 }, { "epoch": 0.6375321336760925, "grad_norm": 0.0, "learning_rate": 6.144313852928712e-06, "loss": 1.5247, "step": 248 }, { "epoch": 0.6401028277634961, "grad_norm": 0.0, "learning_rate": 6.067560494616374e-06, "loss": 1.5454, "step": 249 }, { "epoch": 0.6426735218508998, "grad_norm": 0.0, "learning_rate": 5.9910802079337285e-06, "loss": 1.5215, "step": 250 }, { "epoch": 0.6452442159383034, "grad_norm": 0.0, "learning_rate": 5.9148783037308154e-06, "loss": 1.5427, "step": 251 }, { "epoch": 0.6478149100257069, "grad_norm": 0.0, "learning_rate": 5.838960073526589e-06, "loss": 1.5427, "step": 252 }, { "epoch": 0.6503856041131105, "grad_norm": 0.0, "learning_rate": 5.763330789141457e-06, "loss": 1.5552, "step": 253 }, { "epoch": 0.6529562982005142, "grad_norm": 0.0, "learning_rate": 5.687995702331211e-06, "loss": 1.5388, "step": 254 }, { "epoch": 0.6555269922879178, "grad_norm": 0.0, "learning_rate": 5.612960044422335e-06, "loss": 1.5854, "step": 255 }, { "epoch": 0.6580976863753213, "grad_norm": 0.0, "learning_rate": 5.538229025948729e-06, "loss": 1.5588, "step": 256 }, { "epoch": 0.6606683804627249, "grad_norm": 0.0, "learning_rate": 5.463807836289921e-06, "loss": 1.5217, "step": 257 }, { "epoch": 0.6632390745501285, "grad_norm": 0.0, "learning_rate": 5.389701643310661e-06, "loss": 1.5066, "step": 258 }, { "epoch": 0.6658097686375322, "grad_norm": 0.0, "learning_rate": 5.3159155930021e-06, "loss": 1.5327, "step": 259 }, { "epoch": 0.6683804627249358, "grad_norm": 0.0, "learning_rate": 5.2424548091244334e-06, "loss": 1.5522, "step": 260 }, { "epoch": 0.6709511568123393, "grad_norm": 0.0, "learning_rate": 5.169324392851105e-06, "loss": 1.543, "step": 261 }, { "epoch": 0.6735218508997429, "grad_norm": 0.0, "learning_rate": 5.096529422414571e-06, "loss": 1.5483, "step": 262 }, { "epoch": 0.6760925449871465, "grad_norm": 0.0, "learning_rate": 5.0240749527536845e-06, "loss": 1.5234, "step": 263 }, { "epoch": 0.6786632390745502, "grad_norm": 0.0, "learning_rate": 4.951966015162652e-06, "loss": 1.5315, "step": 264 }, { "epoch": 0.6812339331619537, "grad_norm": 0.0, "learning_rate": 4.880207616941663e-06, "loss": 1.5193, "step": 265 }, { "epoch": 0.6838046272493573, "grad_norm": 0.0, "learning_rate": 4.8088047410492e-06, "loss": 1.5586, "step": 266 }, { "epoch": 0.6863753213367609, "grad_norm": 0.0, "learning_rate": 4.737762345755975e-06, "loss": 1.481, "step": 267 }, { "epoch": 0.6889460154241646, "grad_norm": 0.0, "learning_rate": 4.667085364300678e-06, "loss": 1.5869, "step": 268 }, { "epoch": 0.6915167095115681, "grad_norm": 0.0, "learning_rate": 4.596778704547359e-06, "loss": 1.5366, "step": 269 }, { "epoch": 0.6940874035989717, "grad_norm": 0.0, "learning_rate": 4.526847248644652e-06, "loss": 1.5007, "step": 270 }, { "epoch": 0.6966580976863753, "grad_norm": 0.0, "learning_rate": 4.457295852686746e-06, "loss": 1.5352, "step": 271 }, { "epoch": 0.699228791773779, "grad_norm": 0.0, "learning_rate": 4.388129346376177e-06, "loss": 1.5447, "step": 272 }, { "epoch": 0.7017994858611826, "grad_norm": 0.0, "learning_rate": 4.319352532688444e-06, "loss": 1.5701, "step": 273 }, { "epoch": 0.7043701799485861, "grad_norm": 0.0, "learning_rate": 4.250970187538484e-06, "loss": 1.5, "step": 274 }, { "epoch": 0.7069408740359897, "grad_norm": 0.0, "learning_rate": 4.182987059449056e-06, "loss": 1.5513, "step": 275 }, { "epoch": 0.7095115681233933, "grad_norm": 0.0, "learning_rate": 4.115407869220948e-06, "loss": 1.5007, "step": 276 }, { "epoch": 0.712082262210797, "grad_norm": 0.0, "learning_rate": 4.048237309605216e-06, "loss": 1.5398, "step": 277 }, { "epoch": 0.7146529562982005, "grad_norm": 0.0, "learning_rate": 3.981480044977284e-06, "loss": 1.5476, "step": 278 }, { "epoch": 0.7172236503856041, "grad_norm": 0.0, "learning_rate": 3.915140711013044e-06, "loss": 1.5015, "step": 279 }, { "epoch": 0.7197943444730077, "grad_norm": 0.0, "learning_rate": 3.849223914366981e-06, "loss": 1.5405, "step": 280 }, { "epoch": 0.7223650385604113, "grad_norm": 0.0, "learning_rate": 3.7837342323522454e-06, "loss": 1.5413, "step": 281 }, { "epoch": 0.7249357326478149, "grad_norm": 0.0, "learning_rate": 3.7186762126228227e-06, "loss": 1.5874, "step": 282 }, { "epoch": 0.7275064267352185, "grad_norm": 0.0, "learning_rate": 3.654054372857738e-06, "loss": 1.5122, "step": 283 }, { "epoch": 0.7300771208226221, "grad_norm": 0.0, "learning_rate": 3.5898732004473523e-06, "loss": 1.55, "step": 284 }, { "epoch": 0.7326478149100257, "grad_norm": 0.0, "learning_rate": 3.5261371521817247e-06, "loss": 1.5337, "step": 285 }, { "epoch": 0.7352185089974294, "grad_norm": 0.0, "learning_rate": 3.462850653941171e-06, "loss": 1.5159, "step": 286 }, { "epoch": 0.7377892030848329, "grad_norm": 0.0, "learning_rate": 3.4000181003889e-06, "loss": 1.5139, "step": 287 }, { "epoch": 0.7403598971722365, "grad_norm": 0.0, "learning_rate": 3.337643854665843e-06, "loss": 1.499, "step": 288 }, { "epoch": 0.7429305912596401, "grad_norm": 0.0, "learning_rate": 3.2757322480876996e-06, "loss": 1.5149, "step": 289 }, { "epoch": 0.7455012853470437, "grad_norm": 0.0, "learning_rate": 3.2142875798441376e-06, "loss": 1.5098, "step": 290 }, { "epoch": 0.7480719794344473, "grad_norm": 0.0, "learning_rate": 3.15331411670027e-06, "loss": 1.5217, "step": 291 }, { "epoch": 0.7506426735218509, "grad_norm": 0.0, "learning_rate": 3.092816092700366e-06, "loss": 1.5017, "step": 292 }, { "epoch": 0.7532133676092545, "grad_norm": 0.0, "learning_rate": 3.032797708873828e-06, "loss": 1.5398, "step": 293 }, { "epoch": 0.7557840616966581, "grad_norm": 0.0, "learning_rate": 2.97326313294349e-06, "loss": 1.4983, "step": 294 }, { "epoch": 0.7583547557840618, "grad_norm": 0.0, "learning_rate": 2.914216499036178e-06, "loss": 1.5271, "step": 295 }, { "epoch": 0.7609254498714653, "grad_norm": 0.0, "learning_rate": 2.855661907395655e-06, "loss": 1.5286, "step": 296 }, { "epoch": 0.7634961439588689, "grad_norm": 0.0, "learning_rate": 2.7976034240978834e-06, "loss": 1.4954, "step": 297 }, { "epoch": 0.7660668380462725, "grad_norm": 0.0, "learning_rate": 2.740045080768694e-06, "loss": 1.4653, "step": 298 }, { "epoch": 0.7686375321336761, "grad_norm": 0.0, "learning_rate": 2.6829908743037936e-06, "loss": 1.5271, "step": 299 }, { "epoch": 0.7712082262210797, "grad_norm": 0.0, "learning_rate": 2.626444766591253e-06, "loss": 1.48, "step": 300 }, { "epoch": 0.7737789203084833, "grad_norm": 0.0, "learning_rate": 2.570410684236365e-06, "loss": 1.5093, "step": 301 }, { "epoch": 0.7763496143958869, "grad_norm": 0.0, "learning_rate": 2.514892518288988e-06, "loss": 1.531, "step": 302 }, { "epoch": 0.7789203084832905, "grad_norm": 0.0, "learning_rate": 2.4598941239733555e-06, "loss": 1.4795, "step": 303 }, { "epoch": 0.781491002570694, "grad_norm": 0.0, "learning_rate": 2.4054193204203457e-06, "loss": 1.5056, "step": 304 }, { "epoch": 0.7840616966580977, "grad_norm": 0.0, "learning_rate": 2.3514718904022993e-06, "loss": 1.4841, "step": 305 }, { "epoch": 0.7866323907455013, "grad_norm": 0.0, "learning_rate": 2.2980555800703273e-06, "loss": 1.5337, "step": 306 }, { "epoch": 0.7892030848329049, "grad_norm": 0.0, "learning_rate": 2.2451740986941905e-06, "loss": 1.5212, "step": 307 }, { "epoch": 0.7917737789203085, "grad_norm": 0.0, "learning_rate": 2.1928311184046967e-06, "loss": 1.5308, "step": 308 }, { "epoch": 0.794344473007712, "grad_norm": 0.0, "learning_rate": 2.1410302739387424e-06, "loss": 1.5159, "step": 309 }, { "epoch": 0.7969151670951157, "grad_norm": 0.0, "learning_rate": 2.0897751623868833e-06, "loss": 1.5349, "step": 310 }, { "epoch": 0.7994858611825193, "grad_norm": 0.0, "learning_rate": 2.0390693429435626e-06, "loss": 1.5029, "step": 311 }, { "epoch": 0.8020565552699229, "grad_norm": 0.0, "learning_rate": 1.9889163366599607e-06, "loss": 1.519, "step": 312 }, { "epoch": 0.8046272493573264, "grad_norm": 0.0, "learning_rate": 1.939319626199483e-06, "loss": 1.5054, "step": 313 }, { "epoch": 0.8071979434447301, "grad_norm": 0.0, "learning_rate": 1.890282655595922e-06, "loss": 1.4736, "step": 314 }, { "epoch": 0.8097686375321337, "grad_norm": 0.0, "learning_rate": 1.8418088300143044e-06, "loss": 1.5242, "step": 315 }, { "epoch": 0.8123393316195373, "grad_norm": 0.0, "learning_rate": 1.7939015155144378e-06, "loss": 1.5208, "step": 316 }, { "epoch": 0.8149100257069408, "grad_norm": 0.0, "learning_rate": 1.7465640388171589e-06, "loss": 1.5332, "step": 317 }, { "epoch": 0.8174807197943444, "grad_norm": 0.0, "learning_rate": 1.6997996870733268e-06, "loss": 1.4978, "step": 318 }, { "epoch": 0.8200514138817481, "grad_norm": 0.0, "learning_rate": 1.6536117076355652e-06, "loss": 1.4961, "step": 319 }, { "epoch": 0.8226221079691517, "grad_norm": 0.0, "learning_rate": 1.6080033078327585e-06, "loss": 1.5559, "step": 320 }, { "epoch": 0.8251928020565553, "grad_norm": 0.0, "learning_rate": 1.5629776547473397e-06, "loss": 1.5435, "step": 321 }, { "epoch": 0.8277634961439588, "grad_norm": 0.0, "learning_rate": 1.5185378749953538e-06, "loss": 1.4744, "step": 322 }, { "epoch": 0.8303341902313625, "grad_norm": 0.0, "learning_rate": 1.4746870545093528e-06, "loss": 1.4885, "step": 323 }, { "epoch": 0.8329048843187661, "grad_norm": 0.0, "learning_rate": 1.4314282383241097e-06, "loss": 1.5088, "step": 324 }, { "epoch": 0.8354755784061697, "grad_norm": 0.0, "learning_rate": 1.388764430365147e-06, "loss": 1.4971, "step": 325 }, { "epoch": 0.8380462724935732, "grad_norm": 0.0, "learning_rate": 1.3466985932401743e-06, "loss": 1.5269, "step": 326 }, { "epoch": 0.8406169665809768, "grad_norm": 0.0, "learning_rate": 1.3052336480333372e-06, "loss": 1.5088, "step": 327 }, { "epoch": 0.8431876606683805, "grad_norm": 0.0, "learning_rate": 1.2643724741023845e-06, "loss": 1.5046, "step": 328 }, { "epoch": 0.8457583547557841, "grad_norm": 0.0, "learning_rate": 1.2241179088787192e-06, "loss": 1.5217, "step": 329 }, { "epoch": 0.8483290488431876, "grad_norm": 0.0, "learning_rate": 1.1844727476703776e-06, "loss": 1.4951, "step": 330 }, { "epoch": 0.8508997429305912, "grad_norm": 0.0, "learning_rate": 1.1454397434679022e-06, "loss": 1.4941, "step": 331 }, { "epoch": 0.8534704370179949, "grad_norm": 0.0, "learning_rate": 1.1070216067531825e-06, "loss": 1.5122, "step": 332 }, { "epoch": 0.8560411311053985, "grad_norm": 0.0, "learning_rate": 1.0692210053112451e-06, "loss": 1.5427, "step": 333 }, { "epoch": 0.8586118251928021, "grad_norm": 0.0, "learning_rate": 1.032040564044975e-06, "loss": 1.5278, "step": 334 }, { "epoch": 0.8611825192802056, "grad_norm": 0.0, "learning_rate": 9.954828647928727e-07, "loss": 1.4768, "step": 335 }, { "epoch": 0.8637532133676092, "grad_norm": 0.0, "learning_rate": 9.595504461497441e-07, "loss": 1.5066, "step": 336 }, { "epoch": 0.8663239074550129, "grad_norm": 0.0, "learning_rate": 9.242458032904311e-07, "loss": 1.4871, "step": 337 }, { "epoch": 0.8688946015424165, "grad_norm": 0.0, "learning_rate": 8.895713877965373e-07, "loss": 1.5212, "step": 338 }, { "epoch": 0.87146529562982, "grad_norm": 0.0, "learning_rate": 8.555296074861996e-07, "loss": 1.4919, "step": 339 }, { "epoch": 0.8740359897172236, "grad_norm": 0.0, "learning_rate": 8.22122826246875e-07, "loss": 1.5476, "step": 340 }, { "epoch": 0.8766066838046273, "grad_norm": 0.0, "learning_rate": 7.89353363871197e-07, "loss": 1.5142, "step": 341 }, { "epoch": 0.8791773778920309, "grad_norm": 0.0, "learning_rate": 7.572234958958846e-07, "loss": 1.5332, "step": 342 }, { "epoch": 0.8817480719794345, "grad_norm": 0.0, "learning_rate": 7.2573545344373e-07, "loss": 1.4924, "step": 343 }, { "epoch": 0.884318766066838, "grad_norm": 0.0, "learning_rate": 6.948914230686688e-07, "loss": 1.5181, "step": 344 }, { "epoch": 0.8868894601542416, "grad_norm": 0.0, "learning_rate": 6.646935466039373e-07, "loss": 1.5137, "step": 345 }, { "epoch": 0.8894601542416453, "grad_norm": 0.0, "learning_rate": 6.351439210133492e-07, "loss": 1.5056, "step": 346 }, { "epoch": 0.8920308483290489, "grad_norm": 0.0, "learning_rate": 6.062445982456777e-07, "loss": 1.4688, "step": 347 }, { "epoch": 0.8946015424164524, "grad_norm": 0.0, "learning_rate": 5.77997585092166e-07, "loss": 1.5146, "step": 348 }, { "epoch": 0.897172236503856, "grad_norm": 0.0, "learning_rate": 5.504048430471753e-07, "loss": 1.4695, "step": 349 }, { "epoch": 0.8997429305912596, "grad_norm": 0.0, "learning_rate": 5.234682881719766e-07, "loss": 1.5129, "step": 350 }, { "epoch": 0.9023136246786633, "grad_norm": 0.0, "learning_rate": 4.971897909616985e-07, "loss": 1.5061, "step": 351 }, { "epoch": 0.9048843187660668, "grad_norm": 0.0, "learning_rate": 4.715711762154362e-07, "loss": 1.4722, "step": 352 }, { "epoch": 0.9074550128534704, "grad_norm": 0.0, "learning_rate": 4.4661422290954495e-07, "loss": 1.5056, "step": 353 }, { "epoch": 0.910025706940874, "grad_norm": 0.0, "learning_rate": 4.2232066407409067e-07, "loss": 1.5017, "step": 354 }, { "epoch": 0.9125964010282777, "grad_norm": 0.0, "learning_rate": 3.986921866725202e-07, "loss": 1.5393, "step": 355 }, { "epoch": 0.9151670951156813, "grad_norm": 0.0, "learning_rate": 3.7573043148451673e-07, "loss": 1.5034, "step": 356 }, { "epoch": 0.9177377892030848, "grad_norm": 0.0, "learning_rate": 3.5343699299205003e-07, "loss": 1.5139, "step": 357 }, { "epoch": 0.9203084832904884, "grad_norm": 0.0, "learning_rate": 3.3181341926867283e-07, "loss": 1.4788, "step": 358 }, { "epoch": 0.922879177377892, "grad_norm": 0.0, "learning_rate": 3.1086121187200667e-07, "loss": 1.4746, "step": 359 }, { "epoch": 0.9254498714652957, "grad_norm": 0.0, "learning_rate": 2.905818257394799e-07, "loss": 1.5112, "step": 360 }, { "epoch": 0.9280205655526992, "grad_norm": 0.0, "learning_rate": 2.7097666908729283e-07, "loss": 1.5071, "step": 361 }, { "epoch": 0.9305912596401028, "grad_norm": 0.0, "learning_rate": 2.520471033126326e-07, "loss": 1.4773, "step": 362 }, { "epoch": 0.9331619537275064, "grad_norm": 0.0, "learning_rate": 2.3379444289913344e-07, "loss": 1.5146, "step": 363 }, { "epoch": 0.9357326478149101, "grad_norm": 0.0, "learning_rate": 2.1621995532559947e-07, "loss": 1.4978, "step": 364 }, { "epoch": 0.9383033419023136, "grad_norm": 0.0, "learning_rate": 1.9932486097799408e-07, "loss": 1.5183, "step": 365 }, { "epoch": 0.9408740359897172, "grad_norm": 0.0, "learning_rate": 1.8311033306468552e-07, "loss": 1.4761, "step": 366 }, { "epoch": 0.9434447300771208, "grad_norm": 0.0, "learning_rate": 1.6757749753498865e-07, "loss": 1.509, "step": 367 }, { "epoch": 0.9460154241645244, "grad_norm": 0.0, "learning_rate": 1.5272743300097316e-07, "loss": 1.5095, "step": 368 }, { "epoch": 0.9485861182519281, "grad_norm": 0.0, "learning_rate": 1.3856117066256225e-07, "loss": 1.5361, "step": 369 }, { "epoch": 0.9511568123393316, "grad_norm": 0.0, "learning_rate": 1.2507969423593225e-07, "loss": 1.5051, "step": 370 }, { "epoch": 0.9537275064267352, "grad_norm": 0.0, "learning_rate": 1.1228393988519381e-07, "loss": 1.5532, "step": 371 }, { "epoch": 0.9562982005141388, "grad_norm": 0.0, "learning_rate": 1.0017479615738957e-07, "loss": 1.553, "step": 372 }, { "epoch": 0.9588688946015425, "grad_norm": 0.0, "learning_rate": 8.875310392079118e-08, "loss": 1.5125, "step": 373 }, { "epoch": 0.961439588688946, "grad_norm": 0.0, "learning_rate": 7.801965630651165e-08, "loss": 1.4321, "step": 374 }, { "epoch": 0.9640102827763496, "grad_norm": 0.0, "learning_rate": 6.797519865342161e-08, "loss": 1.5005, "step": 375 }, { "epoch": 0.9665809768637532, "grad_norm": 0.0, "learning_rate": 5.862042845640403e-08, "loss": 1.4973, "step": 376 }, { "epoch": 0.9691516709511568, "grad_norm": 0.0, "learning_rate": 4.9955995317908514e-08, "loss": 1.5449, "step": 377 }, { "epoch": 0.9717223650385605, "grad_norm": 0.0, "learning_rate": 4.198250090284961e-08, "loss": 1.4795, "step": 378 }, { "epoch": 0.974293059125964, "grad_norm": 0.0, "learning_rate": 3.47004988968247e-08, "loss": 1.5508, "step": 379 }, { "epoch": 0.9768637532133676, "grad_norm": 0.0, "learning_rate": 2.8110494967664713e-08, "loss": 1.5095, "step": 380 }, { "epoch": 0.9794344473007712, "grad_norm": 0.0, "learning_rate": 2.221294673032004e-08, "loss": 1.5146, "step": 381 }, { "epoch": 0.9820051413881749, "grad_norm": 0.0, "learning_rate": 1.7008263715085904e-08, "loss": 1.5112, "step": 382 }, { "epoch": 0.9845758354755784, "grad_norm": 0.0, "learning_rate": 1.24968073391607e-08, "loss": 1.5144, "step": 383 }, { "epoch": 0.987146529562982, "grad_norm": 0.0, "learning_rate": 8.678890881552715e-09, "loss": 1.5459, "step": 384 }, { "epoch": 0.9897172236503856, "grad_norm": 0.0, "learning_rate": 5.554779461323101e-09, "loss": 1.4885, "step": 385 }, { "epoch": 0.9922879177377892, "grad_norm": 0.0, "learning_rate": 3.1246900191761463e-09, "loss": 1.4919, "step": 386 }, { "epoch": 0.9948586118251928, "grad_norm": 0.0, "learning_rate": 1.3887913023946652e-09, "loss": 1.5034, "step": 387 }, { "epoch": 0.9974293059125964, "grad_norm": 0.0, "learning_rate": 3.4720385312492223e-10, "loss": 1.4812, "step": 388 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 1.3696, "step": 389 }, { "epoch": 1.0, "step": 389, "total_flos": 1.4102482311698186e+18, "train_loss": 1.5950692380302056, "train_runtime": 5789.3639, "train_samples_per_second": 17.167, "train_steps_per_second": 0.067 } ], "logging_steps": 1.0, "max_steps": 389, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4102482311698186e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }