{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 939, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003194888178913738, "grad_norm": 6.409946841419442, "learning_rate": 4.2553191489361704e-07, "loss": 0.9383, "step": 1 }, { "epoch": 0.006389776357827476, "grad_norm": 5.883137519343072, "learning_rate": 8.510638297872341e-07, "loss": 0.8501, "step": 2 }, { "epoch": 0.009584664536741214, "grad_norm": 5.946755544649649, "learning_rate": 1.276595744680851e-06, "loss": 0.887, "step": 3 }, { "epoch": 0.012779552715654952, "grad_norm": 5.642225969670194, "learning_rate": 1.7021276595744682e-06, "loss": 0.8099, "step": 4 }, { "epoch": 0.01597444089456869, "grad_norm": 5.891932559187005, "learning_rate": 2.1276595744680853e-06, "loss": 0.876, "step": 5 }, { "epoch": 0.019169329073482427, "grad_norm": 4.9025637632451495, "learning_rate": 2.553191489361702e-06, "loss": 0.8158, "step": 6 }, { "epoch": 0.022364217252396165, "grad_norm": 4.57848090524475, "learning_rate": 2.978723404255319e-06, "loss": 0.8368, "step": 7 }, { "epoch": 0.025559105431309903, "grad_norm": 2.658016400637032, "learning_rate": 3.4042553191489363e-06, "loss": 0.7747, "step": 8 }, { "epoch": 0.02875399361022364, "grad_norm": 2.4579449141086083, "learning_rate": 3.8297872340425535e-06, "loss": 0.7551, "step": 9 }, { "epoch": 0.03194888178913738, "grad_norm": 2.072756095086418, "learning_rate": 4.255319148936171e-06, "loss": 0.7211, "step": 10 }, { "epoch": 0.03514376996805112, "grad_norm": 3.5764214287428824, "learning_rate": 4.680851063829788e-06, "loss": 0.7667, "step": 11 }, { "epoch": 0.038338658146964855, "grad_norm": 4.214989656079249, "learning_rate": 5.106382978723404e-06, "loss": 0.8135, "step": 12 }, { "epoch": 0.04153354632587859, "grad_norm": 3.695642879844999, "learning_rate": 5.531914893617022e-06, "loss": 0.738, "step": 13 }, { "epoch": 0.04472843450479233, "grad_norm": 4.169046468693503, "learning_rate": 5.957446808510638e-06, "loss": 0.7383, "step": 14 }, { "epoch": 0.04792332268370607, "grad_norm": 3.911669767568645, "learning_rate": 6.382978723404256e-06, "loss": 0.6796, "step": 15 }, { "epoch": 0.051118210862619806, "grad_norm": 3.3181523814007345, "learning_rate": 6.808510638297873e-06, "loss": 0.6902, "step": 16 }, { "epoch": 0.054313099041533544, "grad_norm": 2.287624393972901, "learning_rate": 7.234042553191491e-06, "loss": 0.6386, "step": 17 }, { "epoch": 0.05750798722044728, "grad_norm": 1.9124190484261288, "learning_rate": 7.659574468085107e-06, "loss": 0.6893, "step": 18 }, { "epoch": 0.06070287539936102, "grad_norm": 1.8739939120437843, "learning_rate": 8.085106382978723e-06, "loss": 0.6776, "step": 19 }, { "epoch": 0.06389776357827476, "grad_norm": 2.1937664044946783, "learning_rate": 8.510638297872341e-06, "loss": 0.6205, "step": 20 }, { "epoch": 0.0670926517571885, "grad_norm": 2.2715620880143916, "learning_rate": 8.936170212765958e-06, "loss": 0.6056, "step": 21 }, { "epoch": 0.07028753993610223, "grad_norm": 1.6081424374861002, "learning_rate": 9.361702127659576e-06, "loss": 0.555, "step": 22 }, { "epoch": 0.07348242811501597, "grad_norm": 1.763358212351945, "learning_rate": 9.787234042553192e-06, "loss": 0.6666, "step": 23 }, { "epoch": 0.07667731629392971, "grad_norm": 1.4811530280287506, "learning_rate": 1.0212765957446808e-05, "loss": 0.6504, "step": 24 }, { "epoch": 0.07987220447284345, "grad_norm": 1.3405676738844483, "learning_rate": 1.0638297872340426e-05, "loss": 0.5772, "step": 25 }, { "epoch": 0.08306709265175719, "grad_norm": 1.2991745234008487, "learning_rate": 1.1063829787234044e-05, "loss": 0.6445, "step": 26 }, { "epoch": 0.08626198083067092, "grad_norm": 1.0505598061019923, "learning_rate": 1.1489361702127662e-05, "loss": 0.5923, "step": 27 }, { "epoch": 0.08945686900958466, "grad_norm": 1.103208098271378, "learning_rate": 1.1914893617021277e-05, "loss": 0.6241, "step": 28 }, { "epoch": 0.0926517571884984, "grad_norm": 1.2201972114638524, "learning_rate": 1.2340425531914895e-05, "loss": 0.6283, "step": 29 }, { "epoch": 0.09584664536741214, "grad_norm": 1.1112064502121644, "learning_rate": 1.2765957446808513e-05, "loss": 0.6223, "step": 30 }, { "epoch": 0.09904153354632587, "grad_norm": 0.9923339302816265, "learning_rate": 1.3191489361702127e-05, "loss": 0.5931, "step": 31 }, { "epoch": 0.10223642172523961, "grad_norm": 1.0477203467646528, "learning_rate": 1.3617021276595745e-05, "loss": 0.6182, "step": 32 }, { "epoch": 0.10543130990415335, "grad_norm": 1.1019257759272982, "learning_rate": 1.4042553191489363e-05, "loss": 0.5675, "step": 33 }, { "epoch": 0.10862619808306709, "grad_norm": 1.0627070873952427, "learning_rate": 1.4468085106382981e-05, "loss": 0.596, "step": 34 }, { "epoch": 0.11182108626198083, "grad_norm": 1.1260018768412945, "learning_rate": 1.4893617021276596e-05, "loss": 0.5888, "step": 35 }, { "epoch": 0.11501597444089456, "grad_norm": 1.0131578830510992, "learning_rate": 1.5319148936170214e-05, "loss": 0.53, "step": 36 }, { "epoch": 0.1182108626198083, "grad_norm": 0.9462086601742236, "learning_rate": 1.5744680851063832e-05, "loss": 0.5871, "step": 37 }, { "epoch": 0.12140575079872204, "grad_norm": 1.089392870614615, "learning_rate": 1.6170212765957446e-05, "loss": 0.5879, "step": 38 }, { "epoch": 0.12460063897763578, "grad_norm": 0.9813042474086496, "learning_rate": 1.6595744680851064e-05, "loss": 0.5539, "step": 39 }, { "epoch": 0.12779552715654952, "grad_norm": 0.8478222123412902, "learning_rate": 1.7021276595744682e-05, "loss": 0.5272, "step": 40 }, { "epoch": 0.13099041533546327, "grad_norm": 0.9509316220604798, "learning_rate": 1.74468085106383e-05, "loss": 0.5524, "step": 41 }, { "epoch": 0.134185303514377, "grad_norm": 0.9910817013897578, "learning_rate": 1.7872340425531915e-05, "loss": 0.5781, "step": 42 }, { "epoch": 0.13738019169329074, "grad_norm": 1.1162723782576445, "learning_rate": 1.8297872340425533e-05, "loss": 0.57, "step": 43 }, { "epoch": 0.14057507987220447, "grad_norm": 0.9939969861590426, "learning_rate": 1.872340425531915e-05, "loss": 0.5459, "step": 44 }, { "epoch": 0.14376996805111822, "grad_norm": 1.2439803148930444, "learning_rate": 1.914893617021277e-05, "loss": 0.5854, "step": 45 }, { "epoch": 0.14696485623003194, "grad_norm": 1.2941430779218954, "learning_rate": 1.9574468085106384e-05, "loss": 0.5667, "step": 46 }, { "epoch": 0.1501597444089457, "grad_norm": 1.062058714539539, "learning_rate": 2e-05, "loss": 0.6011, "step": 47 }, { "epoch": 0.15335463258785942, "grad_norm": 1.0851192274830292, "learning_rate": 2.0425531914893616e-05, "loss": 0.58, "step": 48 }, { "epoch": 0.15654952076677317, "grad_norm": 1.0375577575144754, "learning_rate": 2.0851063829787238e-05, "loss": 0.5729, "step": 49 }, { "epoch": 0.1597444089456869, "grad_norm": 1.147815879136087, "learning_rate": 2.1276595744680852e-05, "loss": 0.5442, "step": 50 }, { "epoch": 0.16293929712460065, "grad_norm": 1.0221514663668887, "learning_rate": 2.1702127659574467e-05, "loss": 0.5857, "step": 51 }, { "epoch": 0.16613418530351437, "grad_norm": 1.0076524214922307, "learning_rate": 2.2127659574468088e-05, "loss": 0.6012, "step": 52 }, { "epoch": 0.16932907348242812, "grad_norm": 0.9868930985691252, "learning_rate": 2.2553191489361703e-05, "loss": 0.5524, "step": 53 }, { "epoch": 0.17252396166134185, "grad_norm": 0.8832842960378782, "learning_rate": 2.2978723404255324e-05, "loss": 0.5442, "step": 54 }, { "epoch": 0.1757188498402556, "grad_norm": 0.8366367306264574, "learning_rate": 2.340425531914894e-05, "loss": 0.5611, "step": 55 }, { "epoch": 0.17891373801916932, "grad_norm": 1.0326981018410393, "learning_rate": 2.3829787234042553e-05, "loss": 0.5304, "step": 56 }, { "epoch": 0.18210862619808307, "grad_norm": 0.9142673475609777, "learning_rate": 2.4255319148936175e-05, "loss": 0.5587, "step": 57 }, { "epoch": 0.1853035143769968, "grad_norm": 1.056499453713147, "learning_rate": 2.468085106382979e-05, "loss": 0.5374, "step": 58 }, { "epoch": 0.18849840255591055, "grad_norm": 1.0459906767360574, "learning_rate": 2.5106382978723404e-05, "loss": 0.5635, "step": 59 }, { "epoch": 0.19169329073482427, "grad_norm": 1.0845897272985423, "learning_rate": 2.5531914893617025e-05, "loss": 0.5536, "step": 60 }, { "epoch": 0.19488817891373802, "grad_norm": 1.2724090682106446, "learning_rate": 2.595744680851064e-05, "loss": 0.5549, "step": 61 }, { "epoch": 0.19808306709265175, "grad_norm": 0.9769443793575512, "learning_rate": 2.6382978723404255e-05, "loss": 0.5142, "step": 62 }, { "epoch": 0.2012779552715655, "grad_norm": 1.0004128504205176, "learning_rate": 2.6808510638297876e-05, "loss": 0.5533, "step": 63 }, { "epoch": 0.20447284345047922, "grad_norm": 1.2165255582257546, "learning_rate": 2.723404255319149e-05, "loss": 0.5484, "step": 64 }, { "epoch": 0.20766773162939298, "grad_norm": 1.1589649633600527, "learning_rate": 2.7659574468085112e-05, "loss": 0.5833, "step": 65 }, { "epoch": 0.2108626198083067, "grad_norm": 0.8637922666258719, "learning_rate": 2.8085106382978727e-05, "loss": 0.5216, "step": 66 }, { "epoch": 0.21405750798722045, "grad_norm": 1.3422386117294691, "learning_rate": 2.851063829787234e-05, "loss": 0.5304, "step": 67 }, { "epoch": 0.21725239616613418, "grad_norm": 1.0665495345944047, "learning_rate": 2.8936170212765963e-05, "loss": 0.5217, "step": 68 }, { "epoch": 0.22044728434504793, "grad_norm": 1.1974304023992344, "learning_rate": 2.9361702127659577e-05, "loss": 0.5758, "step": 69 }, { "epoch": 0.22364217252396165, "grad_norm": 1.055001246229523, "learning_rate": 2.9787234042553192e-05, "loss": 0.5959, "step": 70 }, { "epoch": 0.2268370607028754, "grad_norm": 0.9295929863968017, "learning_rate": 3.0212765957446813e-05, "loss": 0.5113, "step": 71 }, { "epoch": 0.23003194888178913, "grad_norm": 1.0895682011148657, "learning_rate": 3.063829787234043e-05, "loss": 0.5343, "step": 72 }, { "epoch": 0.23322683706070288, "grad_norm": 1.0443823721191292, "learning_rate": 3.1063829787234046e-05, "loss": 0.5368, "step": 73 }, { "epoch": 0.2364217252396166, "grad_norm": 1.0778250715462803, "learning_rate": 3.1489361702127664e-05, "loss": 0.5187, "step": 74 }, { "epoch": 0.23961661341853036, "grad_norm": 1.1417565226413213, "learning_rate": 3.191489361702128e-05, "loss": 0.5248, "step": 75 }, { "epoch": 0.24281150159744408, "grad_norm": 1.1170007131913982, "learning_rate": 3.234042553191489e-05, "loss": 0.547, "step": 76 }, { "epoch": 0.24600638977635783, "grad_norm": 1.1070104668983525, "learning_rate": 3.276595744680851e-05, "loss": 0.4987, "step": 77 }, { "epoch": 0.24920127795527156, "grad_norm": 1.2780289742174922, "learning_rate": 3.319148936170213e-05, "loss": 0.5372, "step": 78 }, { "epoch": 0.2523961661341853, "grad_norm": 1.1736505513452447, "learning_rate": 3.361702127659575e-05, "loss": 0.5534, "step": 79 }, { "epoch": 0.25559105431309903, "grad_norm": 0.9677573531833106, "learning_rate": 3.4042553191489365e-05, "loss": 0.5124, "step": 80 }, { "epoch": 0.25878594249201275, "grad_norm": 1.257228321448476, "learning_rate": 3.446808510638298e-05, "loss": 0.5622, "step": 81 }, { "epoch": 0.26198083067092653, "grad_norm": 1.0555331388479319, "learning_rate": 3.48936170212766e-05, "loss": 0.5765, "step": 82 }, { "epoch": 0.26517571884984026, "grad_norm": 1.0776914247686873, "learning_rate": 3.531914893617022e-05, "loss": 0.5258, "step": 83 }, { "epoch": 0.268370607028754, "grad_norm": 1.0056240006776036, "learning_rate": 3.574468085106383e-05, "loss": 0.4866, "step": 84 }, { "epoch": 0.2715654952076677, "grad_norm": 1.1298508994667116, "learning_rate": 3.617021276595745e-05, "loss": 0.5806, "step": 85 }, { "epoch": 0.2747603833865815, "grad_norm": 1.0948113866316362, "learning_rate": 3.6595744680851066e-05, "loss": 0.5617, "step": 86 }, { "epoch": 0.2779552715654952, "grad_norm": 1.0650985230812091, "learning_rate": 3.7021276595744684e-05, "loss": 0.5656, "step": 87 }, { "epoch": 0.28115015974440893, "grad_norm": 1.0504970870871342, "learning_rate": 3.74468085106383e-05, "loss": 0.5978, "step": 88 }, { "epoch": 0.28434504792332266, "grad_norm": 0.8855494930537949, "learning_rate": 3.787234042553192e-05, "loss": 0.4934, "step": 89 }, { "epoch": 0.28753993610223644, "grad_norm": 1.030400345834846, "learning_rate": 3.829787234042554e-05, "loss": 0.5292, "step": 90 }, { "epoch": 0.29073482428115016, "grad_norm": 1.2762844738209218, "learning_rate": 3.872340425531915e-05, "loss": 0.5444, "step": 91 }, { "epoch": 0.2939297124600639, "grad_norm": 1.126915075174302, "learning_rate": 3.914893617021277e-05, "loss": 0.5111, "step": 92 }, { "epoch": 0.2971246006389776, "grad_norm": 1.4237336432448202, "learning_rate": 3.9574468085106385e-05, "loss": 0.5495, "step": 93 }, { "epoch": 0.3003194888178914, "grad_norm": 1.162381948355831, "learning_rate": 4e-05, "loss": 0.5999, "step": 94 }, { "epoch": 0.3035143769968051, "grad_norm": 1.3353166705017592, "learning_rate": 3.999986177524551e-05, "loss": 0.5371, "step": 95 }, { "epoch": 0.30670926517571884, "grad_norm": 1.1533614278616147, "learning_rate": 3.999944710289265e-05, "loss": 0.5636, "step": 96 }, { "epoch": 0.30990415335463256, "grad_norm": 1.2821208981923682, "learning_rate": 3.9998755988673205e-05, "loss": 0.5319, "step": 97 }, { "epoch": 0.31309904153354634, "grad_norm": 1.3064500394077125, "learning_rate": 3.9997788442140105e-05, "loss": 0.5626, "step": 98 }, { "epoch": 0.31629392971246006, "grad_norm": 1.1664124835581022, "learning_rate": 3.999654447666721e-05, "loss": 0.5379, "step": 99 }, { "epoch": 0.3194888178913738, "grad_norm": 1.1049186599225438, "learning_rate": 3.999502410944923e-05, "loss": 0.5968, "step": 100 }, { "epoch": 0.3226837060702875, "grad_norm": 0.9776199567560799, "learning_rate": 3.99932273615014e-05, "loss": 0.5493, "step": 101 }, { "epoch": 0.3258785942492013, "grad_norm": 1.082662657590535, "learning_rate": 3.99911542576592e-05, "loss": 0.582, "step": 102 }, { "epoch": 0.329073482428115, "grad_norm": 1.0520705991247479, "learning_rate": 3.998880482657809e-05, "loss": 0.5239, "step": 103 }, { "epoch": 0.33226837060702874, "grad_norm": 1.0198152234253615, "learning_rate": 3.9986179100733e-05, "loss": 0.5562, "step": 104 }, { "epoch": 0.3354632587859425, "grad_norm": 1.1581233439140284, "learning_rate": 3.9983277116417974e-05, "loss": 0.5199, "step": 105 }, { "epoch": 0.33865814696485624, "grad_norm": 1.2617196945442968, "learning_rate": 3.998009891374561e-05, "loss": 0.5158, "step": 106 }, { "epoch": 0.34185303514376997, "grad_norm": 1.233474385279073, "learning_rate": 3.997664453664654e-05, "loss": 0.5796, "step": 107 }, { "epoch": 0.3450479233226837, "grad_norm": 1.383774855291399, "learning_rate": 3.9972914032868805e-05, "loss": 0.5355, "step": 108 }, { "epoch": 0.34824281150159747, "grad_norm": 1.2889281329856341, "learning_rate": 3.99689074539772e-05, "loss": 0.5459, "step": 109 }, { "epoch": 0.3514376996805112, "grad_norm": 0.8238497044674719, "learning_rate": 3.996462485535257e-05, "loss": 0.5503, "step": 110 }, { "epoch": 0.3546325878594249, "grad_norm": 1.4807124856342684, "learning_rate": 3.996006629619103e-05, "loss": 0.6508, "step": 111 }, { "epoch": 0.35782747603833864, "grad_norm": 1.4078520868755624, "learning_rate": 3.995523183950314e-05, "loss": 0.6093, "step": 112 }, { "epoch": 0.3610223642172524, "grad_norm": 0.9015312070363262, "learning_rate": 3.9950121552113076e-05, "loss": 0.5879, "step": 113 }, { "epoch": 0.36421725239616615, "grad_norm": 1.386297055498617, "learning_rate": 3.994473550465765e-05, "loss": 0.6371, "step": 114 }, { "epoch": 0.36741214057507987, "grad_norm": 1.186356563974775, "learning_rate": 3.993907377158537e-05, "loss": 0.5058, "step": 115 }, { "epoch": 0.3706070287539936, "grad_norm": 1.046808813203276, "learning_rate": 3.993313643115541e-05, "loss": 0.5475, "step": 116 }, { "epoch": 0.3738019169329074, "grad_norm": 1.1184791458413588, "learning_rate": 3.992692356543649e-05, "loss": 0.5131, "step": 117 }, { "epoch": 0.3769968051118211, "grad_norm": 0.9927859768100731, "learning_rate": 3.992043526030582e-05, "loss": 0.5351, "step": 118 }, { "epoch": 0.3801916932907348, "grad_norm": 0.7194994252219997, "learning_rate": 3.991367160544783e-05, "loss": 0.5241, "step": 119 }, { "epoch": 0.38338658146964855, "grad_norm": 1.1103180710410239, "learning_rate": 3.990663269435298e-05, "loss": 0.5341, "step": 120 }, { "epoch": 0.3865814696485623, "grad_norm": 0.891726161610439, "learning_rate": 3.9899318624316424e-05, "loss": 0.56, "step": 121 }, { "epoch": 0.38977635782747605, "grad_norm": 0.9833837615596567, "learning_rate": 3.9891729496436736e-05, "loss": 0.5275, "step": 122 }, { "epoch": 0.3929712460063898, "grad_norm": 1.0423973561059756, "learning_rate": 3.988386541561444e-05, "loss": 0.5726, "step": 123 }, { "epoch": 0.3961661341853035, "grad_norm": 1.051551443360772, "learning_rate": 3.9875726490550606e-05, "loss": 0.5689, "step": 124 }, { "epoch": 0.3993610223642173, "grad_norm": 1.0379195132528112, "learning_rate": 3.986731283374533e-05, "loss": 0.5532, "step": 125 }, { "epoch": 0.402555910543131, "grad_norm": 1.066307983698186, "learning_rate": 3.985862456149616e-05, "loss": 0.6265, "step": 126 }, { "epoch": 0.4057507987220447, "grad_norm": 1.0375163774393865, "learning_rate": 3.9849661793896537e-05, "loss": 0.573, "step": 127 }, { "epoch": 0.40894568690095845, "grad_norm": 0.8956353921945285, "learning_rate": 3.984042465483409e-05, "loss": 0.5177, "step": 128 }, { "epoch": 0.41214057507987223, "grad_norm": 1.079166565420564, "learning_rate": 3.983091327198896e-05, "loss": 0.5476, "step": 129 }, { "epoch": 0.41533546325878595, "grad_norm": 0.9871062497017694, "learning_rate": 3.982112777683199e-05, "loss": 0.5868, "step": 130 }, { "epoch": 0.4185303514376997, "grad_norm": 1.1607498270093395, "learning_rate": 3.981106830462296e-05, "loss": 0.5829, "step": 131 }, { "epoch": 0.4217252396166134, "grad_norm": 0.938424834465439, "learning_rate": 3.9800734994408657e-05, "loss": 0.5233, "step": 132 }, { "epoch": 0.4249201277955272, "grad_norm": 0.9723972932143454, "learning_rate": 3.9790127989021024e-05, "loss": 0.5452, "step": 133 }, { "epoch": 0.4281150159744409, "grad_norm": 1.1133970630266223, "learning_rate": 3.977924743507513e-05, "loss": 0.6125, "step": 134 }, { "epoch": 0.43130990415335463, "grad_norm": 0.9887899047167236, "learning_rate": 3.976809348296716e-05, "loss": 0.5532, "step": 135 }, { "epoch": 0.43450479233226835, "grad_norm": 1.0482567177622366, "learning_rate": 3.9756666286872345e-05, "loss": 0.5469, "step": 136 }, { "epoch": 0.43769968051118213, "grad_norm": 0.9896665341430675, "learning_rate": 3.974496600474282e-05, "loss": 0.5496, "step": 137 }, { "epoch": 0.44089456869009586, "grad_norm": 1.1889549086086342, "learning_rate": 3.9732992798305465e-05, "loss": 0.5767, "step": 138 }, { "epoch": 0.4440894568690096, "grad_norm": 0.9876667745869144, "learning_rate": 3.972074683305961e-05, "loss": 0.5746, "step": 139 }, { "epoch": 0.4472843450479233, "grad_norm": 1.0145350383168896, "learning_rate": 3.9708228278274816e-05, "loss": 0.5332, "step": 140 }, { "epoch": 0.4504792332268371, "grad_norm": 0.898832690907024, "learning_rate": 3.96954373069885e-05, "loss": 0.6213, "step": 141 }, { "epoch": 0.4536741214057508, "grad_norm": 0.9695396273633855, "learning_rate": 3.968237409600355e-05, "loss": 0.5389, "step": 142 }, { "epoch": 0.45686900958466453, "grad_norm": 0.8686445117159894, "learning_rate": 3.9669038825885875e-05, "loss": 0.5514, "step": 143 }, { "epoch": 0.46006389776357826, "grad_norm": 0.9730958936696619, "learning_rate": 3.9655431680961924e-05, "loss": 0.5342, "step": 144 }, { "epoch": 0.46325878594249204, "grad_norm": 1.085850372058602, "learning_rate": 3.964155284931612e-05, "loss": 0.5637, "step": 145 }, { "epoch": 0.46645367412140576, "grad_norm": 0.8490885837637923, "learning_rate": 3.962740252278827e-05, "loss": 0.5432, "step": 146 }, { "epoch": 0.4696485623003195, "grad_norm": 1.208209634263402, "learning_rate": 3.961298089697093e-05, "loss": 0.5406, "step": 147 }, { "epoch": 0.4728434504792332, "grad_norm": 1.2369537682088747, "learning_rate": 3.959828817120665e-05, "loss": 0.5871, "step": 148 }, { "epoch": 0.476038338658147, "grad_norm": 1.0963511451666044, "learning_rate": 3.9583324548585276e-05, "loss": 0.5965, "step": 149 }, { "epoch": 0.4792332268370607, "grad_norm": 1.1210854346222001, "learning_rate": 3.956809023594112e-05, "loss": 0.4888, "step": 150 }, { "epoch": 0.48242811501597443, "grad_norm": 1.10943858725056, "learning_rate": 3.955258544385009e-05, "loss": 0.568, "step": 151 }, { "epoch": 0.48562300319488816, "grad_norm": 0.9114400050058079, "learning_rate": 3.95368103866268e-05, "loss": 0.5329, "step": 152 }, { "epoch": 0.48881789137380194, "grad_norm": 1.2588910487562754, "learning_rate": 3.9520765282321584e-05, "loss": 0.5458, "step": 153 }, { "epoch": 0.49201277955271566, "grad_norm": 1.161279475928506, "learning_rate": 3.9504450352717514e-05, "loss": 0.534, "step": 154 }, { "epoch": 0.4952076677316294, "grad_norm": 1.1666419752273152, "learning_rate": 3.948786582332728e-05, "loss": 0.5449, "step": 155 }, { "epoch": 0.4984025559105431, "grad_norm": 1.2141093053662426, "learning_rate": 3.947101192339016e-05, "loss": 0.5761, "step": 156 }, { "epoch": 0.5015974440894568, "grad_norm": 0.9508186492271605, "learning_rate": 3.9453888885868756e-05, "loss": 0.5019, "step": 157 }, { "epoch": 0.5047923322683706, "grad_norm": 1.0363555761012373, "learning_rate": 3.943649694744584e-05, "loss": 0.5156, "step": 158 }, { "epoch": 0.5079872204472844, "grad_norm": 1.0787946362260752, "learning_rate": 3.9418836348521045e-05, "loss": 0.5405, "step": 159 }, { "epoch": 0.5111821086261981, "grad_norm": 0.9707452122472157, "learning_rate": 3.940090733320757e-05, "loss": 0.5227, "step": 160 }, { "epoch": 0.5143769968051118, "grad_norm": 0.8852169354166731, "learning_rate": 3.93827101493288e-05, "loss": 0.5471, "step": 161 }, { "epoch": 0.5175718849840255, "grad_norm": 1.0544907272347088, "learning_rate": 3.936424504841485e-05, "loss": 0.5495, "step": 162 }, { "epoch": 0.5207667731629393, "grad_norm": 0.8460840865557951, "learning_rate": 3.934551228569913e-05, "loss": 0.5919, "step": 163 }, { "epoch": 0.5239616613418531, "grad_norm": 1.3423757776337415, "learning_rate": 3.932651212011479e-05, "loss": 0.5745, "step": 164 }, { "epoch": 0.5271565495207667, "grad_norm": 0.8367417092044956, "learning_rate": 3.930724481429114e-05, "loss": 0.5582, "step": 165 }, { "epoch": 0.5303514376996805, "grad_norm": 0.8162694542335893, "learning_rate": 3.928771063455007e-05, "loss": 0.5382, "step": 166 }, { "epoch": 0.5335463258785943, "grad_norm": 1.055654119658905, "learning_rate": 3.926790985090228e-05, "loss": 0.5127, "step": 167 }, { "epoch": 0.536741214057508, "grad_norm": 0.9342358155994782, "learning_rate": 3.924784273704363e-05, "loss": 0.5133, "step": 168 }, { "epoch": 0.5399361022364217, "grad_norm": 0.8868699043103929, "learning_rate": 3.922750957035128e-05, "loss": 0.5757, "step": 169 }, { "epoch": 0.5431309904153354, "grad_norm": 0.9258969627974063, "learning_rate": 3.920691063187995e-05, "loss": 0.5588, "step": 170 }, { "epoch": 0.5463258785942492, "grad_norm": 0.9405026305900314, "learning_rate": 3.918604620635797e-05, "loss": 0.5652, "step": 171 }, { "epoch": 0.549520766773163, "grad_norm": 0.8161599160408928, "learning_rate": 3.916491658218333e-05, "loss": 0.5586, "step": 172 }, { "epoch": 0.5527156549520766, "grad_norm": 0.9068770118413155, "learning_rate": 3.914352205141975e-05, "loss": 0.566, "step": 173 }, { "epoch": 0.5559105431309904, "grad_norm": 0.9468726642569554, "learning_rate": 3.91218629097926e-05, "loss": 0.6058, "step": 174 }, { "epoch": 0.5591054313099042, "grad_norm": 0.8130418886800949, "learning_rate": 3.909993945668484e-05, "loss": 0.5453, "step": 175 }, { "epoch": 0.5623003194888179, "grad_norm": 0.909689386530599, "learning_rate": 3.907775199513286e-05, "loss": 0.5348, "step": 176 }, { "epoch": 0.5654952076677316, "grad_norm": 0.9448685038012304, "learning_rate": 3.905530083182231e-05, "loss": 0.5615, "step": 177 }, { "epoch": 0.5686900958466453, "grad_norm": 0.8887839168967605, "learning_rate": 3.903258627708383e-05, "loss": 0.5318, "step": 178 }, { "epoch": 0.5718849840255591, "grad_norm": 0.8148539808805777, "learning_rate": 3.90096086448888e-05, "loss": 0.531, "step": 179 }, { "epoch": 0.5750798722044729, "grad_norm": 0.9702327272254745, "learning_rate": 3.898636825284499e-05, "loss": 0.5679, "step": 180 }, { "epoch": 0.5782747603833865, "grad_norm": 0.9398312312400305, "learning_rate": 3.896286542219212e-05, "loss": 0.566, "step": 181 }, { "epoch": 0.5814696485623003, "grad_norm": 0.986760124602772, "learning_rate": 3.893910047779752e-05, "loss": 0.5536, "step": 182 }, { "epoch": 0.5846645367412141, "grad_norm": 0.7553011510468522, "learning_rate": 3.891507374815153e-05, "loss": 0.5491, "step": 183 }, { "epoch": 0.5878594249201278, "grad_norm": 0.7490780167276647, "learning_rate": 3.8890785565363046e-05, "loss": 0.5278, "step": 184 }, { "epoch": 0.5910543130990416, "grad_norm": 0.927901642878805, "learning_rate": 3.8866236265154864e-05, "loss": 0.5493, "step": 185 }, { "epoch": 0.5942492012779552, "grad_norm": 0.9377247340300795, "learning_rate": 3.8841426186859095e-05, "loss": 0.5118, "step": 186 }, { "epoch": 0.597444089456869, "grad_norm": 0.7685781592155283, "learning_rate": 3.881635567341243e-05, "loss": 0.5213, "step": 187 }, { "epoch": 0.6006389776357828, "grad_norm": 0.9161772600103768, "learning_rate": 3.879102507135142e-05, "loss": 0.528, "step": 188 }, { "epoch": 0.6038338658146964, "grad_norm": 0.7614758217612799, "learning_rate": 3.876543473080771e-05, "loss": 0.5121, "step": 189 }, { "epoch": 0.6070287539936102, "grad_norm": 1.0241100063473434, "learning_rate": 3.8739585005503136e-05, "loss": 0.5557, "step": 190 }, { "epoch": 0.610223642172524, "grad_norm": 0.8917603175893786, "learning_rate": 3.8713476252744896e-05, "loss": 0.5033, "step": 191 }, { "epoch": 0.6134185303514377, "grad_norm": 0.8866544366619419, "learning_rate": 3.8687108833420585e-05, "loss": 0.5459, "step": 192 }, { "epoch": 0.6166134185303515, "grad_norm": 0.8490994667110613, "learning_rate": 3.866048311199321e-05, "loss": 0.5761, "step": 193 }, { "epoch": 0.6198083067092651, "grad_norm": 1.0117997679265072, "learning_rate": 3.863359945649615e-05, "loss": 0.5597, "step": 194 }, { "epoch": 0.6230031948881789, "grad_norm": 0.8115827675976007, "learning_rate": 3.860645823852808e-05, "loss": 0.5415, "step": 195 }, { "epoch": 0.6261980830670927, "grad_norm": 1.0259667321397283, "learning_rate": 3.85790598332478e-05, "loss": 0.5435, "step": 196 }, { "epoch": 0.6293929712460063, "grad_norm": 0.7714558957563437, "learning_rate": 3.8551404619369115e-05, "loss": 0.5485, "step": 197 }, { "epoch": 0.6325878594249201, "grad_norm": 0.8744712822558212, "learning_rate": 3.8523492979155534e-05, "loss": 0.5025, "step": 198 }, { "epoch": 0.6357827476038339, "grad_norm": 0.9290898869366451, "learning_rate": 3.849532529841502e-05, "loss": 0.5205, "step": 199 }, { "epoch": 0.6389776357827476, "grad_norm": 0.9659909853502232, "learning_rate": 3.846690196649464e-05, "loss": 0.5074, "step": 200 }, { "epoch": 0.6421725239616614, "grad_norm": 0.9633294322023886, "learning_rate": 3.84382233762752e-05, "loss": 0.5504, "step": 201 }, { "epoch": 0.645367412140575, "grad_norm": 1.0208058978197756, "learning_rate": 3.840928992416583e-05, "loss": 0.5683, "step": 202 }, { "epoch": 0.6485623003194888, "grad_norm": 0.7720130021877678, "learning_rate": 3.8380102010098436e-05, "loss": 0.5101, "step": 203 }, { "epoch": 0.6517571884984026, "grad_norm": 0.9815422862820409, "learning_rate": 3.835066003752226e-05, "loss": 0.564, "step": 204 }, { "epoch": 0.6549520766773163, "grad_norm": 0.8106906464515937, "learning_rate": 3.832096441339825e-05, "loss": 0.5227, "step": 205 }, { "epoch": 0.65814696485623, "grad_norm": 0.9833175984925693, "learning_rate": 3.829101554819341e-05, "loss": 0.5564, "step": 206 }, { "epoch": 0.6613418530351438, "grad_norm": 0.74809638164356, "learning_rate": 3.826081385587523e-05, "loss": 0.5172, "step": 207 }, { "epoch": 0.6645367412140575, "grad_norm": 0.8646454866233487, "learning_rate": 3.823035975390585e-05, "loss": 0.5888, "step": 208 }, { "epoch": 0.6677316293929713, "grad_norm": 0.8049832826819769, "learning_rate": 3.8199653663236336e-05, "loss": 0.5792, "step": 209 }, { "epoch": 0.670926517571885, "grad_norm": 0.7543833230398367, "learning_rate": 3.8168696008300884e-05, "loss": 0.5196, "step": 210 }, { "epoch": 0.6741214057507987, "grad_norm": 0.9148118195164756, "learning_rate": 3.813748721701091e-05, "loss": 0.5444, "step": 211 }, { "epoch": 0.6773162939297125, "grad_norm": 0.7522832027542112, "learning_rate": 3.8106027720749176e-05, "loss": 0.5673, "step": 212 }, { "epoch": 0.6805111821086262, "grad_norm": 0.8180964056504714, "learning_rate": 3.807431795436379e-05, "loss": 0.5756, "step": 213 }, { "epoch": 0.6837060702875399, "grad_norm": 0.9161522568428497, "learning_rate": 3.8042358356162215e-05, "loss": 0.5901, "step": 214 }, { "epoch": 0.6869009584664537, "grad_norm": 0.943077264357207, "learning_rate": 3.801014936790522e-05, "loss": 0.4931, "step": 215 }, { "epoch": 0.6900958466453674, "grad_norm": 0.7950064346283184, "learning_rate": 3.797769143480075e-05, "loss": 0.5441, "step": 216 }, { "epoch": 0.6932907348242812, "grad_norm": 0.9413760937926158, "learning_rate": 3.79449850054978e-05, "loss": 0.5904, "step": 217 }, { "epoch": 0.6964856230031949, "grad_norm": 1.212260052461721, "learning_rate": 3.791203053208017e-05, "loss": 0.5766, "step": 218 }, { "epoch": 0.6996805111821086, "grad_norm": 0.8484051659303491, "learning_rate": 3.7878828470060274e-05, "loss": 0.5772, "step": 219 }, { "epoch": 0.7028753993610224, "grad_norm": 1.1355674304167553, "learning_rate": 3.7845379278372775e-05, "loss": 0.5679, "step": 220 }, { "epoch": 0.7060702875399361, "grad_norm": 0.984582570050398, "learning_rate": 3.781168341936834e-05, "loss": 0.5432, "step": 221 }, { "epoch": 0.7092651757188498, "grad_norm": 0.9476742495794508, "learning_rate": 3.777774135880712e-05, "loss": 0.5682, "step": 222 }, { "epoch": 0.7124600638977636, "grad_norm": 1.139069082180595, "learning_rate": 3.774355356585243e-05, "loss": 0.5121, "step": 223 }, { "epoch": 0.7156549520766773, "grad_norm": 0.7997882642082141, "learning_rate": 3.7709120513064196e-05, "loss": 0.5196, "step": 224 }, { "epoch": 0.7188498402555911, "grad_norm": 0.912339161175054, "learning_rate": 3.7674442676392456e-05, "loss": 0.5309, "step": 225 }, { "epoch": 0.7220447284345048, "grad_norm": 0.8646463276372267, "learning_rate": 3.7639520535170736e-05, "loss": 0.5764, "step": 226 }, { "epoch": 0.7252396166134185, "grad_norm": 0.8485244000483088, "learning_rate": 3.760435457210948e-05, "loss": 0.5711, "step": 227 }, { "epoch": 0.7284345047923323, "grad_norm": 0.8913545645187945, "learning_rate": 3.7568945273289355e-05, "loss": 0.5355, "step": 228 }, { "epoch": 0.731629392971246, "grad_norm": 0.8760365656277963, "learning_rate": 3.753329312815453e-05, "loss": 0.5402, "step": 229 }, { "epoch": 0.7348242811501597, "grad_norm": 0.9291923706353212, "learning_rate": 3.749739862950589e-05, "loss": 0.5323, "step": 230 }, { "epoch": 0.7380191693290735, "grad_norm": 1.1191138286045244, "learning_rate": 3.7461262273494277e-05, "loss": 0.5401, "step": 231 }, { "epoch": 0.7412140575079872, "grad_norm": 0.8298783641280376, "learning_rate": 3.742488455961358e-05, "loss": 0.5489, "step": 232 }, { "epoch": 0.744408945686901, "grad_norm": 0.8194676142692792, "learning_rate": 3.738826599069385e-05, "loss": 0.5277, "step": 233 }, { "epoch": 0.7476038338658147, "grad_norm": 0.7674059358894463, "learning_rate": 3.7351407072894356e-05, "loss": 0.5169, "step": 234 }, { "epoch": 0.7507987220447284, "grad_norm": 0.8486151770014176, "learning_rate": 3.7314308315696604e-05, "loss": 0.535, "step": 235 }, { "epoch": 0.7539936102236422, "grad_norm": 0.7391651407617897, "learning_rate": 3.7276970231897225e-05, "loss": 0.504, "step": 236 }, { "epoch": 0.7571884984025559, "grad_norm": 0.902259232556215, "learning_rate": 3.723939333760099e-05, "loss": 0.5613, "step": 237 }, { "epoch": 0.7603833865814696, "grad_norm": 0.8625873135638807, "learning_rate": 3.720157815221358e-05, "loss": 0.5244, "step": 238 }, { "epoch": 0.7635782747603834, "grad_norm": 0.8936431918114204, "learning_rate": 3.716352519843448e-05, "loss": 0.5426, "step": 239 }, { "epoch": 0.7667731629392971, "grad_norm": 0.995320435639344, "learning_rate": 3.71252350022497e-05, "loss": 0.5104, "step": 240 }, { "epoch": 0.7699680511182109, "grad_norm": 0.8031264080307341, "learning_rate": 3.708670809292455e-05, "loss": 0.5246, "step": 241 }, { "epoch": 0.7731629392971247, "grad_norm": 1.0835434981781038, "learning_rate": 3.704794500299627e-05, "loss": 0.5003, "step": 242 }, { "epoch": 0.7763578274760383, "grad_norm": 0.9478013614793179, "learning_rate": 3.700894626826674e-05, "loss": 0.5116, "step": 243 }, { "epoch": 0.7795527156549521, "grad_norm": 1.0469893211794972, "learning_rate": 3.696971242779499e-05, "loss": 0.6261, "step": 244 }, { "epoch": 0.7827476038338658, "grad_norm": 0.9297231377773715, "learning_rate": 3.693024402388984e-05, "loss": 0.5502, "step": 245 }, { "epoch": 0.7859424920127795, "grad_norm": 0.7610910877457355, "learning_rate": 3.689054160210232e-05, "loss": 0.542, "step": 246 }, { "epoch": 0.7891373801916933, "grad_norm": 1.1342717917686111, "learning_rate": 3.6850605711218176e-05, "loss": 0.5844, "step": 247 }, { "epoch": 0.792332268370607, "grad_norm": 0.6995938546636964, "learning_rate": 3.681043690325029e-05, "loss": 0.5343, "step": 248 }, { "epoch": 0.7955271565495208, "grad_norm": 1.1509298073605394, "learning_rate": 3.6770035733431014e-05, "loss": 0.5209, "step": 249 }, { "epoch": 0.7987220447284346, "grad_norm": 0.8295131498958032, "learning_rate": 3.6729402760204535e-05, "loss": 0.5369, "step": 250 }, { "epoch": 0.8019169329073482, "grad_norm": 0.9218479825070323, "learning_rate": 3.668853854521913e-05, "loss": 0.4855, "step": 251 }, { "epoch": 0.805111821086262, "grad_norm": 0.7441334936621781, "learning_rate": 3.66474436533194e-05, "loss": 0.5268, "step": 252 }, { "epoch": 0.8083067092651757, "grad_norm": 0.7507501619024712, "learning_rate": 3.660611865253848e-05, "loss": 0.5105, "step": 253 }, { "epoch": 0.8115015974440895, "grad_norm": 0.7924592861345243, "learning_rate": 3.6564564114090175e-05, "loss": 0.4829, "step": 254 }, { "epoch": 0.8146964856230032, "grad_norm": 0.7720230321659396, "learning_rate": 3.652278061236109e-05, "loss": 0.5, "step": 255 }, { "epoch": 0.8178913738019169, "grad_norm": 0.7979403807748843, "learning_rate": 3.648076872490263e-05, "loss": 0.5296, "step": 256 }, { "epoch": 0.8210862619808307, "grad_norm": 0.8135723649226206, "learning_rate": 3.6438529032423086e-05, "loss": 0.507, "step": 257 }, { "epoch": 0.8242811501597445, "grad_norm": 0.7476070159945926, "learning_rate": 3.639606211877958e-05, "loss": 0.5006, "step": 258 }, { "epoch": 0.8274760383386581, "grad_norm": 0.8362960189666772, "learning_rate": 3.635336857096997e-05, "loss": 0.5254, "step": 259 }, { "epoch": 0.8306709265175719, "grad_norm": 0.9452074954015198, "learning_rate": 3.631044897912478e-05, "loss": 0.5499, "step": 260 }, { "epoch": 0.8338658146964856, "grad_norm": 0.6680220539129625, "learning_rate": 3.6267303936499006e-05, "loss": 0.5311, "step": 261 }, { "epoch": 0.8370607028753994, "grad_norm": 0.9178609027182498, "learning_rate": 3.622393403946395e-05, "loss": 0.5675, "step": 262 }, { "epoch": 0.8402555910543131, "grad_norm": 0.8455480168827179, "learning_rate": 3.6180339887498953e-05, "loss": 0.5582, "step": 263 }, { "epoch": 0.8434504792332268, "grad_norm": 0.7440034772147296, "learning_rate": 3.6136522083183096e-05, "loss": 0.5813, "step": 264 }, { "epoch": 0.8466453674121406, "grad_norm": 0.718472836263842, "learning_rate": 3.6092481232186905e-05, "loss": 0.5302, "step": 265 }, { "epoch": 0.8498402555910544, "grad_norm": 0.8057027197237966, "learning_rate": 3.604821794326398e-05, "loss": 0.4891, "step": 266 }, { "epoch": 0.853035143769968, "grad_norm": 0.8835357018051417, "learning_rate": 3.600373282824252e-05, "loss": 0.5171, "step": 267 }, { "epoch": 0.8562300319488818, "grad_norm": 0.7771156562797832, "learning_rate": 3.595902650201695e-05, "loss": 0.5085, "step": 268 }, { "epoch": 0.8594249201277955, "grad_norm": 0.8465271291843096, "learning_rate": 3.591409958253937e-05, "loss": 0.5324, "step": 269 }, { "epoch": 0.8626198083067093, "grad_norm": 0.7718211766797882, "learning_rate": 3.5868952690811015e-05, "loss": 0.5752, "step": 270 }, { "epoch": 0.865814696485623, "grad_norm": 0.9775795997595186, "learning_rate": 3.582358645087368e-05, "loss": 0.5599, "step": 271 }, { "epoch": 0.8690095846645367, "grad_norm": 0.8312123906165099, "learning_rate": 3.577800148980112e-05, "loss": 0.5223, "step": 272 }, { "epoch": 0.8722044728434505, "grad_norm": 0.935336068810225, "learning_rate": 3.573219843769033e-05, "loss": 0.5083, "step": 273 }, { "epoch": 0.8753993610223643, "grad_norm": 0.8720606095772891, "learning_rate": 3.568617792765287e-05, "loss": 0.5636, "step": 274 }, { "epoch": 0.8785942492012779, "grad_norm": 0.7824277269020549, "learning_rate": 3.563994059580611e-05, "loss": 0.5461, "step": 275 }, { "epoch": 0.8817891373801917, "grad_norm": 1.1055706045113576, "learning_rate": 3.559348708126445e-05, "loss": 0.5623, "step": 276 }, { "epoch": 0.8849840255591054, "grad_norm": 0.7710258784149644, "learning_rate": 3.5546818026130444e-05, "loss": 0.5279, "step": 277 }, { "epoch": 0.8881789137380192, "grad_norm": 0.8879078744940178, "learning_rate": 3.549993407548595e-05, "loss": 0.4966, "step": 278 }, { "epoch": 0.8913738019169329, "grad_norm": 1.0405338141997835, "learning_rate": 3.545283587738324e-05, "loss": 0.5365, "step": 279 }, { "epoch": 0.8945686900958466, "grad_norm": 0.7971271443117728, "learning_rate": 3.5405524082836e-05, "loss": 0.5672, "step": 280 }, { "epoch": 0.8977635782747604, "grad_norm": 0.931450449668035, "learning_rate": 3.5357999345810335e-05, "loss": 0.5668, "step": 281 }, { "epoch": 0.9009584664536742, "grad_norm": 0.7895639631341859, "learning_rate": 3.5310262323215774e-05, "loss": 0.4955, "step": 282 }, { "epoch": 0.9041533546325878, "grad_norm": 0.9607888759719633, "learning_rate": 3.5262313674896125e-05, "loss": 0.5147, "step": 283 }, { "epoch": 0.9073482428115016, "grad_norm": 0.9689194114193417, "learning_rate": 3.521415406362041e-05, "loss": 0.5062, "step": 284 }, { "epoch": 0.9105431309904153, "grad_norm": 0.9647692491089708, "learning_rate": 3.5165784155073665e-05, "loss": 0.5625, "step": 285 }, { "epoch": 0.9137380191693291, "grad_norm": 0.8493612979149395, "learning_rate": 3.511720461784778e-05, "loss": 0.5424, "step": 286 }, { "epoch": 0.9169329073482428, "grad_norm": 0.8442771355018583, "learning_rate": 3.50684161234322e-05, "loss": 0.5632, "step": 287 }, { "epoch": 0.9201277955271565, "grad_norm": 0.8552088751974364, "learning_rate": 3.50194193462047e-05, "loss": 0.5372, "step": 288 }, { "epoch": 0.9233226837060703, "grad_norm": 0.8152267999689093, "learning_rate": 3.497021496342203e-05, "loss": 0.4861, "step": 289 }, { "epoch": 0.9265175718849841, "grad_norm": 0.9398979894292626, "learning_rate": 3.4920803655210553e-05, "loss": 0.5801, "step": 290 }, { "epoch": 0.9297124600638977, "grad_norm": 0.977293682261694, "learning_rate": 3.4871186104556874e-05, "loss": 0.5207, "step": 291 }, { "epoch": 0.9329073482428115, "grad_norm": 0.8549875703308473, "learning_rate": 3.482136299729836e-05, "loss": 0.5349, "step": 292 }, { "epoch": 0.9361022364217252, "grad_norm": 0.8819868669919189, "learning_rate": 3.4771335022113705e-05, "loss": 0.5597, "step": 293 }, { "epoch": 0.939297124600639, "grad_norm": 0.9666325301004642, "learning_rate": 3.4721102870513345e-05, "loss": 0.5329, "step": 294 }, { "epoch": 0.9424920127795527, "grad_norm": 0.8203876358079187, "learning_rate": 3.467066723682998e-05, "loss": 0.5246, "step": 295 }, { "epoch": 0.9456869009584664, "grad_norm": 1.1228342280176937, "learning_rate": 3.462002881820891e-05, "loss": 0.6097, "step": 296 }, { "epoch": 0.9488817891373802, "grad_norm": 0.7732572275271596, "learning_rate": 3.456918831459844e-05, "loss": 0.5253, "step": 297 }, { "epoch": 0.952076677316294, "grad_norm": 0.9752827068051154, "learning_rate": 3.451814642874017e-05, "loss": 0.5539, "step": 298 }, { "epoch": 0.9552715654952076, "grad_norm": 0.7900396864367236, "learning_rate": 3.4466903866159326e-05, "loss": 0.5457, "step": 299 }, { "epoch": 0.9584664536741214, "grad_norm": 0.8596711318037582, "learning_rate": 3.441546133515496e-05, "loss": 0.5266, "step": 300 }, { "epoch": 0.9616613418530351, "grad_norm": 0.9071723802112778, "learning_rate": 3.4363819546790216e-05, "loss": 0.5189, "step": 301 }, { "epoch": 0.9648562300319489, "grad_norm": 0.6897307625929432, "learning_rate": 3.431197921488242e-05, "loss": 0.5258, "step": 302 }, { "epoch": 0.9680511182108626, "grad_norm": 0.8072267951218067, "learning_rate": 3.425994105599331e-05, "loss": 0.5025, "step": 303 }, { "epoch": 0.9712460063897763, "grad_norm": 0.7299045123280457, "learning_rate": 3.4207705789419035e-05, "loss": 0.4942, "step": 304 }, { "epoch": 0.9744408945686901, "grad_norm": 0.811210391135453, "learning_rate": 3.41552741371803e-05, "loss": 0.5128, "step": 305 }, { "epoch": 0.9776357827476039, "grad_norm": 0.6833163220999185, "learning_rate": 3.4102646824012333e-05, "loss": 0.5036, "step": 306 }, { "epoch": 0.9808306709265175, "grad_norm": 0.7318928742301355, "learning_rate": 3.404982457735487e-05, "loss": 0.5248, "step": 307 }, { "epoch": 0.9840255591054313, "grad_norm": 0.8151408628855044, "learning_rate": 3.399680812734213e-05, "loss": 0.5244, "step": 308 }, { "epoch": 0.987220447284345, "grad_norm": 0.7365970167922717, "learning_rate": 3.3943598206792665e-05, "loss": 0.5334, "step": 309 }, { "epoch": 0.9904153354632588, "grad_norm": 0.6444531685595024, "learning_rate": 3.3890195551199334e-05, "loss": 0.506, "step": 310 }, { "epoch": 0.9936102236421726, "grad_norm": 0.7379917525512831, "learning_rate": 3.3836600898719e-05, "loss": 0.4884, "step": 311 }, { "epoch": 0.9968051118210862, "grad_norm": 0.8006414034782756, "learning_rate": 3.3782814990162457e-05, "loss": 0.6063, "step": 312 }, { "epoch": 1.0, "grad_norm": 0.8714839087095215, "learning_rate": 3.372883856898408e-05, "loss": 0.5957, "step": 313 }, { "epoch": 1.0031948881789137, "grad_norm": 0.8608076101674021, "learning_rate": 3.367467238127165e-05, "loss": 0.4153, "step": 314 }, { "epoch": 1.0063897763578276, "grad_norm": 0.7863968465089736, "learning_rate": 3.3620317175735945e-05, "loss": 0.4178, "step": 315 }, { "epoch": 1.0095846645367412, "grad_norm": 0.6819742061303111, "learning_rate": 3.3565773703700474e-05, "loss": 0.3475, "step": 316 }, { "epoch": 1.012779552715655, "grad_norm": 0.7870011705239355, "learning_rate": 3.351104271909104e-05, "loss": 0.3629, "step": 317 }, { "epoch": 1.0159744408945688, "grad_norm": 0.8316632267191167, "learning_rate": 3.345612497842532e-05, "loss": 0.3761, "step": 318 }, { "epoch": 1.0191693290734825, "grad_norm": 0.7862604900294438, "learning_rate": 3.3401021240802446e-05, "loss": 0.3627, "step": 319 }, { "epoch": 1.0223642172523961, "grad_norm": 1.0061847359485523, "learning_rate": 3.334573226789249e-05, "loss": 0.4051, "step": 320 }, { "epoch": 1.0255591054313098, "grad_norm": 0.8807904375499824, "learning_rate": 3.32902588239259e-05, "loss": 0.3968, "step": 321 }, { "epoch": 1.0287539936102237, "grad_norm": 1.1981019094258039, "learning_rate": 3.3234601675683005e-05, "loss": 0.4202, "step": 322 }, { "epoch": 1.0319488817891374, "grad_norm": 0.844697383840708, "learning_rate": 3.317876159248337e-05, "loss": 0.3743, "step": 323 }, { "epoch": 1.035143769968051, "grad_norm": 0.9126433986445898, "learning_rate": 3.3122739346175176e-05, "loss": 0.3855, "step": 324 }, { "epoch": 1.038338658146965, "grad_norm": 0.6598463168728887, "learning_rate": 3.306653571112454e-05, "loss": 0.3476, "step": 325 }, { "epoch": 1.0415335463258786, "grad_norm": 0.8158289993131012, "learning_rate": 3.301015146420484e-05, "loss": 0.3718, "step": 326 }, { "epoch": 1.0447284345047922, "grad_norm": 0.9397053531896098, "learning_rate": 3.295358738478593e-05, "loss": 0.4497, "step": 327 }, { "epoch": 1.0479233226837061, "grad_norm": 0.6428708472951121, "learning_rate": 3.2896844254723414e-05, "loss": 0.3422, "step": 328 }, { "epoch": 1.0511182108626198, "grad_norm": 0.9006480199213088, "learning_rate": 3.283992285834782e-05, "loss": 0.3803, "step": 329 }, { "epoch": 1.0543130990415335, "grad_norm": 0.804430943445223, "learning_rate": 3.2782823982453746e-05, "loss": 0.3999, "step": 330 }, { "epoch": 1.0575079872204474, "grad_norm": 0.9270904209069851, "learning_rate": 3.272554841628901e-05, "loss": 0.4319, "step": 331 }, { "epoch": 1.060702875399361, "grad_norm": 0.845825639588173, "learning_rate": 3.266809695154371e-05, "loss": 0.3746, "step": 332 }, { "epoch": 1.0638977635782747, "grad_norm": 0.8494398683413655, "learning_rate": 3.261047038233931e-05, "loss": 0.3969, "step": 333 }, { "epoch": 1.0670926517571886, "grad_norm": 0.7126024738944513, "learning_rate": 3.2552669505217646e-05, "loss": 0.3474, "step": 334 }, { "epoch": 1.0702875399361023, "grad_norm": 0.9359004672009658, "learning_rate": 3.2494695119129924e-05, "loss": 0.3707, "step": 335 }, { "epoch": 1.073482428115016, "grad_norm": 0.6005459131808137, "learning_rate": 3.243654802542568e-05, "loss": 0.3063, "step": 336 }, { "epoch": 1.0766773162939298, "grad_norm": 0.8842790772014463, "learning_rate": 3.2378229027841675e-05, "loss": 0.3765, "step": 337 }, { "epoch": 1.0798722044728435, "grad_norm": 0.9070675817695492, "learning_rate": 3.231973893249083e-05, "loss": 0.3779, "step": 338 }, { "epoch": 1.0830670926517572, "grad_norm": 0.670447746767721, "learning_rate": 3.226107854785106e-05, "loss": 0.4082, "step": 339 }, { "epoch": 1.0862619808306708, "grad_norm": 0.9213283733651502, "learning_rate": 3.220224868475408e-05, "loss": 0.4237, "step": 340 }, { "epoch": 1.0894568690095847, "grad_norm": 0.6839594328860654, "learning_rate": 3.2143250156374226e-05, "loss": 0.4307, "step": 341 }, { "epoch": 1.0926517571884984, "grad_norm": 0.7489428445729561, "learning_rate": 3.208408377821722e-05, "loss": 0.3652, "step": 342 }, { "epoch": 1.095846645367412, "grad_norm": 0.7550186524407567, "learning_rate": 3.202475036810886e-05, "loss": 0.406, "step": 343 }, { "epoch": 1.099041533546326, "grad_norm": 0.6395054008269865, "learning_rate": 3.1965250746183755e-05, "loss": 0.3711, "step": 344 }, { "epoch": 1.1022364217252396, "grad_norm": 0.657977446052051, "learning_rate": 3.190558573487397e-05, "loss": 0.3542, "step": 345 }, { "epoch": 1.1054313099041533, "grad_norm": 0.8206988613968245, "learning_rate": 3.1845756158897654e-05, "loss": 0.3985, "step": 346 }, { "epoch": 1.1086261980830672, "grad_norm": 0.7479778299255093, "learning_rate": 3.178576284524765e-05, "loss": 0.3371, "step": 347 }, { "epoch": 1.1118210862619808, "grad_norm": 0.753877520359999, "learning_rate": 3.1725606623180086e-05, "loss": 0.3699, "step": 348 }, { "epoch": 1.1150159744408945, "grad_norm": 0.8504076245127729, "learning_rate": 3.166528832420283e-05, "loss": 0.3912, "step": 349 }, { "epoch": 1.1182108626198084, "grad_norm": 0.6964611574953985, "learning_rate": 3.160480878206412e-05, "loss": 0.3386, "step": 350 }, { "epoch": 1.121405750798722, "grad_norm": 0.6824499798649496, "learning_rate": 3.154416883274092e-05, "loss": 0.3709, "step": 351 }, { "epoch": 1.1246006389776357, "grad_norm": 1.0185089841293544, "learning_rate": 3.148336931442745e-05, "loss": 0.3634, "step": 352 }, { "epoch": 1.1277955271565494, "grad_norm": 0.7083393549747615, "learning_rate": 3.142241106752356e-05, "loss": 0.3941, "step": 353 }, { "epoch": 1.1309904153354633, "grad_norm": 0.8028674737087654, "learning_rate": 3.136129493462312e-05, "loss": 0.3424, "step": 354 }, { "epoch": 1.134185303514377, "grad_norm": 0.9169589595364791, "learning_rate": 3.130002176050238e-05, "loss": 0.37, "step": 355 }, { "epoch": 1.1373801916932909, "grad_norm": 0.6610294457775537, "learning_rate": 3.123859239210827e-05, "loss": 0.3673, "step": 356 }, { "epoch": 1.1405750798722045, "grad_norm": 0.7218101821218883, "learning_rate": 3.1177007678546746e-05, "loss": 0.4232, "step": 357 }, { "epoch": 1.1437699680511182, "grad_norm": 0.9486057955199187, "learning_rate": 3.111526847107099e-05, "loss": 0.3852, "step": 358 }, { "epoch": 1.1469648562300319, "grad_norm": 0.6192218269870875, "learning_rate": 3.105337562306968e-05, "loss": 0.3301, "step": 359 }, { "epoch": 1.1501597444089458, "grad_norm": 0.6912746341979487, "learning_rate": 3.099132999005519e-05, "loss": 0.3625, "step": 360 }, { "epoch": 1.1533546325878594, "grad_norm": 0.8493378937993168, "learning_rate": 3.092913242965175e-05, "loss": 0.3951, "step": 361 }, { "epoch": 1.156549520766773, "grad_norm": 0.833993863731902, "learning_rate": 3.086678380158364e-05, "loss": 0.3902, "step": 362 }, { "epoch": 1.159744408945687, "grad_norm": 0.7398039881016663, "learning_rate": 3.0804284967663214e-05, "loss": 0.3924, "step": 363 }, { "epoch": 1.1629392971246006, "grad_norm": 0.7703232633649562, "learning_rate": 3.074163679177907e-05, "loss": 0.3761, "step": 364 }, { "epoch": 1.1661341853035143, "grad_norm": 0.89436430995006, "learning_rate": 3.06788401398841e-05, "loss": 0.3701, "step": 365 }, { "epoch": 1.1693290734824282, "grad_norm": 0.7039513259201169, "learning_rate": 3.061589587998346e-05, "loss": 0.3622, "step": 366 }, { "epoch": 1.1725239616613419, "grad_norm": 0.7355381764642869, "learning_rate": 3.055280488212266e-05, "loss": 0.3969, "step": 367 }, { "epoch": 1.1757188498402555, "grad_norm": 0.8100722031529548, "learning_rate": 3.0489568018375447e-05, "loss": 0.3718, "step": 368 }, { "epoch": 1.1789137380191694, "grad_norm": 0.7026117502103958, "learning_rate": 3.042618616283184e-05, "loss": 0.3405, "step": 369 }, { "epoch": 1.182108626198083, "grad_norm": 0.7207118281643955, "learning_rate": 3.036266019158596e-05, "loss": 0.3889, "step": 370 }, { "epoch": 1.1853035143769968, "grad_norm": 0.79060661451023, "learning_rate": 3.0298990982724e-05, "loss": 0.3994, "step": 371 }, { "epoch": 1.1884984025559104, "grad_norm": 0.8441043300302222, "learning_rate": 3.0235179416312025e-05, "loss": 0.3508, "step": 372 }, { "epoch": 1.1916932907348243, "grad_norm": 0.7349501648718484, "learning_rate": 3.017122637438385e-05, "loss": 0.3847, "step": 373 }, { "epoch": 1.194888178913738, "grad_norm": 0.8725781608132315, "learning_rate": 3.0107132740928832e-05, "loss": 0.3926, "step": 374 }, { "epoch": 1.1980830670926517, "grad_norm": 0.8327406117293235, "learning_rate": 3.004289940187964e-05, "loss": 0.3802, "step": 375 }, { "epoch": 1.2012779552715656, "grad_norm": 0.7199915365896178, "learning_rate": 2.9978527245100034e-05, "loss": 0.354, "step": 376 }, { "epoch": 1.2044728434504792, "grad_norm": 0.7419030308089841, "learning_rate": 2.991401716037255e-05, "loss": 0.3884, "step": 377 }, { "epoch": 1.207667731629393, "grad_norm": 0.8062260851883908, "learning_rate": 2.9849370039386284e-05, "loss": 0.366, "step": 378 }, { "epoch": 1.2108626198083068, "grad_norm": 0.7591181232522807, "learning_rate": 2.9784586775724443e-05, "loss": 0.3579, "step": 379 }, { "epoch": 1.2140575079872205, "grad_norm": 0.8434345898198293, "learning_rate": 2.971966826485212e-05, "loss": 0.4524, "step": 380 }, { "epoch": 1.2172523961661341, "grad_norm": 0.8495573303145608, "learning_rate": 2.9654615404103837e-05, "loss": 0.434, "step": 381 }, { "epoch": 1.220447284345048, "grad_norm": 0.6933721175439074, "learning_rate": 2.9589429092671155e-05, "loss": 0.4343, "step": 382 }, { "epoch": 1.2236421725239617, "grad_norm": 0.7274608975530785, "learning_rate": 2.952411023159027e-05, "loss": 0.3298, "step": 383 }, { "epoch": 1.2268370607028753, "grad_norm": 0.8168975435659065, "learning_rate": 2.945865972372954e-05, "loss": 0.4002, "step": 384 }, { "epoch": 1.230031948881789, "grad_norm": 0.8873260212581557, "learning_rate": 2.939307847377699e-05, "loss": 0.4397, "step": 385 }, { "epoch": 1.233226837060703, "grad_norm": 0.6916592999050895, "learning_rate": 2.9327367388227847e-05, "loss": 0.391, "step": 386 }, { "epoch": 1.2364217252396166, "grad_norm": 0.7553082417173358, "learning_rate": 2.926152737537198e-05, "loss": 0.3466, "step": 387 }, { "epoch": 1.2396166134185305, "grad_norm": 0.8650485170070908, "learning_rate": 2.9195559345281336e-05, "loss": 0.4146, "step": 388 }, { "epoch": 1.2428115015974441, "grad_norm": 0.7330543803583387, "learning_rate": 2.9129464209797404e-05, "loss": 0.3898, "step": 389 }, { "epoch": 1.2460063897763578, "grad_norm": 0.7363439527629848, "learning_rate": 2.906324288251857e-05, "loss": 0.4112, "step": 390 }, { "epoch": 1.2492012779552715, "grad_norm": 0.8327177625635779, "learning_rate": 2.8996896278787504e-05, "loss": 0.3905, "step": 391 }, { "epoch": 1.2523961661341854, "grad_norm": 0.6537335351551286, "learning_rate": 2.893042531567851e-05, "loss": 0.4207, "step": 392 }, { "epoch": 1.255591054313099, "grad_norm": 0.7146002049281741, "learning_rate": 2.886383091198483e-05, "loss": 0.3441, "step": 393 }, { "epoch": 1.2587859424920127, "grad_norm": 0.8078428165110145, "learning_rate": 2.8797113988205992e-05, "loss": 0.4221, "step": 394 }, { "epoch": 1.2619808306709266, "grad_norm": 0.5972404381082699, "learning_rate": 2.8730275466535027e-05, "loss": 0.3144, "step": 395 }, { "epoch": 1.2651757188498403, "grad_norm": 0.7706819426110447, "learning_rate": 2.866331627084576e-05, "loss": 0.3822, "step": 396 }, { "epoch": 1.268370607028754, "grad_norm": 0.6908783947369732, "learning_rate": 2.8596237326680035e-05, "loss": 0.3535, "step": 397 }, { "epoch": 1.2715654952076676, "grad_norm": 0.712295172157205, "learning_rate": 2.8529039561234904e-05, "loss": 0.3748, "step": 398 }, { "epoch": 1.2747603833865815, "grad_norm": 0.842337102034422, "learning_rate": 2.846172390334983e-05, "loss": 0.3949, "step": 399 }, { "epoch": 1.2779552715654952, "grad_norm": 0.7287013629663512, "learning_rate": 2.8394291283493846e-05, "loss": 0.419, "step": 400 }, { "epoch": 1.281150159744409, "grad_norm": 0.7190834632935403, "learning_rate": 2.8326742633752693e-05, "loss": 0.3852, "step": 401 }, { "epoch": 1.2843450479233227, "grad_norm": 0.76075306510952, "learning_rate": 2.82590788878159e-05, "loss": 0.4172, "step": 402 }, { "epoch": 1.2875399361022364, "grad_norm": 0.7039618344228508, "learning_rate": 2.8191300980963956e-05, "loss": 0.4121, "step": 403 }, { "epoch": 1.29073482428115, "grad_norm": 0.6460440389338991, "learning_rate": 2.8123409850055307e-05, "loss": 0.3896, "step": 404 }, { "epoch": 1.293929712460064, "grad_norm": 0.6974323179340803, "learning_rate": 2.8055406433513437e-05, "loss": 0.3549, "step": 405 }, { "epoch": 1.2971246006389776, "grad_norm": 0.8173306197186939, "learning_rate": 2.798729167131391e-05, "loss": 0.4078, "step": 406 }, { "epoch": 1.3003194888178915, "grad_norm": 0.7127151827524344, "learning_rate": 2.7919066504971355e-05, "loss": 0.3622, "step": 407 }, { "epoch": 1.3035143769968052, "grad_norm": 0.732663954477486, "learning_rate": 2.7850731877526454e-05, "loss": 0.3845, "step": 408 }, { "epoch": 1.3067092651757188, "grad_norm": 0.6128280361598326, "learning_rate": 2.7782288733532915e-05, "loss": 0.3877, "step": 409 }, { "epoch": 1.3099041533546325, "grad_norm": 0.6950127994352941, "learning_rate": 2.7713738019044424e-05, "loss": 0.3538, "step": 410 }, { "epoch": 1.3130990415335464, "grad_norm": 0.686187825577798, "learning_rate": 2.764508068160154e-05, "loss": 0.387, "step": 411 }, { "epoch": 1.31629392971246, "grad_norm": 0.6478458382514949, "learning_rate": 2.7576317670218626e-05, "loss": 0.3751, "step": 412 }, { "epoch": 1.3194888178913737, "grad_norm": 0.6815550771368388, "learning_rate": 2.7507449935370726e-05, "loss": 0.3475, "step": 413 }, { "epoch": 1.3226837060702876, "grad_norm": 0.6741037656638967, "learning_rate": 2.7438478428980407e-05, "loss": 0.396, "step": 414 }, { "epoch": 1.3258785942492013, "grad_norm": 0.7167174755449621, "learning_rate": 2.736940410440462e-05, "loss": 0.3964, "step": 415 }, { "epoch": 1.329073482428115, "grad_norm": 0.6842795899926827, "learning_rate": 2.7300227916421528e-05, "loss": 0.3973, "step": 416 }, { "epoch": 1.3322683706070286, "grad_norm": 0.702058426076705, "learning_rate": 2.7230950821217294e-05, "loss": 0.387, "step": 417 }, { "epoch": 1.3354632587859425, "grad_norm": 0.6368932760300601, "learning_rate": 2.7161573776372856e-05, "loss": 0.3964, "step": 418 }, { "epoch": 1.3386581469648562, "grad_norm": 0.6782570721877241, "learning_rate": 2.7092097740850712e-05, "loss": 0.4042, "step": 419 }, { "epoch": 1.34185303514377, "grad_norm": 0.5931978671155632, "learning_rate": 2.7022523674981674e-05, "loss": 0.395, "step": 420 }, { "epoch": 1.3450479233226837, "grad_norm": 0.7201174578306515, "learning_rate": 2.6952852540451536e-05, "loss": 0.3921, "step": 421 }, { "epoch": 1.3482428115015974, "grad_norm": 0.6946900695747145, "learning_rate": 2.688308530028786e-05, "loss": 0.3652, "step": 422 }, { "epoch": 1.351437699680511, "grad_norm": 0.698534785763629, "learning_rate": 2.6813222918846613e-05, "loss": 0.3741, "step": 423 }, { "epoch": 1.354632587859425, "grad_norm": 0.7564830310668963, "learning_rate": 2.6743266361798833e-05, "loss": 0.4067, "step": 424 }, { "epoch": 1.3578274760383386, "grad_norm": 0.8625137509946531, "learning_rate": 2.6673216596117324e-05, "loss": 0.3512, "step": 425 }, { "epoch": 1.3610223642172525, "grad_norm": 0.6004241242239579, "learning_rate": 2.660307459006325e-05, "loss": 0.3885, "step": 426 }, { "epoch": 1.3642172523961662, "grad_norm": 0.8493675462357329, "learning_rate": 2.653284131317276e-05, "loss": 0.359, "step": 427 }, { "epoch": 1.3674121405750799, "grad_norm": 0.7054484310915905, "learning_rate": 2.6462517736243612e-05, "loss": 0.3776, "step": 428 }, { "epoch": 1.3706070287539935, "grad_norm": 0.7635212688991575, "learning_rate": 2.639210483132171e-05, "loss": 0.3678, "step": 429 }, { "epoch": 1.3738019169329074, "grad_norm": 0.7551414066116379, "learning_rate": 2.6321603571687714e-05, "loss": 0.3538, "step": 430 }, { "epoch": 1.376996805111821, "grad_norm": 0.5933801420526407, "learning_rate": 2.625101493184355e-05, "loss": 0.3608, "step": 431 }, { "epoch": 1.3801916932907348, "grad_norm": 0.7485716718685856, "learning_rate": 2.618033988749895e-05, "loss": 0.3392, "step": 432 }, { "epoch": 1.3833865814696487, "grad_norm": 0.6856872085204931, "learning_rate": 2.6109579415557997e-05, "loss": 0.3696, "step": 433 }, { "epoch": 1.3865814696485623, "grad_norm": 0.6906273049790326, "learning_rate": 2.6038734494105562e-05, "loss": 0.3949, "step": 434 }, { "epoch": 1.389776357827476, "grad_norm": 0.7317141874141749, "learning_rate": 2.5967806102393844e-05, "loss": 0.3961, "step": 435 }, { "epoch": 1.3929712460063897, "grad_norm": 0.7084024455483174, "learning_rate": 2.5896795220828786e-05, "loss": 0.3729, "step": 436 }, { "epoch": 1.3961661341853036, "grad_norm": 0.6735234384239845, "learning_rate": 2.582570283095656e-05, "loss": 0.3755, "step": 437 }, { "epoch": 1.3993610223642172, "grad_norm": 0.639545495865673, "learning_rate": 2.575452991544998e-05, "loss": 0.3461, "step": 438 }, { "epoch": 1.4025559105431311, "grad_norm": 0.8314980113052899, "learning_rate": 2.5683277458094926e-05, "loss": 0.3766, "step": 439 }, { "epoch": 1.4057507987220448, "grad_norm": 0.5774548863952043, "learning_rate": 2.5611946443776733e-05, "loss": 0.3822, "step": 440 }, { "epoch": 1.4089456869009584, "grad_norm": 0.5808224894717059, "learning_rate": 2.5540537858466596e-05, "loss": 0.3936, "step": 441 }, { "epoch": 1.4121405750798721, "grad_norm": 0.6123085104187193, "learning_rate": 2.546905268920794e-05, "loss": 0.344, "step": 442 }, { "epoch": 1.415335463258786, "grad_norm": 0.5345986142478568, "learning_rate": 2.5397491924102758e-05, "loss": 0.3681, "step": 443 }, { "epoch": 1.4185303514376997, "grad_norm": 0.7318254435855026, "learning_rate": 2.532585655229797e-05, "loss": 0.3677, "step": 444 }, { "epoch": 1.4217252396166133, "grad_norm": 0.6012026977631383, "learning_rate": 2.525414756397174e-05, "loss": 0.334, "step": 445 }, { "epoch": 1.4249201277955272, "grad_norm": 0.7187336200880879, "learning_rate": 2.51823659503198e-05, "loss": 0.4127, "step": 446 }, { "epoch": 1.428115015974441, "grad_norm": 0.657446388301636, "learning_rate": 2.5110512703541747e-05, "loss": 0.367, "step": 447 }, { "epoch": 1.4313099041533546, "grad_norm": 0.6399302817983334, "learning_rate": 2.503858881682731e-05, "loss": 0.4096, "step": 448 }, { "epoch": 1.4345047923322682, "grad_norm": 0.6063474923650863, "learning_rate": 2.4966595284342664e-05, "loss": 0.3701, "step": 449 }, { "epoch": 1.4376996805111821, "grad_norm": 0.6983262454628201, "learning_rate": 2.489453310121663e-05, "loss": 0.3796, "step": 450 }, { "epoch": 1.4408945686900958, "grad_norm": 0.7119999272751214, "learning_rate": 2.4822403263526966e-05, "loss": 0.3553, "step": 451 }, { "epoch": 1.4440894568690097, "grad_norm": 0.67185797833669, "learning_rate": 2.4750206768286593e-05, "loss": 0.3517, "step": 452 }, { "epoch": 1.4472843450479234, "grad_norm": 0.6486337996555229, "learning_rate": 2.4677944613429778e-05, "loss": 0.3287, "step": 453 }, { "epoch": 1.450479233226837, "grad_norm": 0.6516969882558993, "learning_rate": 2.46056177977984e-05, "loss": 0.3514, "step": 454 }, { "epoch": 1.4536741214057507, "grad_norm": 0.7388191757920071, "learning_rate": 2.4533227321128084e-05, "loss": 0.4362, "step": 455 }, { "epoch": 1.4568690095846646, "grad_norm": 0.6119479848006957, "learning_rate": 2.4460774184034408e-05, "loss": 0.3825, "step": 456 }, { "epoch": 1.4600638977635783, "grad_norm": 0.7425050306965432, "learning_rate": 2.4388259387999097e-05, "loss": 0.3759, "step": 457 }, { "epoch": 1.4632587859424921, "grad_norm": 0.6502397685694954, "learning_rate": 2.4315683935356127e-05, "loss": 0.3829, "step": 458 }, { "epoch": 1.4664536741214058, "grad_norm": 0.6725716076372529, "learning_rate": 2.4243048829277916e-05, "loss": 0.3861, "step": 459 }, { "epoch": 1.4696485623003195, "grad_norm": 0.6219046409113833, "learning_rate": 2.4170355073761433e-05, "loss": 0.3736, "step": 460 }, { "epoch": 1.4728434504792332, "grad_norm": 0.5835078991741417, "learning_rate": 2.4097603673614325e-05, "loss": 0.3531, "step": 461 }, { "epoch": 1.476038338658147, "grad_norm": 0.6550851854107704, "learning_rate": 2.4024795634441044e-05, "loss": 0.3262, "step": 462 }, { "epoch": 1.4792332268370607, "grad_norm": 0.7675785572629827, "learning_rate": 2.3951931962628918e-05, "loss": 0.392, "step": 463 }, { "epoch": 1.4824281150159744, "grad_norm": 0.6411841347495657, "learning_rate": 2.3879013665334258e-05, "loss": 0.4024, "step": 464 }, { "epoch": 1.4856230031948883, "grad_norm": 0.6007938608835436, "learning_rate": 2.380604175046844e-05, "loss": 0.3661, "step": 465 }, { "epoch": 1.488817891373802, "grad_norm": 0.6525253798215147, "learning_rate": 2.373301722668398e-05, "loss": 0.3746, "step": 466 }, { "epoch": 1.4920127795527156, "grad_norm": 0.6446114375585952, "learning_rate": 2.365994110336054e-05, "loss": 0.3889, "step": 467 }, { "epoch": 1.4952076677316293, "grad_norm": 0.6653896661247665, "learning_rate": 2.358681439059106e-05, "loss": 0.3594, "step": 468 }, { "epoch": 1.4984025559105432, "grad_norm": 0.7011886854730449, "learning_rate": 2.3513638099167723e-05, "loss": 0.3889, "step": 469 }, { "epoch": 1.5015974440894568, "grad_norm": 0.7538883021599542, "learning_rate": 2.3440413240568022e-05, "loss": 0.3642, "step": 470 }, { "epoch": 1.5047923322683707, "grad_norm": 0.6733515586731865, "learning_rate": 2.3367140826940768e-05, "loss": 0.3482, "step": 471 }, { "epoch": 1.5079872204472844, "grad_norm": 0.600416095532099, "learning_rate": 2.329382187109211e-05, "loss": 0.3399, "step": 472 }, { "epoch": 1.511182108626198, "grad_norm": 0.687374773018976, "learning_rate": 2.3220457386471496e-05, "loss": 0.3754, "step": 473 }, { "epoch": 1.5143769968051117, "grad_norm": 0.748016995785705, "learning_rate": 2.3147048387157725e-05, "loss": 0.3648, "step": 474 }, { "epoch": 1.5175718849840254, "grad_norm": 0.7005861674092242, "learning_rate": 2.3073595887844884e-05, "loss": 0.305, "step": 475 }, { "epoch": 1.5207667731629393, "grad_norm": 0.608521420596584, "learning_rate": 2.3000100903828343e-05, "loss": 0.3601, "step": 476 }, { "epoch": 1.5239616613418532, "grad_norm": 0.6464475093135752, "learning_rate": 2.2926564450990716e-05, "loss": 0.3746, "step": 477 }, { "epoch": 1.5271565495207668, "grad_norm": 0.6448444881128504, "learning_rate": 2.2852987545787815e-05, "loss": 0.3714, "step": 478 }, { "epoch": 1.5303514376996805, "grad_norm": 0.6107451967926957, "learning_rate": 2.2779371205234604e-05, "loss": 0.3796, "step": 479 }, { "epoch": 1.5335463258785942, "grad_norm": 0.660686053273227, "learning_rate": 2.2705716446891143e-05, "loss": 0.3822, "step": 480 }, { "epoch": 1.5367412140575079, "grad_norm": 0.7750842489034678, "learning_rate": 2.263202428884853e-05, "loss": 0.4105, "step": 481 }, { "epoch": 1.5399361022364217, "grad_norm": 0.6055959124966067, "learning_rate": 2.2558295749714794e-05, "loss": 0.4151, "step": 482 }, { "epoch": 1.5431309904153354, "grad_norm": 0.6183198946717873, "learning_rate": 2.2484531848600866e-05, "loss": 0.3262, "step": 483 }, { "epoch": 1.5463258785942493, "grad_norm": 0.6370711793501769, "learning_rate": 2.2410733605106462e-05, "loss": 0.3857, "step": 484 }, { "epoch": 1.549520766773163, "grad_norm": 0.8015901036246532, "learning_rate": 2.233690203930599e-05, "loss": 0.3496, "step": 485 }, { "epoch": 1.5527156549520766, "grad_norm": 0.7615585707403518, "learning_rate": 2.2263038171734447e-05, "loss": 0.3672, "step": 486 }, { "epoch": 1.5559105431309903, "grad_norm": 0.5663892496577331, "learning_rate": 2.2189143023373337e-05, "loss": 0.3761, "step": 487 }, { "epoch": 1.5591054313099042, "grad_norm": 0.7878946252561335, "learning_rate": 2.2115217615636534e-05, "loss": 0.3588, "step": 488 }, { "epoch": 1.5623003194888179, "grad_norm": 0.7764697921864392, "learning_rate": 2.204126297035617e-05, "loss": 0.4196, "step": 489 }, { "epoch": 1.5654952076677318, "grad_norm": 0.6542418496260201, "learning_rate": 2.1967280109768505e-05, "loss": 0.3408, "step": 490 }, { "epoch": 1.5686900958466454, "grad_norm": 0.6223599528533879, "learning_rate": 2.1893270056499832e-05, "loss": 0.3777, "step": 491 }, { "epoch": 1.571884984025559, "grad_norm": 0.661700414509121, "learning_rate": 2.1819233833552275e-05, "loss": 0.3128, "step": 492 }, { "epoch": 1.5750798722044728, "grad_norm": 0.6234974404457322, "learning_rate": 2.1745172464289722e-05, "loss": 0.3962, "step": 493 }, { "epoch": 1.5782747603833864, "grad_norm": 0.6676512384075517, "learning_rate": 2.167108697242363e-05, "loss": 0.3468, "step": 494 }, { "epoch": 1.5814696485623003, "grad_norm": 0.6209137519763187, "learning_rate": 2.1596978381998883e-05, "loss": 0.344, "step": 495 }, { "epoch": 1.5846645367412142, "grad_norm": 0.6215059716975698, "learning_rate": 2.152284771737966e-05, "loss": 0.3742, "step": 496 }, { "epoch": 1.5878594249201279, "grad_norm": 0.6023037671559589, "learning_rate": 2.1448696003235252e-05, "loss": 0.3752, "step": 497 }, { "epoch": 1.5910543130990416, "grad_norm": 0.6369188120702737, "learning_rate": 2.1374524264525905e-05, "loss": 0.3796, "step": 498 }, { "epoch": 1.5942492012779552, "grad_norm": 0.5900720442619971, "learning_rate": 2.130033352648866e-05, "loss": 0.3535, "step": 499 }, { "epoch": 1.5974440894568689, "grad_norm": 0.6255668983966362, "learning_rate": 2.122612481462316e-05, "loss": 0.4114, "step": 500 }, { "epoch": 1.6006389776357828, "grad_norm": 0.6798385259233033, "learning_rate": 2.115189915467752e-05, "loss": 0.389, "step": 501 }, { "epoch": 1.6038338658146964, "grad_norm": 0.6905443213231501, "learning_rate": 2.1077657572634092e-05, "loss": 0.3246, "step": 502 }, { "epoch": 1.6070287539936103, "grad_norm": 0.571423682418217, "learning_rate": 2.1003401094695325e-05, "loss": 0.3344, "step": 503 }, { "epoch": 1.610223642172524, "grad_norm": 0.6504514480344465, "learning_rate": 2.0929130747269567e-05, "loss": 0.3621, "step": 504 }, { "epoch": 1.6134185303514377, "grad_norm": 0.6411322199210792, "learning_rate": 2.0854847556956856e-05, "loss": 0.3734, "step": 505 }, { "epoch": 1.6166134185303513, "grad_norm": 0.6380721378438481, "learning_rate": 2.078055255053478e-05, "loss": 0.4034, "step": 506 }, { "epoch": 1.619808306709265, "grad_norm": 0.6249192416083079, "learning_rate": 2.070624675494424e-05, "loss": 0.3504, "step": 507 }, { "epoch": 1.623003194888179, "grad_norm": 0.6741471873600642, "learning_rate": 2.0631931197275267e-05, "loss": 0.3197, "step": 508 }, { "epoch": 1.6261980830670928, "grad_norm": 0.6125040165749199, "learning_rate": 2.0557606904752833e-05, "loss": 0.3419, "step": 509 }, { "epoch": 1.6293929712460065, "grad_norm": 0.6665831610245562, "learning_rate": 2.0483274904722647e-05, "loss": 0.3399, "step": 510 }, { "epoch": 1.6325878594249201, "grad_norm": 0.6782509805533894, "learning_rate": 2.0408936224636958e-05, "loss": 0.384, "step": 511 }, { "epoch": 1.6357827476038338, "grad_norm": 0.7436644924935581, "learning_rate": 2.033459189204034e-05, "loss": 0.3595, "step": 512 }, { "epoch": 1.6389776357827475, "grad_norm": 0.7574508093688915, "learning_rate": 2.026024293455551e-05, "loss": 0.403, "step": 513 }, { "epoch": 1.6421725239616614, "grad_norm": 0.6776067254300645, "learning_rate": 2.0185890379869115e-05, "loss": 0.3563, "step": 514 }, { "epoch": 1.645367412140575, "grad_norm": 0.5691981666919255, "learning_rate": 2.0111535255717496e-05, "loss": 0.3613, "step": 515 }, { "epoch": 1.648562300319489, "grad_norm": 0.6501032885266717, "learning_rate": 2.0037178589872547e-05, "loss": 0.3553, "step": 516 }, { "epoch": 1.6517571884984026, "grad_norm": 0.5894185843432012, "learning_rate": 1.9962821410127456e-05, "loss": 0.3335, "step": 517 }, { "epoch": 1.6549520766773163, "grad_norm": 0.639710192390572, "learning_rate": 1.9888464744282504e-05, "loss": 0.3627, "step": 518 }, { "epoch": 1.65814696485623, "grad_norm": 0.6216821494956912, "learning_rate": 1.981410962013089e-05, "loss": 0.3344, "step": 519 }, { "epoch": 1.6613418530351438, "grad_norm": 0.6345005257067161, "learning_rate": 1.9739757065444492e-05, "loss": 0.3698, "step": 520 }, { "epoch": 1.6645367412140575, "grad_norm": 0.8101905044732475, "learning_rate": 1.9665408107959657e-05, "loss": 0.3861, "step": 521 }, { "epoch": 1.6677316293929714, "grad_norm": 0.5961761807889937, "learning_rate": 1.9591063775363045e-05, "loss": 0.3535, "step": 522 }, { "epoch": 1.670926517571885, "grad_norm": 0.7163037162115128, "learning_rate": 1.951672509527736e-05, "loss": 0.3573, "step": 523 }, { "epoch": 1.6741214057507987, "grad_norm": 0.7626145562758659, "learning_rate": 1.944239309524717e-05, "loss": 0.3943, "step": 524 }, { "epoch": 1.6773162939297124, "grad_norm": 0.6369019346205154, "learning_rate": 1.936806880272474e-05, "loss": 0.3311, "step": 525 }, { "epoch": 1.680511182108626, "grad_norm": 0.6564798194303639, "learning_rate": 1.9293753245055772e-05, "loss": 0.4014, "step": 526 }, { "epoch": 1.68370607028754, "grad_norm": 0.6312445430308768, "learning_rate": 1.9219447449465222e-05, "loss": 0.3123, "step": 527 }, { "epoch": 1.6869009584664538, "grad_norm": 0.674163910217299, "learning_rate": 1.9145152443043147e-05, "loss": 0.4069, "step": 528 }, { "epoch": 1.6900958466453675, "grad_norm": 0.5512026479165967, "learning_rate": 1.9070869252730443e-05, "loss": 0.3823, "step": 529 }, { "epoch": 1.6932907348242812, "grad_norm": 0.5459303135589316, "learning_rate": 1.899659890530468e-05, "loss": 0.3801, "step": 530 }, { "epoch": 1.6964856230031948, "grad_norm": 0.7107489428061275, "learning_rate": 1.8922342427365915e-05, "loss": 0.3743, "step": 531 }, { "epoch": 1.6996805111821085, "grad_norm": 0.6821024548908552, "learning_rate": 1.8848100845322486e-05, "loss": 0.4001, "step": 532 }, { "epoch": 1.7028753993610224, "grad_norm": 0.7194530880466413, "learning_rate": 1.8773875185376845e-05, "loss": 0.3967, "step": 533 }, { "epoch": 1.706070287539936, "grad_norm": 0.619789910277277, "learning_rate": 1.869966647351135e-05, "loss": 0.3914, "step": 534 }, { "epoch": 1.70926517571885, "grad_norm": 0.6514123622117488, "learning_rate": 1.86254757354741e-05, "loss": 0.355, "step": 535 }, { "epoch": 1.7124600638977636, "grad_norm": 0.5831800498454969, "learning_rate": 1.8551303996764755e-05, "loss": 0.3715, "step": 536 }, { "epoch": 1.7156549520766773, "grad_norm": 0.7041038447903819, "learning_rate": 1.8477152282620344e-05, "loss": 0.3452, "step": 537 }, { "epoch": 1.718849840255591, "grad_norm": 0.6489046638136863, "learning_rate": 1.8403021618001124e-05, "loss": 0.328, "step": 538 }, { "epoch": 1.7220447284345048, "grad_norm": 0.6220134083521842, "learning_rate": 1.8328913027576373e-05, "loss": 0.3644, "step": 539 }, { "epoch": 1.7252396166134185, "grad_norm": 0.691164548614104, "learning_rate": 1.825482753571028e-05, "loss": 0.386, "step": 540 }, { "epoch": 1.7284345047923324, "grad_norm": 0.6471216980585122, "learning_rate": 1.818076616644773e-05, "loss": 0.3863, "step": 541 }, { "epoch": 1.731629392971246, "grad_norm": 0.5955048340238702, "learning_rate": 1.8106729943500174e-05, "loss": 0.3813, "step": 542 }, { "epoch": 1.7348242811501597, "grad_norm": 0.5557497123983334, "learning_rate": 1.80327198902315e-05, "loss": 0.4207, "step": 543 }, { "epoch": 1.7380191693290734, "grad_norm": 0.6140635892910118, "learning_rate": 1.7958737029643835e-05, "loss": 0.352, "step": 544 }, { "epoch": 1.741214057507987, "grad_norm": 0.654044064069364, "learning_rate": 1.788478238436347e-05, "loss": 0.3887, "step": 545 }, { "epoch": 1.744408945686901, "grad_norm": 0.5807508632491186, "learning_rate": 1.781085697662667e-05, "loss": 0.3833, "step": 546 }, { "epoch": 1.7476038338658149, "grad_norm": 0.7236056668219373, "learning_rate": 1.7736961828265553e-05, "loss": 0.387, "step": 547 }, { "epoch": 1.7507987220447285, "grad_norm": 0.6286981610831269, "learning_rate": 1.7663097960694017e-05, "loss": 0.413, "step": 548 }, { "epoch": 1.7539936102236422, "grad_norm": 0.6471275268399443, "learning_rate": 1.758926639489354e-05, "loss": 0.3265, "step": 549 }, { "epoch": 1.7571884984025559, "grad_norm": 0.6634839205028399, "learning_rate": 1.7515468151399134e-05, "loss": 0.3959, "step": 550 }, { "epoch": 1.7603833865814695, "grad_norm": 0.6755904835423869, "learning_rate": 1.7441704250285212e-05, "loss": 0.3606, "step": 551 }, { "epoch": 1.7635782747603834, "grad_norm": 0.6500797325645201, "learning_rate": 1.7367975711151483e-05, "loss": 0.3876, "step": 552 }, { "epoch": 1.766773162939297, "grad_norm": 0.6477835743216911, "learning_rate": 1.729428355310886e-05, "loss": 0.3158, "step": 553 }, { "epoch": 1.769968051118211, "grad_norm": 0.6950215837197072, "learning_rate": 1.7220628794765403e-05, "loss": 0.3578, "step": 554 }, { "epoch": 1.7731629392971247, "grad_norm": 0.7161970787305121, "learning_rate": 1.7147012454212195e-05, "loss": 0.4181, "step": 555 }, { "epoch": 1.7763578274760383, "grad_norm": 0.56303267169658, "learning_rate": 1.7073435549009288e-05, "loss": 0.3609, "step": 556 }, { "epoch": 1.779552715654952, "grad_norm": 0.6914199086511422, "learning_rate": 1.699989909617166e-05, "loss": 0.3109, "step": 557 }, { "epoch": 1.7827476038338657, "grad_norm": 0.6547054751902353, "learning_rate": 1.6926404112155123e-05, "loss": 0.3595, "step": 558 }, { "epoch": 1.7859424920127795, "grad_norm": 0.7444527094326194, "learning_rate": 1.6852951612842278e-05, "loss": 0.3476, "step": 559 }, { "epoch": 1.7891373801916934, "grad_norm": 0.8274662111243524, "learning_rate": 1.677954261352851e-05, "loss": 0.3673, "step": 560 }, { "epoch": 1.792332268370607, "grad_norm": 0.7793207248626209, "learning_rate": 1.6706178128907897e-05, "loss": 0.3756, "step": 561 }, { "epoch": 1.7955271565495208, "grad_norm": 0.6411675936700109, "learning_rate": 1.6632859173059232e-05, "loss": 0.3573, "step": 562 }, { "epoch": 1.7987220447284344, "grad_norm": 0.7225443818014319, "learning_rate": 1.655958675943198e-05, "loss": 0.3443, "step": 563 }, { "epoch": 1.8019169329073481, "grad_norm": 0.7764335703579314, "learning_rate": 1.6486361900832284e-05, "loss": 0.3644, "step": 564 }, { "epoch": 1.805111821086262, "grad_norm": 0.7643738155044116, "learning_rate": 1.6413185609408946e-05, "loss": 0.3814, "step": 565 }, { "epoch": 1.8083067092651757, "grad_norm": 0.6797205069953752, "learning_rate": 1.6340058896639464e-05, "loss": 0.3431, "step": 566 }, { "epoch": 1.8115015974440896, "grad_norm": 0.6487569782925131, "learning_rate": 1.6266982773316032e-05, "loss": 0.3827, "step": 567 }, { "epoch": 1.8146964856230032, "grad_norm": 0.7645835950918118, "learning_rate": 1.6193958249531562e-05, "loss": 0.4112, "step": 568 }, { "epoch": 1.817891373801917, "grad_norm": 0.7113029044269039, "learning_rate": 1.612098633466575e-05, "loss": 0.3779, "step": 569 }, { "epoch": 1.8210862619808306, "grad_norm": 0.5948671080899526, "learning_rate": 1.6048068037371092e-05, "loss": 0.3195, "step": 570 }, { "epoch": 1.8242811501597445, "grad_norm": 0.70421201984334, "learning_rate": 1.597520436555896e-05, "loss": 0.3776, "step": 571 }, { "epoch": 1.8274760383386581, "grad_norm": 0.6726359632379854, "learning_rate": 1.590239632638568e-05, "loss": 0.4225, "step": 572 }, { "epoch": 1.830670926517572, "grad_norm": 0.681506981858163, "learning_rate": 1.582964492623857e-05, "loss": 0.3627, "step": 573 }, { "epoch": 1.8338658146964857, "grad_norm": 0.6579916459695835, "learning_rate": 1.575695117072209e-05, "loss": 0.3566, "step": 574 }, { "epoch": 1.8370607028753994, "grad_norm": 0.5767726433601224, "learning_rate": 1.568431606464388e-05, "loss": 0.3582, "step": 575 }, { "epoch": 1.840255591054313, "grad_norm": 0.6950140418517404, "learning_rate": 1.5611740612000906e-05, "loss": 0.3861, "step": 576 }, { "epoch": 1.8434504792332267, "grad_norm": 0.5791968225578055, "learning_rate": 1.5539225815965595e-05, "loss": 0.3383, "step": 577 }, { "epoch": 1.8466453674121406, "grad_norm": 0.6049662279870999, "learning_rate": 1.546677267887193e-05, "loss": 0.3719, "step": 578 }, { "epoch": 1.8498402555910545, "grad_norm": 0.6161946790988828, "learning_rate": 1.5394382202201605e-05, "loss": 0.382, "step": 579 }, { "epoch": 1.8530351437699681, "grad_norm": 0.6325260524024612, "learning_rate": 1.5322055386570225e-05, "loss": 0.3587, "step": 580 }, { "epoch": 1.8562300319488818, "grad_norm": 0.6241897380088132, "learning_rate": 1.5249793231713418e-05, "loss": 0.3386, "step": 581 }, { "epoch": 1.8594249201277955, "grad_norm": 0.565220565027428, "learning_rate": 1.5177596736473034e-05, "loss": 0.3774, "step": 582 }, { "epoch": 1.8626198083067091, "grad_norm": 0.5845662723376586, "learning_rate": 1.5105466898783379e-05, "loss": 0.3235, "step": 583 }, { "epoch": 1.865814696485623, "grad_norm": 0.6433701307468581, "learning_rate": 1.5033404715657344e-05, "loss": 0.3728, "step": 584 }, { "epoch": 1.8690095846645367, "grad_norm": 0.6619778705276104, "learning_rate": 1.4961411183172686e-05, "loss": 0.346, "step": 585 }, { "epoch": 1.8722044728434506, "grad_norm": 0.7410247957434669, "learning_rate": 1.4889487296458258e-05, "loss": 0.367, "step": 586 }, { "epoch": 1.8753993610223643, "grad_norm": 0.6248658361645802, "learning_rate": 1.4817634049680207e-05, "loss": 0.3377, "step": 587 }, { "epoch": 1.878594249201278, "grad_norm": 0.5927935921175941, "learning_rate": 1.4745852436028262e-05, "loss": 0.3355, "step": 588 }, { "epoch": 1.8817891373801916, "grad_norm": 0.6031296090039979, "learning_rate": 1.4674143447702036e-05, "loss": 0.3432, "step": 589 }, { "epoch": 1.8849840255591053, "grad_norm": 0.6978415390998038, "learning_rate": 1.4602508075897249e-05, "loss": 0.4307, "step": 590 }, { "epoch": 1.8881789137380192, "grad_norm": 0.6802206273285568, "learning_rate": 1.453094731079206e-05, "loss": 0.3337, "step": 591 }, { "epoch": 1.891373801916933, "grad_norm": 0.6075920734911536, "learning_rate": 1.4459462141533407e-05, "loss": 0.3959, "step": 592 }, { "epoch": 1.8945686900958467, "grad_norm": 0.5500279524690617, "learning_rate": 1.4388053556223274e-05, "loss": 0.3456, "step": 593 }, { "epoch": 1.8977635782747604, "grad_norm": 0.598113110586812, "learning_rate": 1.4316722541905081e-05, "loss": 0.3581, "step": 594 }, { "epoch": 1.900958466453674, "grad_norm": 0.5879717265811937, "learning_rate": 1.4245470084550026e-05, "loss": 0.3484, "step": 595 }, { "epoch": 1.9041533546325877, "grad_norm": 0.5983260797622781, "learning_rate": 1.4174297169043447e-05, "loss": 0.3968, "step": 596 }, { "epoch": 1.9073482428115016, "grad_norm": 0.5893645004999872, "learning_rate": 1.410320477917122e-05, "loss": 0.3377, "step": 597 }, { "epoch": 1.9105431309904153, "grad_norm": 0.5981250674431736, "learning_rate": 1.4032193897606164e-05, "loss": 0.3728, "step": 598 }, { "epoch": 1.9137380191693292, "grad_norm": 0.7033342157204642, "learning_rate": 1.3961265505894442e-05, "loss": 0.3986, "step": 599 }, { "epoch": 1.9169329073482428, "grad_norm": 0.6193784424182153, "learning_rate": 1.3890420584442007e-05, "loss": 0.3833, "step": 600 }, { "epoch": 1.9201277955271565, "grad_norm": 0.6378469529500934, "learning_rate": 1.3819660112501054e-05, "loss": 0.4048, "step": 601 }, { "epoch": 1.9233226837060702, "grad_norm": 0.6260896725715146, "learning_rate": 1.374898506815646e-05, "loss": 0.3259, "step": 602 }, { "epoch": 1.926517571884984, "grad_norm": 0.6830049362505771, "learning_rate": 1.3678396428312291e-05, "loss": 0.3824, "step": 603 }, { "epoch": 1.9297124600638977, "grad_norm": 0.6210066067190025, "learning_rate": 1.3607895168678296e-05, "loss": 0.3612, "step": 604 }, { "epoch": 1.9329073482428116, "grad_norm": 0.6413094034332517, "learning_rate": 1.3537482263756391e-05, "loss": 0.3548, "step": 605 }, { "epoch": 1.9361022364217253, "grad_norm": 0.620634478966929, "learning_rate": 1.3467158686827242e-05, "loss": 0.3884, "step": 606 }, { "epoch": 1.939297124600639, "grad_norm": 0.6407808790372046, "learning_rate": 1.339692540993676e-05, "loss": 0.3948, "step": 607 }, { "epoch": 1.9424920127795526, "grad_norm": 0.7276368275858982, "learning_rate": 1.332678340388268e-05, "loss": 0.347, "step": 608 }, { "epoch": 1.9456869009584663, "grad_norm": 0.6880632837814936, "learning_rate": 1.3256733638201172e-05, "loss": 0.3811, "step": 609 }, { "epoch": 1.9488817891373802, "grad_norm": 0.568414071034355, "learning_rate": 1.3186777081153398e-05, "loss": 0.3852, "step": 610 }, { "epoch": 1.952076677316294, "grad_norm": 0.6125392745773798, "learning_rate": 1.311691469971214e-05, "loss": 0.3314, "step": 611 }, { "epoch": 1.9552715654952078, "grad_norm": 0.6154906870246765, "learning_rate": 1.3047147459548469e-05, "loss": 0.3983, "step": 612 }, { "epoch": 1.9584664536741214, "grad_norm": 0.7090879134482768, "learning_rate": 1.297747632501834e-05, "loss": 0.3547, "step": 613 }, { "epoch": 1.961661341853035, "grad_norm": 0.7919803140094102, "learning_rate": 1.2907902259149287e-05, "loss": 0.3884, "step": 614 }, { "epoch": 1.9648562300319488, "grad_norm": 0.6135804624651449, "learning_rate": 1.2838426223627152e-05, "loss": 0.3292, "step": 615 }, { "epoch": 1.9680511182108626, "grad_norm": 0.627406953051245, "learning_rate": 1.2769049178782716e-05, "loss": 0.3379, "step": 616 }, { "epoch": 1.9712460063897763, "grad_norm": 0.7149906660956054, "learning_rate": 1.2699772083578472e-05, "loss": 0.3727, "step": 617 }, { "epoch": 1.9744408945686902, "grad_norm": 0.7249915937043523, "learning_rate": 1.2630595895595383e-05, "loss": 0.3424, "step": 618 }, { "epoch": 1.9776357827476039, "grad_norm": 0.7589039306193895, "learning_rate": 1.2561521571019603e-05, "loss": 0.3637, "step": 619 }, { "epoch": 1.9808306709265175, "grad_norm": 0.6176823766792565, "learning_rate": 1.249255006462928e-05, "loss": 0.3495, "step": 620 }, { "epoch": 1.9840255591054312, "grad_norm": 0.7150454220639018, "learning_rate": 1.2423682329781378e-05, "loss": 0.3628, "step": 621 }, { "epoch": 1.9872204472843449, "grad_norm": 0.6869892242638056, "learning_rate": 1.2354919318398473e-05, "loss": 0.3528, "step": 622 }, { "epoch": 1.9904153354632588, "grad_norm": 0.7412388678758558, "learning_rate": 1.2286261980955583e-05, "loss": 0.3744, "step": 623 }, { "epoch": 1.9936102236421727, "grad_norm": 0.7562821838566286, "learning_rate": 1.2217711266467092e-05, "loss": 0.3947, "step": 624 }, { "epoch": 1.9968051118210863, "grad_norm": 0.5867017695210721, "learning_rate": 1.2149268122473554e-05, "loss": 0.327, "step": 625 }, { "epoch": 2.0, "grad_norm": 0.5635062124393114, "learning_rate": 1.2080933495028648e-05, "loss": 0.3089, "step": 626 }, { "epoch": 2.0031948881789137, "grad_norm": 0.6715285391032262, "learning_rate": 1.2012708328686093e-05, "loss": 0.2396, "step": 627 }, { "epoch": 2.0063897763578273, "grad_norm": 0.5999774675489589, "learning_rate": 1.1944593566486562e-05, "loss": 0.2402, "step": 628 }, { "epoch": 2.009584664536741, "grad_norm": 0.6523644403725044, "learning_rate": 1.18765901499447e-05, "loss": 0.2595, "step": 629 }, { "epoch": 2.012779552715655, "grad_norm": 0.6688113012671898, "learning_rate": 1.1808699019036047e-05, "loss": 0.2063, "step": 630 }, { "epoch": 2.015974440894569, "grad_norm": 0.8690136893873772, "learning_rate": 1.17409211121841e-05, "loss": 0.2628, "step": 631 }, { "epoch": 2.0191693290734825, "grad_norm": 0.5849734190846696, "learning_rate": 1.1673257366247319e-05, "loss": 0.2178, "step": 632 }, { "epoch": 2.022364217252396, "grad_norm": 0.6360648521093288, "learning_rate": 1.1605708716506161e-05, "loss": 0.2374, "step": 633 }, { "epoch": 2.02555910543131, "grad_norm": 0.5809260418478515, "learning_rate": 1.1538276096650175e-05, "loss": 0.1988, "step": 634 }, { "epoch": 2.0287539936102235, "grad_norm": 0.6364023975071599, "learning_rate": 1.1470960438765108e-05, "loss": 0.253, "step": 635 }, { "epoch": 2.0319488817891376, "grad_norm": 0.608832086457546, "learning_rate": 1.1403762673319983e-05, "loss": 0.2254, "step": 636 }, { "epoch": 2.0351437699680512, "grad_norm": 0.6196453859771298, "learning_rate": 1.133668372915425e-05, "loss": 0.2477, "step": 637 }, { "epoch": 2.038338658146965, "grad_norm": 0.49320636326890754, "learning_rate": 1.1269724533464984e-05, "loss": 0.2344, "step": 638 }, { "epoch": 2.0415335463258786, "grad_norm": 0.5893706817555677, "learning_rate": 1.1202886011794023e-05, "loss": 0.284, "step": 639 }, { "epoch": 2.0447284345047922, "grad_norm": 0.5927365358551429, "learning_rate": 1.1136169088015177e-05, "loss": 0.2446, "step": 640 }, { "epoch": 2.047923322683706, "grad_norm": 0.5912266488563296, "learning_rate": 1.1069574684321505e-05, "loss": 0.249, "step": 641 }, { "epoch": 2.0511182108626196, "grad_norm": 0.5068898035040869, "learning_rate": 1.1003103721212503e-05, "loss": 0.2404, "step": 642 }, { "epoch": 2.0543130990415337, "grad_norm": 0.49830277604732454, "learning_rate": 1.0936757117481438e-05, "loss": 0.2226, "step": 643 }, { "epoch": 2.0575079872204474, "grad_norm": 0.5137466675546016, "learning_rate": 1.0870535790202606e-05, "loss": 0.2246, "step": 644 }, { "epoch": 2.060702875399361, "grad_norm": 0.5579716315671657, "learning_rate": 1.080444065471867e-05, "loss": 0.218, "step": 645 }, { "epoch": 2.0638977635782747, "grad_norm": 0.5528250590936209, "learning_rate": 1.0738472624628034e-05, "loss": 0.2485, "step": 646 }, { "epoch": 2.0670926517571884, "grad_norm": 0.5493618613823972, "learning_rate": 1.0672632611772156e-05, "loss": 0.2425, "step": 647 }, { "epoch": 2.070287539936102, "grad_norm": 0.5673228300389772, "learning_rate": 1.0606921526223016e-05, "loss": 0.2497, "step": 648 }, { "epoch": 2.073482428115016, "grad_norm": 0.5203097618726312, "learning_rate": 1.0541340276270468e-05, "loss": 0.212, "step": 649 }, { "epoch": 2.07667731629393, "grad_norm": 0.5534033378725121, "learning_rate": 1.0475889768409729e-05, "loss": 0.1971, "step": 650 }, { "epoch": 2.0798722044728435, "grad_norm": 0.5300132980491199, "learning_rate": 1.0410570907328848e-05, "loss": 0.2576, "step": 651 }, { "epoch": 2.083067092651757, "grad_norm": 0.5690239162135191, "learning_rate": 1.0345384595896161e-05, "loss": 0.2366, "step": 652 }, { "epoch": 2.086261980830671, "grad_norm": 0.5256402067518808, "learning_rate": 1.028033173514788e-05, "loss": 0.235, "step": 653 }, { "epoch": 2.0894568690095845, "grad_norm": 0.48537333431940805, "learning_rate": 1.0215413224275552e-05, "loss": 0.2154, "step": 654 }, { "epoch": 2.0926517571884986, "grad_norm": 0.5161821862609771, "learning_rate": 1.0150629960613721e-05, "loss": 0.21, "step": 655 }, { "epoch": 2.0958466453674123, "grad_norm": 0.5281317540190527, "learning_rate": 1.0085982839627445e-05, "loss": 0.1899, "step": 656 }, { "epoch": 2.099041533546326, "grad_norm": 0.4709200890585819, "learning_rate": 1.0021472754899966e-05, "loss": 0.2292, "step": 657 }, { "epoch": 2.1022364217252396, "grad_norm": 0.45322842520766593, "learning_rate": 9.957100598120357e-06, "loss": 0.2147, "step": 658 }, { "epoch": 2.1054313099041533, "grad_norm": 0.5018911122380977, "learning_rate": 9.89286725907117e-06, "loss": 0.2665, "step": 659 }, { "epoch": 2.108626198083067, "grad_norm": 0.5090958885022994, "learning_rate": 9.828773625616145e-06, "loss": 0.2071, "step": 660 }, { "epoch": 2.1118210862619806, "grad_norm": 0.5042008645673148, "learning_rate": 9.764820583687978e-06, "loss": 0.22, "step": 661 }, { "epoch": 2.1150159744408947, "grad_norm": 0.5636525223827146, "learning_rate": 9.701009017276008e-06, "loss": 0.2255, "step": 662 }, { "epoch": 2.1182108626198084, "grad_norm": 0.4968534545435798, "learning_rate": 9.637339808414042e-06, "loss": 0.2187, "step": 663 }, { "epoch": 2.121405750798722, "grad_norm": 0.536209126935731, "learning_rate": 9.573813837168166e-06, "loss": 0.2212, "step": 664 }, { "epoch": 2.1246006389776357, "grad_norm": 0.477893192356411, "learning_rate": 9.510431981624554e-06, "loss": 0.2084, "step": 665 }, { "epoch": 2.1277955271565494, "grad_norm": 0.47741193568130846, "learning_rate": 9.447195117877343e-06, "loss": 0.216, "step": 666 }, { "epoch": 2.130990415335463, "grad_norm": 0.5101367545264079, "learning_rate": 9.384104120016542e-06, "loss": 0.2634, "step": 667 }, { "epoch": 2.134185303514377, "grad_norm": 0.49187172517640576, "learning_rate": 9.321159860115909e-06, "loss": 0.224, "step": 668 }, { "epoch": 2.137380191693291, "grad_norm": 0.5056192356778784, "learning_rate": 9.258363208220929e-06, "loss": 0.2228, "step": 669 }, { "epoch": 2.1405750798722045, "grad_norm": 0.5445137543547477, "learning_rate": 9.195715032336794e-06, "loss": 0.2252, "step": 670 }, { "epoch": 2.143769968051118, "grad_norm": 0.5060926192022697, "learning_rate": 9.13321619841637e-06, "loss": 0.2774, "step": 671 }, { "epoch": 2.146964856230032, "grad_norm": 0.47010736862307156, "learning_rate": 9.070867570348247e-06, "loss": 0.22, "step": 672 }, { "epoch": 2.1501597444089455, "grad_norm": 0.5094958330325633, "learning_rate": 9.00867000994482e-06, "loss": 0.2256, "step": 673 }, { "epoch": 2.1533546325878596, "grad_norm": 0.5259589614128072, "learning_rate": 8.946624376930333e-06, "loss": 0.2755, "step": 674 }, { "epoch": 2.1565495207667733, "grad_norm": 0.5139540383730451, "learning_rate": 8.884731528929019e-06, "loss": 0.2659, "step": 675 }, { "epoch": 2.159744408945687, "grad_norm": 0.4832097809114296, "learning_rate": 8.822992321453264e-06, "loss": 0.215, "step": 676 }, { "epoch": 2.1629392971246006, "grad_norm": 0.44217298247605424, "learning_rate": 8.76140760789174e-06, "loss": 0.2212, "step": 677 }, { "epoch": 2.1661341853035143, "grad_norm": 0.5003409469817744, "learning_rate": 8.69997823949763e-06, "loss": 0.1728, "step": 678 }, { "epoch": 2.169329073482428, "grad_norm": 0.4819137577229696, "learning_rate": 8.638705065376887e-06, "loss": 0.2091, "step": 679 }, { "epoch": 2.1725239616613417, "grad_norm": 0.48347484950099356, "learning_rate": 8.577588932476448e-06, "loss": 0.208, "step": 680 }, { "epoch": 2.1757188498402558, "grad_norm": 0.48841924781203694, "learning_rate": 8.516630685572553e-06, "loss": 0.2097, "step": 681 }, { "epoch": 2.1789137380191694, "grad_norm": 0.5540389648785453, "learning_rate": 8.455831167259086e-06, "loss": 0.2343, "step": 682 }, { "epoch": 2.182108626198083, "grad_norm": 0.45425575230845544, "learning_rate": 8.395191217935883e-06, "loss": 0.2278, "step": 683 }, { "epoch": 2.1853035143769968, "grad_norm": 0.4779637593364659, "learning_rate": 8.33471167579717e-06, "loss": 0.2637, "step": 684 }, { "epoch": 2.1884984025559104, "grad_norm": 0.5285221690710397, "learning_rate": 8.274393376819924e-06, "loss": 0.2347, "step": 685 }, { "epoch": 2.191693290734824, "grad_norm": 0.473579878257591, "learning_rate": 8.214237154752345e-06, "loss": 0.2159, "step": 686 }, { "epoch": 2.194888178913738, "grad_norm": 0.501799232462877, "learning_rate": 8.154243841102351e-06, "loss": 0.2476, "step": 687 }, { "epoch": 2.198083067092652, "grad_norm": 0.4627453575153537, "learning_rate": 8.09441426512604e-06, "loss": 0.2466, "step": 688 }, { "epoch": 2.2012779552715656, "grad_norm": 0.5185845450190892, "learning_rate": 8.03474925381625e-06, "loss": 0.2418, "step": 689 }, { "epoch": 2.2044728434504792, "grad_norm": 0.49421035355847515, "learning_rate": 7.97524963189115e-06, "loss": 0.2903, "step": 690 }, { "epoch": 2.207667731629393, "grad_norm": 0.4884543130989985, "learning_rate": 7.91591622178279e-06, "loss": 0.2604, "step": 691 }, { "epoch": 2.2108626198083066, "grad_norm": 0.421393965123483, "learning_rate": 7.856749843625777e-06, "loss": 0.2027, "step": 692 }, { "epoch": 2.2140575079872207, "grad_norm": 0.49505325003001727, "learning_rate": 7.797751315245927e-06, "loss": 0.2265, "step": 693 }, { "epoch": 2.2172523961661343, "grad_norm": 0.5048775790142346, "learning_rate": 7.738921452148949e-06, "loss": 0.2624, "step": 694 }, { "epoch": 2.220447284345048, "grad_norm": 0.48026767805561044, "learning_rate": 7.68026106750917e-06, "loss": 0.267, "step": 695 }, { "epoch": 2.2236421725239617, "grad_norm": 0.5099721248485902, "learning_rate": 7.621770972158331e-06, "loss": 0.2275, "step": 696 }, { "epoch": 2.2268370607028753, "grad_norm": 0.478130569524219, "learning_rate": 7.563451974574332e-06, "loss": 0.2135, "step": 697 }, { "epoch": 2.230031948881789, "grad_norm": 0.4333719232122216, "learning_rate": 7.5053048808700814e-06, "loss": 0.2028, "step": 698 }, { "epoch": 2.2332268370607027, "grad_norm": 0.4907979978569055, "learning_rate": 7.447330494782363e-06, "loss": 0.2448, "step": 699 }, { "epoch": 2.236421725239617, "grad_norm": 0.6087204868248344, "learning_rate": 7.389529617660705e-06, "loss": 0.2514, "step": 700 }, { "epoch": 2.2396166134185305, "grad_norm": 0.4779517559418497, "learning_rate": 7.331903048456299e-06, "loss": 0.1698, "step": 701 }, { "epoch": 2.242811501597444, "grad_norm": 0.48682990012078686, "learning_rate": 7.274451583711e-06, "loss": 0.2456, "step": 702 }, { "epoch": 2.246006389776358, "grad_norm": 0.5696452459133338, "learning_rate": 7.217176017546263e-06, "loss": 0.23, "step": 703 }, { "epoch": 2.2492012779552715, "grad_norm": 0.47827230908811474, "learning_rate": 7.160077141652186e-06, "loss": 0.2121, "step": 704 }, { "epoch": 2.252396166134185, "grad_norm": 0.49227009280747874, "learning_rate": 7.1031557452765934e-06, "loss": 0.2662, "step": 705 }, { "epoch": 2.255591054313099, "grad_norm": 0.5350393375681801, "learning_rate": 7.046412615214075e-06, "loss": 0.2493, "step": 706 }, { "epoch": 2.258785942492013, "grad_norm": 0.4684828298726711, "learning_rate": 6.98984853579517e-06, "loss": 0.1838, "step": 707 }, { "epoch": 2.2619808306709266, "grad_norm": 0.4784754257674382, "learning_rate": 6.933464288875467e-06, "loss": 0.1974, "step": 708 }, { "epoch": 2.2651757188498403, "grad_norm": 0.4997974355017908, "learning_rate": 6.8772606538248285e-06, "loss": 0.2016, "step": 709 }, { "epoch": 2.268370607028754, "grad_norm": 0.47011903064932126, "learning_rate": 6.821238407516635e-06, "loss": 0.2346, "step": 710 }, { "epoch": 2.2715654952076676, "grad_norm": 0.45084286894131964, "learning_rate": 6.765398324316996e-06, "loss": 0.2261, "step": 711 }, { "epoch": 2.2747603833865817, "grad_norm": 0.4586308317736137, "learning_rate": 6.7097411760741075e-06, "loss": 0.2141, "step": 712 }, { "epoch": 2.2779552715654954, "grad_norm": 0.452772588205902, "learning_rate": 6.654267732107516e-06, "loss": 0.1987, "step": 713 }, { "epoch": 2.281150159744409, "grad_norm": 0.5116763949149057, "learning_rate": 6.598978759197554e-06, "loss": 0.2392, "step": 714 }, { "epoch": 2.2843450479233227, "grad_norm": 0.498901942763996, "learning_rate": 6.543875021574686e-06, "loss": 0.1921, "step": 715 }, { "epoch": 2.2875399361022364, "grad_norm": 0.49554234705109035, "learning_rate": 6.4889572809089655e-06, "loss": 0.2205, "step": 716 }, { "epoch": 2.29073482428115, "grad_norm": 0.4845867709952482, "learning_rate": 6.43422629629953e-06, "loss": 0.2497, "step": 717 }, { "epoch": 2.2939297124600637, "grad_norm": 0.47026482069183134, "learning_rate": 6.379682824264055e-06, "loss": 0.2295, "step": 718 }, { "epoch": 2.297124600638978, "grad_norm": 0.4649585300155169, "learning_rate": 6.325327618728356e-06, "loss": 0.1516, "step": 719 }, { "epoch": 2.3003194888178915, "grad_norm": 0.40206850228054203, "learning_rate": 6.271161431015922e-06, "loss": 0.2417, "step": 720 }, { "epoch": 2.303514376996805, "grad_norm": 0.430228031696727, "learning_rate": 6.2171850098375475e-06, "loss": 0.2368, "step": 721 }, { "epoch": 2.306709265175719, "grad_norm": 0.42538480611909024, "learning_rate": 6.163399101281e-06, "loss": 0.1991, "step": 722 }, { "epoch": 2.3099041533546325, "grad_norm": 0.5044088982415396, "learning_rate": 6.1098044488006735e-06, "loss": 0.1973, "step": 723 }, { "epoch": 2.313099041533546, "grad_norm": 0.4668130698688352, "learning_rate": 6.056401793207329e-06, "loss": 0.2262, "step": 724 }, { "epoch": 2.31629392971246, "grad_norm": 0.5258923718903504, "learning_rate": 6.003191872657878e-06, "loss": 0.2634, "step": 725 }, { "epoch": 2.319488817891374, "grad_norm": 0.47646849150208387, "learning_rate": 5.950175422645134e-06, "loss": 0.2301, "step": 726 }, { "epoch": 2.3226837060702876, "grad_norm": 0.4597740271176972, "learning_rate": 5.897353175987668e-06, "loss": 0.2313, "step": 727 }, { "epoch": 2.3258785942492013, "grad_norm": 0.4428572782581877, "learning_rate": 5.844725862819703e-06, "loss": 0.2174, "step": 728 }, { "epoch": 2.329073482428115, "grad_norm": 0.5173021855990265, "learning_rate": 5.792294210580971e-06, "loss": 0.2068, "step": 729 }, { "epoch": 2.3322683706070286, "grad_norm": 0.4883764779635357, "learning_rate": 5.740058944006697e-06, "loss": 0.194, "step": 730 }, { "epoch": 2.3354632587859427, "grad_norm": 0.5032508220310962, "learning_rate": 5.688020785117581e-06, "loss": 0.2653, "step": 731 }, { "epoch": 2.3386581469648564, "grad_norm": 0.5190580775432023, "learning_rate": 5.636180453209789e-06, "loss": 0.2323, "step": 732 }, { "epoch": 2.34185303514377, "grad_norm": 0.4545099042298369, "learning_rate": 5.584538664845034e-06, "loss": 0.2282, "step": 733 }, { "epoch": 2.3450479233226837, "grad_norm": 0.46066006930429154, "learning_rate": 5.533096133840677e-06, "loss": 0.2454, "step": 734 }, { "epoch": 2.3482428115015974, "grad_norm": 0.49707563335332516, "learning_rate": 5.48185357125983e-06, "loss": 0.2457, "step": 735 }, { "epoch": 2.351437699680511, "grad_norm": 0.4739619495394498, "learning_rate": 5.4308116854015644e-06, "loss": 0.2192, "step": 736 }, { "epoch": 2.3546325878594248, "grad_norm": 0.46153270987931605, "learning_rate": 5.379971181791093e-06, "loss": 0.2727, "step": 737 }, { "epoch": 2.357827476038339, "grad_norm": 0.44872264474740164, "learning_rate": 5.3293327631700185e-06, "loss": 0.2112, "step": 738 }, { "epoch": 2.3610223642172525, "grad_norm": 0.46169938248569553, "learning_rate": 5.278897129486656e-06, "loss": 0.2021, "step": 739 }, { "epoch": 2.364217252396166, "grad_norm": 0.4614793999691915, "learning_rate": 5.228664977886304e-06, "loss": 0.201, "step": 740 }, { "epoch": 2.36741214057508, "grad_norm": 0.48601214815606647, "learning_rate": 5.178637002701639e-06, "loss": 0.2016, "step": 741 }, { "epoch": 2.3706070287539935, "grad_norm": 0.4448925101508038, "learning_rate": 5.128813895443132e-06, "loss": 0.2411, "step": 742 }, { "epoch": 2.373801916932907, "grad_norm": 0.45678562943215706, "learning_rate": 5.079196344789454e-06, "loss": 0.2397, "step": 743 }, { "epoch": 2.376996805111821, "grad_norm": 0.47737064374073584, "learning_rate": 5.029785036577976e-06, "loss": 0.2168, "step": 744 }, { "epoch": 2.380191693290735, "grad_norm": 0.4330785555805196, "learning_rate": 4.980580653795306e-06, "loss": 0.2206, "step": 745 }, { "epoch": 2.3833865814696487, "grad_norm": 0.5282508944251668, "learning_rate": 4.931583876567807e-06, "loss": 0.2209, "step": 746 }, { "epoch": 2.3865814696485623, "grad_norm": 0.48191470832355815, "learning_rate": 4.882795382152223e-06, "loss": 0.2604, "step": 747 }, { "epoch": 2.389776357827476, "grad_norm": 0.45329891295506547, "learning_rate": 4.834215844926338e-06, "loss": 0.2662, "step": 748 }, { "epoch": 2.3929712460063897, "grad_norm": 0.5020462913719165, "learning_rate": 4.785845936379601e-06, "loss": 0.1824, "step": 749 }, { "epoch": 2.3961661341853033, "grad_norm": 0.5125956468296781, "learning_rate": 4.737686325103883e-06, "loss": 0.2388, "step": 750 }, { "epoch": 2.3993610223642174, "grad_norm": 0.4610736608378653, "learning_rate": 4.6897376767842365e-06, "loss": 0.2184, "step": 751 }, { "epoch": 2.402555910543131, "grad_norm": 0.4215976064237638, "learning_rate": 4.642000654189673e-06, "loss": 0.2239, "step": 752 }, { "epoch": 2.405750798722045, "grad_norm": 0.4605285704227125, "learning_rate": 4.59447591716401e-06, "loss": 0.2298, "step": 753 }, { "epoch": 2.4089456869009584, "grad_norm": 0.4485924651231439, "learning_rate": 4.547164122616767e-06, "loss": 0.2197, "step": 754 }, { "epoch": 2.412140575079872, "grad_norm": 0.4611809765297732, "learning_rate": 4.500065924514059e-06, "loss": 0.2405, "step": 755 }, { "epoch": 2.415335463258786, "grad_norm": 0.44198322300277526, "learning_rate": 4.453181973869565e-06, "loss": 0.261, "step": 756 }, { "epoch": 2.4185303514377, "grad_norm": 0.4422002397029462, "learning_rate": 4.406512918735555e-06, "loss": 0.2086, "step": 757 }, { "epoch": 2.4217252396166136, "grad_norm": 0.44126751575207934, "learning_rate": 4.360059404193892e-06, "loss": 0.238, "step": 758 }, { "epoch": 2.4249201277955272, "grad_norm": 0.4639687021386926, "learning_rate": 4.313822072347136e-06, "loss": 0.215, "step": 759 }, { "epoch": 2.428115015974441, "grad_norm": 0.44442484361823825, "learning_rate": 4.267801562309679e-06, "loss": 0.1696, "step": 760 }, { "epoch": 2.4313099041533546, "grad_norm": 0.45364102082678376, "learning_rate": 4.221998510198888e-06, "loss": 0.2313, "step": 761 }, { "epoch": 2.4345047923322682, "grad_norm": 0.5633393868088448, "learning_rate": 4.176413549126322e-06, "loss": 0.1996, "step": 762 }, { "epoch": 2.437699680511182, "grad_norm": 0.4842473998169101, "learning_rate": 4.131047309188994e-06, "loss": 0.2374, "step": 763 }, { "epoch": 2.440894568690096, "grad_norm": 0.49271155780265713, "learning_rate": 4.085900417460633e-06, "loss": 0.2428, "step": 764 }, { "epoch": 2.4440894568690097, "grad_norm": 0.4474969801362191, "learning_rate": 4.040973497983052e-06, "loss": 0.2149, "step": 765 }, { "epoch": 2.4472843450479234, "grad_norm": 0.49449842276919154, "learning_rate": 3.996267171757486e-06, "loss": 0.2221, "step": 766 }, { "epoch": 2.450479233226837, "grad_norm": 0.4045769885160123, "learning_rate": 3.951782056736027e-06, "loss": 0.2088, "step": 767 }, { "epoch": 2.4536741214057507, "grad_norm": 0.420197638385653, "learning_rate": 3.907518767813097e-06, "loss": 0.2508, "step": 768 }, { "epoch": 2.4568690095846644, "grad_norm": 0.4712286531119528, "learning_rate": 3.863477916816914e-06, "loss": 0.2012, "step": 769 }, { "epoch": 2.460063897763578, "grad_norm": 0.4427542368579534, "learning_rate": 3.819660112501053e-06, "loss": 0.2397, "step": 770 }, { "epoch": 2.463258785942492, "grad_norm": 0.49326258562927866, "learning_rate": 3.7760659605360506e-06, "loss": 0.2334, "step": 771 }, { "epoch": 2.466453674121406, "grad_norm": 0.4601236932926047, "learning_rate": 3.732696063500998e-06, "loss": 0.2034, "step": 772 }, { "epoch": 2.4696485623003195, "grad_norm": 0.4472250977974353, "learning_rate": 3.689551020875226e-06, "loss": 0.2271, "step": 773 }, { "epoch": 2.472843450479233, "grad_norm": 0.46936589235681647, "learning_rate": 3.6466314290300366e-06, "loss": 0.1636, "step": 774 }, { "epoch": 2.476038338658147, "grad_norm": 0.4357541144936099, "learning_rate": 3.603937881220425e-06, "loss": 0.2292, "step": 775 }, { "epoch": 2.479233226837061, "grad_norm": 0.6172961003964446, "learning_rate": 3.5614709675769166e-06, "loss": 0.2371, "step": 776 }, { "epoch": 2.4824281150159746, "grad_norm": 0.42287295079164994, "learning_rate": 3.519231275097372e-06, "loss": 0.192, "step": 777 }, { "epoch": 2.4856230031948883, "grad_norm": 0.44907721421445357, "learning_rate": 3.477219387638917e-06, "loss": 0.275, "step": 778 }, { "epoch": 2.488817891373802, "grad_norm": 0.5148479668458735, "learning_rate": 3.435435885909828e-06, "loss": 0.2505, "step": 779 }, { "epoch": 2.4920127795527156, "grad_norm": 0.4708481234430973, "learning_rate": 3.393881347461525e-06, "loss": 0.2337, "step": 780 }, { "epoch": 2.4952076677316293, "grad_norm": 0.4652941463915953, "learning_rate": 3.3525563466806068e-06, "loss": 0.2068, "step": 781 }, { "epoch": 2.498402555910543, "grad_norm": 0.45937429044470846, "learning_rate": 3.311461454780871e-06, "loss": 0.2616, "step": 782 }, { "epoch": 2.501597444089457, "grad_norm": 0.4540154081882579, "learning_rate": 3.2705972397954655e-06, "loss": 0.2004, "step": 783 }, { "epoch": 2.5047923322683707, "grad_norm": 0.44687303635857867, "learning_rate": 3.22996426656899e-06, "loss": 0.2137, "step": 784 }, { "epoch": 2.5079872204472844, "grad_norm": 0.42549399369528, "learning_rate": 3.1895630967497147e-06, "loss": 0.237, "step": 785 }, { "epoch": 2.511182108626198, "grad_norm": 0.4686919863303311, "learning_rate": 3.1493942887818287e-06, "loss": 0.1818, "step": 786 }, { "epoch": 2.5143769968051117, "grad_norm": 0.43180677830115527, "learning_rate": 3.1094583978976887e-06, "loss": 0.2135, "step": 787 }, { "epoch": 2.5175718849840254, "grad_norm": 0.41051305687717876, "learning_rate": 3.0697559761101623e-06, "loss": 0.2362, "step": 788 }, { "epoch": 2.520766773162939, "grad_norm": 0.4339764538651385, "learning_rate": 3.0302875722050064e-06, "loss": 0.1871, "step": 789 }, { "epoch": 2.523961661341853, "grad_norm": 0.44537108832627426, "learning_rate": 2.99105373173326e-06, "loss": 0.2317, "step": 790 }, { "epoch": 2.527156549520767, "grad_norm": 0.48453456082844887, "learning_rate": 2.9520549970037238e-06, "loss": 0.1899, "step": 791 }, { "epoch": 2.5303514376996805, "grad_norm": 0.4280884923970404, "learning_rate": 2.913291907075451e-06, "loss": 0.2116, "step": 792 }, { "epoch": 2.533546325878594, "grad_norm": 0.4503925387506874, "learning_rate": 2.8747649977502945e-06, "loss": 0.266, "step": 793 }, { "epoch": 2.536741214057508, "grad_norm": 0.44028462915181193, "learning_rate": 2.836474801565521e-06, "loss": 0.216, "step": 794 }, { "epoch": 2.539936102236422, "grad_norm": 0.4484069903506647, "learning_rate": 2.7984218477864213e-06, "loss": 0.2081, "step": 795 }, { "epoch": 2.543130990415335, "grad_norm": 0.4412193065052018, "learning_rate": 2.7606066623990145e-06, "loss": 0.2943, "step": 796 }, { "epoch": 2.5463258785942493, "grad_norm": 0.4281448882843457, "learning_rate": 2.723029768102776e-06, "loss": 0.1912, "step": 797 }, { "epoch": 2.549520766773163, "grad_norm": 0.48266966006465656, "learning_rate": 2.6856916843034062e-06, "loss": 0.2363, "step": 798 }, { "epoch": 2.5527156549520766, "grad_norm": 0.4313024255011029, "learning_rate": 2.648592927105642e-06, "loss": 0.2356, "step": 799 }, { "epoch": 2.5559105431309903, "grad_norm": 0.42876561566865096, "learning_rate": 2.611734009306155e-06, "loss": 0.2222, "step": 800 }, { "epoch": 2.559105431309904, "grad_norm": 0.39838879945078226, "learning_rate": 2.5751154403864264e-06, "loss": 0.2119, "step": 801 }, { "epoch": 2.562300319488818, "grad_norm": 0.4755341593418832, "learning_rate": 2.5387377265057246e-06, "loss": 0.235, "step": 802 }, { "epoch": 2.5654952076677318, "grad_norm": 0.45820320153330835, "learning_rate": 2.502601370494111e-06, "loss": 0.2016, "step": 803 }, { "epoch": 2.5686900958466454, "grad_norm": 0.49185493187997026, "learning_rate": 2.4667068718454766e-06, "loss": 0.2038, "step": 804 }, { "epoch": 2.571884984025559, "grad_norm": 0.4346051571719638, "learning_rate": 2.4310547267106443e-06, "loss": 0.168, "step": 805 }, { "epoch": 2.5750798722044728, "grad_norm": 0.47891912799119724, "learning_rate": 2.395645427890525e-06, "loss": 0.192, "step": 806 }, { "epoch": 2.5782747603833864, "grad_norm": 0.4957624567636564, "learning_rate": 2.360479464829275e-06, "loss": 0.2015, "step": 807 }, { "epoch": 2.5814696485623, "grad_norm": 0.43031345754147116, "learning_rate": 2.3255573236075523e-06, "loss": 0.2332, "step": 808 }, { "epoch": 2.584664536741214, "grad_norm": 0.4493876332117976, "learning_rate": 2.2908794869358044e-06, "loss": 0.1779, "step": 809 }, { "epoch": 2.587859424920128, "grad_norm": 0.48046582003978555, "learning_rate": 2.2564464341475724e-06, "loss": 0.2085, "step": 810 }, { "epoch": 2.5910543130990416, "grad_norm": 0.4610477157344099, "learning_rate": 2.2222586411928826e-06, "loss": 0.2733, "step": 811 }, { "epoch": 2.594249201277955, "grad_norm": 0.42491586645922075, "learning_rate": 2.1883165806316688e-06, "loss": 0.2045, "step": 812 }, { "epoch": 2.597444089456869, "grad_norm": 0.4558620955090394, "learning_rate": 2.154620721627225e-06, "loss": 0.2348, "step": 813 }, { "epoch": 2.600638977635783, "grad_norm": 0.4177140807716825, "learning_rate": 2.121171529939734e-06, "loss": 0.2154, "step": 814 }, { "epoch": 2.6038338658146962, "grad_norm": 0.44671878373727514, "learning_rate": 2.0879694679198346e-06, "loss": 0.2339, "step": 815 }, { "epoch": 2.6070287539936103, "grad_norm": 0.5421104481464741, "learning_rate": 2.055014994502207e-06, "loss": 0.2628, "step": 816 }, { "epoch": 2.610223642172524, "grad_norm": 0.4183342876241132, "learning_rate": 2.022308565199249e-06, "loss": 0.2308, "step": 817 }, { "epoch": 2.6134185303514377, "grad_norm": 0.4076706224784102, "learning_rate": 1.989850632094783e-06, "loss": 0.2697, "step": 818 }, { "epoch": 2.6166134185303513, "grad_norm": 0.4327417815919417, "learning_rate": 1.9576416438377864e-06, "loss": 0.2204, "step": 819 }, { "epoch": 2.619808306709265, "grad_norm": 0.4465688214774834, "learning_rate": 1.925682045636217e-06, "loss": 0.2326, "step": 820 }, { "epoch": 2.623003194888179, "grad_norm": 0.4574212377076256, "learning_rate": 1.8939722792508307e-06, "loss": 0.2263, "step": 821 }, { "epoch": 2.626198083067093, "grad_norm": 0.4455916657403554, "learning_rate": 1.8625127829890922e-06, "loss": 0.2387, "step": 822 }, { "epoch": 2.6293929712460065, "grad_norm": 0.436528550029091, "learning_rate": 1.8313039916991204e-06, "loss": 0.2384, "step": 823 }, { "epoch": 2.63258785942492, "grad_norm": 0.44219679143144724, "learning_rate": 1.8003463367636676e-06, "loss": 0.2269, "step": 824 }, { "epoch": 2.635782747603834, "grad_norm": 0.44101913923028035, "learning_rate": 1.7696402460941554e-06, "loss": 0.2712, "step": 825 }, { "epoch": 2.6389776357827475, "grad_norm": 0.43323308580675207, "learning_rate": 1.7391861441247715e-06, "loss": 0.2645, "step": 826 }, { "epoch": 2.642172523961661, "grad_norm": 0.451593663495069, "learning_rate": 1.7089844518065902e-06, "loss": 0.2218, "step": 827 }, { "epoch": 2.6453674121405752, "grad_norm": 0.42165221052628815, "learning_rate": 1.6790355866017604e-06, "loss": 0.2272, "step": 828 }, { "epoch": 2.648562300319489, "grad_norm": 0.45660435100783486, "learning_rate": 1.6493399624777428e-06, "loss": 0.2322, "step": 829 }, { "epoch": 2.6517571884984026, "grad_norm": 0.46289265194890444, "learning_rate": 1.6198979899015687e-06, "loss": 0.2469, "step": 830 }, { "epoch": 2.6549520766773163, "grad_norm": 0.42669559181390876, "learning_rate": 1.5907100758341787e-06, "loss": 0.1998, "step": 831 }, { "epoch": 2.65814696485623, "grad_norm": 0.4190815694114573, "learning_rate": 1.5617766237248023e-06, "loss": 0.2103, "step": 832 }, { "epoch": 2.661341853035144, "grad_norm": 0.41531478233240604, "learning_rate": 1.5330980335053714e-06, "loss": 0.2039, "step": 833 }, { "epoch": 2.6645367412140573, "grad_norm": 0.4357539666161216, "learning_rate": 1.5046747015849893e-06, "loss": 0.2375, "step": 834 }, { "epoch": 2.6677316293929714, "grad_norm": 0.44449107969992363, "learning_rate": 1.4765070208444732e-06, "loss": 0.2807, "step": 835 }, { "epoch": 2.670926517571885, "grad_norm": 0.43149390691820894, "learning_rate": 1.4485953806308883e-06, "loss": 0.2307, "step": 836 }, { "epoch": 2.6741214057507987, "grad_norm": 0.43315728774154344, "learning_rate": 1.4209401667522028e-06, "loss": 0.2276, "step": 837 }, { "epoch": 2.6773162939297124, "grad_norm": 0.4499922141030728, "learning_rate": 1.3935417614719327e-06, "loss": 0.2079, "step": 838 }, { "epoch": 2.680511182108626, "grad_norm": 0.49094900978907974, "learning_rate": 1.366400543503854e-06, "loss": 0.1824, "step": 839 }, { "epoch": 2.68370607028754, "grad_norm": 0.47322551913300975, "learning_rate": 1.3395168880067978e-06, "loss": 0.2501, "step": 840 }, { "epoch": 2.686900958466454, "grad_norm": 0.43241827743072775, "learning_rate": 1.3128911665794198e-06, "loss": 0.2489, "step": 841 }, { "epoch": 2.6900958466453675, "grad_norm": 0.5782151533369344, "learning_rate": 1.2865237472551106e-06, "loss": 0.2477, "step": 842 }, { "epoch": 2.693290734824281, "grad_norm": 0.4494764243991699, "learning_rate": 1.2604149944968725e-06, "loss": 0.2111, "step": 843 }, { "epoch": 2.696485623003195, "grad_norm": 0.45156798802844117, "learning_rate": 1.234565269192296e-06, "loss": 0.176, "step": 844 }, { "epoch": 2.6996805111821085, "grad_norm": 0.4390858768011781, "learning_rate": 1.2089749286485808e-06, "loss": 0.2475, "step": 845 }, { "epoch": 2.702875399361022, "grad_norm": 0.4190824848284048, "learning_rate": 1.183644326587574e-06, "loss": 0.2275, "step": 846 }, { "epoch": 2.7060702875399363, "grad_norm": 0.4919591533349322, "learning_rate": 1.1585738131409107e-06, "loss": 0.2096, "step": 847 }, { "epoch": 2.70926517571885, "grad_norm": 0.4983722601382295, "learning_rate": 1.1337637348451369e-06, "loss": 0.2353, "step": 848 }, { "epoch": 2.7124600638977636, "grad_norm": 0.394892627520411, "learning_rate": 1.1092144346369581e-06, "loss": 0.2215, "step": 849 }, { "epoch": 2.7156549520766773, "grad_norm": 0.4103602053022109, "learning_rate": 1.0849262518484704e-06, "loss": 0.195, "step": 850 }, { "epoch": 2.718849840255591, "grad_norm": 0.4478481118516963, "learning_rate": 1.060899522202483e-06, "loss": 0.2243, "step": 851 }, { "epoch": 2.722044728434505, "grad_norm": 0.44779585136229183, "learning_rate": 1.037134577807879e-06, "loss": 0.1981, "step": 852 }, { "epoch": 2.7252396166134183, "grad_norm": 0.4309440677606521, "learning_rate": 1.0136317471550195e-06, "loss": 0.2119, "step": 853 }, { "epoch": 2.7284345047923324, "grad_norm": 0.44538470312104783, "learning_rate": 9.903913551112e-07, "loss": 0.2343, "step": 854 }, { "epoch": 2.731629392971246, "grad_norm": 0.46088845916460575, "learning_rate": 9.67413722916175e-07, "loss": 0.2384, "step": 855 }, { "epoch": 2.7348242811501597, "grad_norm": 0.4435462990703814, "learning_rate": 9.446991681776985e-07, "loss": 0.2338, "step": 856 }, { "epoch": 2.7380191693290734, "grad_norm": 0.4083232549654808, "learning_rate": 9.222480048671412e-07, "loss": 0.2039, "step": 857 }, { "epoch": 2.741214057507987, "grad_norm": 0.4751109728713644, "learning_rate": 9.000605433151643e-07, "loss": 0.2202, "step": 858 }, { "epoch": 2.744408945686901, "grad_norm": 0.42315898167185495, "learning_rate": 8.781370902074049e-07, "loss": 0.2429, "step": 859 }, { "epoch": 2.747603833865815, "grad_norm": 0.4389817241489918, "learning_rate": 8.564779485802566e-07, "loss": 0.2523, "step": 860 }, { "epoch": 2.7507987220447285, "grad_norm": 0.40824675808125727, "learning_rate": 8.350834178166755e-07, "loss": 0.2317, "step": 861 }, { "epoch": 2.753993610223642, "grad_norm": 0.44644266395927934, "learning_rate": 8.139537936420372e-07, "loss": 0.2198, "step": 862 }, { "epoch": 2.757188498402556, "grad_norm": 0.4364831760939215, "learning_rate": 7.93089368120048e-07, "loss": 0.2424, "step": 863 }, { "epoch": 2.7603833865814695, "grad_norm": 0.47326054771389603, "learning_rate": 7.724904296487246e-07, "loss": 0.2386, "step": 864 }, { "epoch": 2.763578274760383, "grad_norm": 0.4571089142274389, "learning_rate": 7.521572629563834e-07, "loss": 0.1619, "step": 865 }, { "epoch": 2.7667731629392973, "grad_norm": 0.4221090741397694, "learning_rate": 7.320901490977217e-07, "loss": 0.2486, "step": 866 }, { "epoch": 2.769968051118211, "grad_norm": 0.44759152235134214, "learning_rate": 7.122893654499318e-07, "loss": 0.2376, "step": 867 }, { "epoch": 2.7731629392971247, "grad_norm": 0.4559649616233877, "learning_rate": 6.927551857088576e-07, "loss": 0.2216, "step": 868 }, { "epoch": 2.7763578274760383, "grad_norm": 0.3938914848222988, "learning_rate": 6.734878798852174e-07, "loss": 0.2331, "step": 869 }, { "epoch": 2.779552715654952, "grad_norm": 0.44967947965212385, "learning_rate": 6.544877143008777e-07, "loss": 0.2303, "step": 870 }, { "epoch": 2.7827476038338657, "grad_norm": 0.4204688969968103, "learning_rate": 6.357549515851525e-07, "loss": 0.2497, "step": 871 }, { "epoch": 2.7859424920127793, "grad_norm": 0.42708869477683215, "learning_rate": 6.172898506712033e-07, "loss": 0.2502, "step": 872 }, { "epoch": 2.7891373801916934, "grad_norm": 0.4289703544035532, "learning_rate": 5.990926667924313e-07, "loss": 0.2637, "step": 873 }, { "epoch": 2.792332268370607, "grad_norm": 0.43987128340382603, "learning_rate": 5.811636514789598e-07, "loss": 0.193, "step": 874 }, { "epoch": 2.7955271565495208, "grad_norm": 0.5004291788721248, "learning_rate": 5.635030525541685e-07, "loss": 0.2105, "step": 875 }, { "epoch": 2.7987220447284344, "grad_norm": 0.44122554674258213, "learning_rate": 5.461111141312492e-07, "loss": 0.1874, "step": 876 }, { "epoch": 2.801916932907348, "grad_norm": 0.4230374283981073, "learning_rate": 5.289880766098421e-07, "loss": 0.2113, "step": 877 }, { "epoch": 2.8051118210862622, "grad_norm": 0.4219143270150449, "learning_rate": 5.121341766727184e-07, "loss": 0.1856, "step": 878 }, { "epoch": 2.8083067092651754, "grad_norm": 0.4280311905897577, "learning_rate": 4.955496472824939e-07, "loss": 0.2479, "step": 879 }, { "epoch": 2.8115015974440896, "grad_norm": 0.4275114109142309, "learning_rate": 4.79234717678414e-07, "loss": 0.2109, "step": 880 }, { "epoch": 2.8146964856230032, "grad_norm": 0.41231768670069835, "learning_rate": 4.631896133732006e-07, "loss": 0.1914, "step": 881 }, { "epoch": 2.817891373801917, "grad_norm": 0.4046385798085887, "learning_rate": 4.474145561499099e-07, "loss": 0.2497, "step": 882 }, { "epoch": 2.8210862619808306, "grad_norm": 0.4337798293798453, "learning_rate": 4.319097640588821e-07, "loss": 0.2105, "step": 883 }, { "epoch": 2.8242811501597442, "grad_norm": 0.4367363776983278, "learning_rate": 4.166754514147275e-07, "loss": 0.2541, "step": 884 }, { "epoch": 2.8274760383386583, "grad_norm": 0.4121592143726346, "learning_rate": 4.0171182879335856e-07, "loss": 0.2934, "step": 885 }, { "epoch": 2.830670926517572, "grad_norm": 0.45692427171461536, "learning_rate": 3.870191030290782e-07, "loss": 0.2123, "step": 886 }, { "epoch": 2.8338658146964857, "grad_norm": 0.44515522805300783, "learning_rate": 3.7259747721173134e-07, "loss": 0.1926, "step": 887 }, { "epoch": 2.8370607028753994, "grad_norm": 0.41210623411640795, "learning_rate": 3.584471506838871e-07, "loss": 0.2355, "step": 888 }, { "epoch": 2.840255591054313, "grad_norm": 0.7278276221930801, "learning_rate": 3.445683190380833e-07, "loss": 0.2734, "step": 889 }, { "epoch": 2.8434504792332267, "grad_norm": 0.4293466579902048, "learning_rate": 3.3096117411413056e-07, "loss": 0.2084, "step": 890 }, { "epoch": 2.8466453674121404, "grad_norm": 0.3945276989591356, "learning_rate": 3.1762590399645907e-07, "loss": 0.2355, "step": 891 }, { "epoch": 2.8498402555910545, "grad_norm": 0.4169674719101088, "learning_rate": 3.045626930115053e-07, "loss": 0.2556, "step": 892 }, { "epoch": 2.853035143769968, "grad_norm": 0.4677238531453402, "learning_rate": 2.917717217251914e-07, "loss": 0.2067, "step": 893 }, { "epoch": 2.856230031948882, "grad_norm": 0.4508849526360919, "learning_rate": 2.7925316694039637e-07, "loss": 0.2264, "step": 894 }, { "epoch": 2.8594249201277955, "grad_norm": 1.3406369405711993, "learning_rate": 2.670072016945402e-07, "loss": 0.3042, "step": 895 }, { "epoch": 2.862619808306709, "grad_norm": 0.3953393489058587, "learning_rate": 2.5503399525717674e-07, "loss": 0.2038, "step": 896 }, { "epoch": 2.8658146964856233, "grad_norm": 0.3991610340225267, "learning_rate": 2.433337131276581e-07, "loss": 0.2806, "step": 897 }, { "epoch": 2.8690095846645365, "grad_norm": 0.42671305379955443, "learning_rate": 2.3190651703284273e-07, "loss": 0.2369, "step": 898 }, { "epoch": 2.8722044728434506, "grad_norm": 0.41224602176747227, "learning_rate": 2.207525649248754e-07, "loss": 0.2171, "step": 899 }, { "epoch": 2.8753993610223643, "grad_norm": 0.509647560563868, "learning_rate": 2.0987201097897757e-07, "loss": 0.2097, "step": 900 }, { "epoch": 2.878594249201278, "grad_norm": 0.4724203634577862, "learning_rate": 1.9926500559134477e-07, "loss": 0.24, "step": 901 }, { "epoch": 2.8817891373801916, "grad_norm": 0.44222221500216696, "learning_rate": 1.8893169537704813e-07, "loss": 0.2815, "step": 902 }, { "epoch": 2.8849840255591053, "grad_norm": 0.43359238337650113, "learning_rate": 1.7887222316800957e-07, "loss": 0.2058, "step": 903 }, { "epoch": 2.8881789137380194, "grad_norm": 0.4309496609438018, "learning_rate": 1.690867280110431e-07, "loss": 0.2481, "step": 904 }, { "epoch": 2.891373801916933, "grad_norm": 0.4330580118936636, "learning_rate": 1.5957534516590988e-07, "loss": 0.2267, "step": 905 }, { "epoch": 2.8945686900958467, "grad_norm": 0.41196459499404176, "learning_rate": 1.503382061034686e-07, "loss": 0.2471, "step": 906 }, { "epoch": 2.8977635782747604, "grad_norm": 0.46467452130408804, "learning_rate": 1.4137543850384572e-07, "loss": 0.2321, "step": 907 }, { "epoch": 2.900958466453674, "grad_norm": 0.430788659047838, "learning_rate": 1.3268716625467914e-07, "loss": 0.2805, "step": 908 }, { "epoch": 2.9041533546325877, "grad_norm": 0.4431748531892815, "learning_rate": 1.242735094493952e-07, "loss": 0.2397, "step": 909 }, { "epoch": 2.9073482428115014, "grad_norm": 0.4435883937921819, "learning_rate": 1.1613458438556102e-07, "loss": 0.2752, "step": 910 }, { "epoch": 2.9105431309904155, "grad_norm": 0.4304268716716866, "learning_rate": 1.0827050356326585e-07, "loss": 0.26, "step": 911 }, { "epoch": 2.913738019169329, "grad_norm": 0.4399673032424353, "learning_rate": 1.0068137568357783e-07, "loss": 0.2205, "step": 912 }, { "epoch": 2.916932907348243, "grad_norm": 0.4445872824299983, "learning_rate": 9.336730564702745e-08, "loss": 0.1941, "step": 913 }, { "epoch": 2.9201277955271565, "grad_norm": 0.4433121701499639, "learning_rate": 8.632839455216869e-08, "loss": 0.2012, "step": 914 }, { "epoch": 2.92332268370607, "grad_norm": 0.445710760902897, "learning_rate": 7.956473969417789e-08, "loss": 0.1946, "step": 915 }, { "epoch": 2.9265175718849843, "grad_norm": 0.44031338022101135, "learning_rate": 7.307643456351044e-08, "loss": 0.236, "step": 916 }, { "epoch": 2.9297124600638975, "grad_norm": 0.42316443693466144, "learning_rate": 6.686356884460177e-08, "loss": 0.2314, "step": 917 }, { "epoch": 2.9329073482428116, "grad_norm": 0.4449659893027522, "learning_rate": 6.092622841463502e-08, "loss": 0.1657, "step": 918 }, { "epoch": 2.9361022364217253, "grad_norm": 0.4225339222713818, "learning_rate": 5.526449534235534e-08, "loss": 0.2542, "step": 919 }, { "epoch": 2.939297124600639, "grad_norm": 0.44649236358142536, "learning_rate": 4.9878447886926305e-08, "loss": 0.2343, "step": 920 }, { "epoch": 2.9424920127795526, "grad_norm": 0.4096672005948774, "learning_rate": 4.4768160496859725e-08, "loss": 0.2727, "step": 921 }, { "epoch": 2.9456869009584663, "grad_norm": 0.4393531135600368, "learning_rate": 3.993370380897421e-08, "loss": 0.225, "step": 922 }, { "epoch": 2.9488817891373804, "grad_norm": 0.47244217125437676, "learning_rate": 3.537514464743152e-08, "loss": 0.2135, "step": 923 }, { "epoch": 2.952076677316294, "grad_norm": 0.4328106188446803, "learning_rate": 3.109254602280398e-08, "loss": 0.218, "step": 924 }, { "epoch": 2.9552715654952078, "grad_norm": 0.4358995352547161, "learning_rate": 2.7085967131201818e-08, "loss": 0.2325, "step": 925 }, { "epoch": 2.9584664536741214, "grad_norm": 0.4376093029807377, "learning_rate": 2.3355463353467168e-08, "loss": 0.2082, "step": 926 }, { "epoch": 2.961661341853035, "grad_norm": 0.42933069783001954, "learning_rate": 1.9901086254396908e-08, "loss": 0.2006, "step": 927 }, { "epoch": 2.9648562300319488, "grad_norm": 0.43253728612411796, "learning_rate": 1.672288358203211e-08, "loss": 0.2086, "step": 928 }, { "epoch": 2.9680511182108624, "grad_norm": 0.49147199258587176, "learning_rate": 1.382089926700303e-08, "loss": 0.2226, "step": 929 }, { "epoch": 2.9712460063897765, "grad_norm": 0.4073568057815397, "learning_rate": 1.1195173421914007e-08, "loss": 0.2687, "step": 930 }, { "epoch": 2.97444089456869, "grad_norm": 0.392034693925328, "learning_rate": 8.84574234079727e-09, "loss": 0.2252, "step": 931 }, { "epoch": 2.977635782747604, "grad_norm": 0.44649691262839863, "learning_rate": 6.772638498606654e-09, "loss": 0.241, "step": 932 }, { "epoch": 2.9808306709265175, "grad_norm": 0.4212064663206084, "learning_rate": 4.97589055076908e-09, "loss": 0.2497, "step": 933 }, { "epoch": 2.984025559105431, "grad_norm": 0.4438386239006071, "learning_rate": 3.4555233327893124e-09, "loss": 0.2348, "step": 934 }, { "epoch": 2.987220447284345, "grad_norm": 0.43882394327805996, "learning_rate": 2.2115578599035683e-09, "loss": 0.2373, "step": 935 }, { "epoch": 2.9904153354632586, "grad_norm": 0.4544247984602708, "learning_rate": 1.244011326797523e-09, "loss": 0.2172, "step": 936 }, { "epoch": 2.9936102236421727, "grad_norm": 0.4700550963913251, "learning_rate": 5.52897107355399e-10, "loss": 0.2571, "step": 937 }, { "epoch": 2.9968051118210863, "grad_norm": 0.4038371285545047, "learning_rate": 1.3822475449121186e-10, "loss": 0.2118, "step": 938 }, { "epoch": 3.0, "grad_norm": 0.3831820442811552, "learning_rate": 0.0, "loss": 0.2001, "step": 939 }, { "epoch": 3.0, "step": 939, "total_flos": 406391461183488.0, "train_loss": 0.3877937021696022, "train_runtime": 10098.5367, "train_samples_per_second": 2.97, "train_steps_per_second": 0.093 } ], "logging_steps": 1.0, "max_steps": 939, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 406391461183488.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }