{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 7512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007987220447284345, "grad_norm": 63.611212563977595, "learning_rate": 2.6595744680851065e-08, "loss": 1.2894, "step": 2 }, { "epoch": 0.001597444089456869, "grad_norm": 105.73905369340336, "learning_rate": 5.319148936170213e-08, "loss": 1.1515, "step": 4 }, { "epoch": 0.0023961661341853034, "grad_norm": 97.48653856616731, "learning_rate": 7.978723404255319e-08, "loss": 1.1725, "step": 6 }, { "epoch": 0.003194888178913738, "grad_norm": 113.1691721809818, "learning_rate": 1.0638297872340426e-07, "loss": 1.2009, "step": 8 }, { "epoch": 0.003993610223642172, "grad_norm": 234.6545065934056, "learning_rate": 1.3297872340425533e-07, "loss": 1.2124, "step": 10 }, { "epoch": 0.004792332268370607, "grad_norm": 56.50421981052306, "learning_rate": 1.5957446808510638e-07, "loss": 1.2292, "step": 12 }, { "epoch": 0.005591054313099041, "grad_norm": 67.76208027590963, "learning_rate": 1.8617021276595745e-07, "loss": 1.1731, "step": 14 }, { "epoch": 0.006389776357827476, "grad_norm": 19.773545305324223, "learning_rate": 2.1276595744680852e-07, "loss": 1.1682, "step": 16 }, { "epoch": 0.00718849840255591, "grad_norm": 108.48956883104074, "learning_rate": 2.393617021276596e-07, "loss": 1.1885, "step": 18 }, { "epoch": 0.007987220447284345, "grad_norm": 74.594447029578, "learning_rate": 2.6595744680851066e-07, "loss": 1.1143, "step": 20 }, { "epoch": 0.00878594249201278, "grad_norm": 193.76064469208063, "learning_rate": 2.9255319148936174e-07, "loss": 1.1366, "step": 22 }, { "epoch": 0.009584664536741214, "grad_norm": 165.84272450497056, "learning_rate": 3.1914893617021275e-07, "loss": 1.1753, "step": 24 }, { "epoch": 0.010383386581469648, "grad_norm": 150.10218395823307, "learning_rate": 3.457446808510639e-07, "loss": 1.1228, "step": 26 }, { "epoch": 0.011182108626198083, "grad_norm": 25.354463533053462, "learning_rate": 3.723404255319149e-07, "loss": 1.138, "step": 28 }, { "epoch": 0.011980830670926517, "grad_norm": 106.72660838009219, "learning_rate": 3.98936170212766e-07, "loss": 1.1287, "step": 30 }, { "epoch": 0.012779552715654952, "grad_norm": 83.97368303722932, "learning_rate": 4.2553191489361704e-07, "loss": 1.0818, "step": 32 }, { "epoch": 0.013578274760383386, "grad_norm": 72.42974677624225, "learning_rate": 4.5212765957446816e-07, "loss": 0.974, "step": 34 }, { "epoch": 0.01437699680511182, "grad_norm": 98.949142462491, "learning_rate": 4.787234042553192e-07, "loss": 1.0153, "step": 36 }, { "epoch": 0.015175718849840255, "grad_norm": 81.09462884756049, "learning_rate": 5.053191489361702e-07, "loss": 1.0437, "step": 38 }, { "epoch": 0.01597444089456869, "grad_norm": 71.5460678669271, "learning_rate": 5.319148936170213e-07, "loss": 0.9027, "step": 40 }, { "epoch": 0.016773162939297124, "grad_norm": 97.98883013825395, "learning_rate": 5.585106382978723e-07, "loss": 0.8252, "step": 42 }, { "epoch": 0.01757188498402556, "grad_norm": 130.2877391749827, "learning_rate": 5.851063829787235e-07, "loss": 0.7532, "step": 44 }, { "epoch": 0.018370607028753993, "grad_norm": 98.81399161595317, "learning_rate": 6.117021276595745e-07, "loss": 0.7013, "step": 46 }, { "epoch": 0.019169329073482427, "grad_norm": 22.295409656345377, "learning_rate": 6.382978723404255e-07, "loss": 0.6467, "step": 48 }, { "epoch": 0.019968051118210862, "grad_norm": 34.582920133699176, "learning_rate": 6.648936170212766e-07, "loss": 0.6351, "step": 50 }, { "epoch": 0.020766773162939296, "grad_norm": 50.861520008988386, "learning_rate": 6.914893617021278e-07, "loss": 0.5912, "step": 52 }, { "epoch": 0.02156549520766773, "grad_norm": 49.12926837947303, "learning_rate": 7.180851063829789e-07, "loss": 0.5413, "step": 54 }, { "epoch": 0.022364217252396165, "grad_norm": 20.811523309346935, "learning_rate": 7.446808510638298e-07, "loss": 0.479, "step": 56 }, { "epoch": 0.0231629392971246, "grad_norm": 12.332350980787394, "learning_rate": 7.712765957446809e-07, "loss": 0.4714, "step": 58 }, { "epoch": 0.023961661341853034, "grad_norm": 15.193345562611738, "learning_rate": 7.97872340425532e-07, "loss": 0.4622, "step": 60 }, { "epoch": 0.02476038338658147, "grad_norm": 5.637530351411472, "learning_rate": 8.24468085106383e-07, "loss": 0.4079, "step": 62 }, { "epoch": 0.025559105431309903, "grad_norm": 11.657766039541796, "learning_rate": 8.510638297872341e-07, "loss": 0.3716, "step": 64 }, { "epoch": 0.026357827476038338, "grad_norm": 8.003062600948699, "learning_rate": 8.776595744680852e-07, "loss": 0.4022, "step": 66 }, { "epoch": 0.027156549520766772, "grad_norm": 9.863778660497967, "learning_rate": 9.042553191489363e-07, "loss": 0.4061, "step": 68 }, { "epoch": 0.027955271565495207, "grad_norm": 26.781259322239684, "learning_rate": 9.308510638297872e-07, "loss": 0.3638, "step": 70 }, { "epoch": 0.02875399361022364, "grad_norm": 5.445843816912823, "learning_rate": 9.574468085106384e-07, "loss": 0.3581, "step": 72 }, { "epoch": 0.029552715654952075, "grad_norm": 9.213363331667637, "learning_rate": 9.840425531914895e-07, "loss": 0.3702, "step": 74 }, { "epoch": 0.03035143769968051, "grad_norm": 4.727611360243214, "learning_rate": 1.0106382978723404e-06, "loss": 0.3487, "step": 76 }, { "epoch": 0.031150159744408944, "grad_norm": 18.19406307616779, "learning_rate": 1.0372340425531915e-06, "loss": 0.3222, "step": 78 }, { "epoch": 0.03194888178913738, "grad_norm": 9.595831277572767, "learning_rate": 1.0638297872340427e-06, "loss": 0.3258, "step": 80 }, { "epoch": 0.03274760383386582, "grad_norm": 35.38120873285774, "learning_rate": 1.0904255319148938e-06, "loss": 0.3055, "step": 82 }, { "epoch": 0.03354632587859425, "grad_norm": 5.084938749131369, "learning_rate": 1.1170212765957447e-06, "loss": 0.3174, "step": 84 }, { "epoch": 0.034345047923322686, "grad_norm": 7.402514872898367, "learning_rate": 1.1436170212765958e-06, "loss": 0.3097, "step": 86 }, { "epoch": 0.03514376996805112, "grad_norm": 3.949621968931367, "learning_rate": 1.170212765957447e-06, "loss": 0.3152, "step": 88 }, { "epoch": 0.035942492012779555, "grad_norm": 10.814942895226139, "learning_rate": 1.196808510638298e-06, "loss": 0.3037, "step": 90 }, { "epoch": 0.036741214057507986, "grad_norm": 14.354678286624155, "learning_rate": 1.223404255319149e-06, "loss": 0.3261, "step": 92 }, { "epoch": 0.037539936102236424, "grad_norm": 6.168542351180327, "learning_rate": 1.25e-06, "loss": 0.31, "step": 94 }, { "epoch": 0.038338658146964855, "grad_norm": 11.539898124756117, "learning_rate": 1.276595744680851e-06, "loss": 0.311, "step": 96 }, { "epoch": 0.03913738019169329, "grad_norm": 4.013221758787171, "learning_rate": 1.3031914893617024e-06, "loss": 0.2996, "step": 98 }, { "epoch": 0.039936102236421724, "grad_norm": 3.860547140813227, "learning_rate": 1.3297872340425533e-06, "loss": 0.2763, "step": 100 }, { "epoch": 0.04073482428115016, "grad_norm": 17.85966416524752, "learning_rate": 1.3563829787234042e-06, "loss": 0.2904, "step": 102 }, { "epoch": 0.04153354632587859, "grad_norm": 16.697164957447292, "learning_rate": 1.3829787234042555e-06, "loss": 0.2832, "step": 104 }, { "epoch": 0.04233226837060703, "grad_norm": 25.51601683379472, "learning_rate": 1.4095744680851064e-06, "loss": 0.2861, "step": 106 }, { "epoch": 0.04313099041533546, "grad_norm": 43.70792394022941, "learning_rate": 1.4361702127659578e-06, "loss": 0.2882, "step": 108 }, { "epoch": 0.0439297124600639, "grad_norm": 21.39234312256687, "learning_rate": 1.4627659574468087e-06, "loss": 0.2868, "step": 110 }, { "epoch": 0.04472843450479233, "grad_norm": 4.299149147995996, "learning_rate": 1.4893617021276596e-06, "loss": 0.2802, "step": 112 }, { "epoch": 0.04552715654952077, "grad_norm": 4.853881749446152, "learning_rate": 1.515957446808511e-06, "loss": 0.2766, "step": 114 }, { "epoch": 0.0463258785942492, "grad_norm": 14.252871564540007, "learning_rate": 1.5425531914893618e-06, "loss": 0.2696, "step": 116 }, { "epoch": 0.04712460063897764, "grad_norm": 4.330757958493644, "learning_rate": 1.5691489361702128e-06, "loss": 0.2868, "step": 118 }, { "epoch": 0.04792332268370607, "grad_norm": 5.144959414872726, "learning_rate": 1.595744680851064e-06, "loss": 0.2985, "step": 120 }, { "epoch": 0.048722044728434506, "grad_norm": 13.510588438287128, "learning_rate": 1.622340425531915e-06, "loss": 0.2903, "step": 122 }, { "epoch": 0.04952076677316294, "grad_norm": 9.984785828599879, "learning_rate": 1.648936170212766e-06, "loss": 0.3019, "step": 124 }, { "epoch": 0.050319488817891375, "grad_norm": 11.150832562553619, "learning_rate": 1.6755319148936172e-06, "loss": 0.271, "step": 126 }, { "epoch": 0.051118210862619806, "grad_norm": 14.058578339810987, "learning_rate": 1.7021276595744682e-06, "loss": 0.273, "step": 128 }, { "epoch": 0.051916932907348244, "grad_norm": 5.00151816005258, "learning_rate": 1.7287234042553195e-06, "loss": 0.2607, "step": 130 }, { "epoch": 0.052715654952076675, "grad_norm": 15.004005157673818, "learning_rate": 1.7553191489361704e-06, "loss": 0.2654, "step": 132 }, { "epoch": 0.05351437699680511, "grad_norm": 7.002483899819058, "learning_rate": 1.7819148936170213e-06, "loss": 0.2623, "step": 134 }, { "epoch": 0.054313099041533544, "grad_norm": 3.9841361364612062, "learning_rate": 1.8085106382978727e-06, "loss": 0.2934, "step": 136 }, { "epoch": 0.05511182108626198, "grad_norm": 3.8494401246050054, "learning_rate": 1.8351063829787236e-06, "loss": 0.274, "step": 138 }, { "epoch": 0.05591054313099041, "grad_norm": 5.497525399265219, "learning_rate": 1.8617021276595745e-06, "loss": 0.2647, "step": 140 }, { "epoch": 0.05670926517571885, "grad_norm": 4.980587322498712, "learning_rate": 1.8882978723404258e-06, "loss": 0.273, "step": 142 }, { "epoch": 0.05750798722044728, "grad_norm": 3.7891589019522653, "learning_rate": 1.9148936170212767e-06, "loss": 0.2517, "step": 144 }, { "epoch": 0.05830670926517572, "grad_norm": 4.92828775714686, "learning_rate": 1.941489361702128e-06, "loss": 0.2704, "step": 146 }, { "epoch": 0.05910543130990415, "grad_norm": 4.103755865538166, "learning_rate": 1.968085106382979e-06, "loss": 0.2591, "step": 148 }, { "epoch": 0.05990415335463259, "grad_norm": 3.435413585690327, "learning_rate": 1.99468085106383e-06, "loss": 0.2509, "step": 150 }, { "epoch": 0.06070287539936102, "grad_norm": 4.277721208929287, "learning_rate": 2.021276595744681e-06, "loss": 0.2584, "step": 152 }, { "epoch": 0.06150159744408946, "grad_norm": 3.799249236289544, "learning_rate": 2.047872340425532e-06, "loss": 0.2589, "step": 154 }, { "epoch": 0.06230031948881789, "grad_norm": 3.7296853670347825, "learning_rate": 2.074468085106383e-06, "loss": 0.2567, "step": 156 }, { "epoch": 0.06309904153354633, "grad_norm": 4.512870491032634, "learning_rate": 2.101063829787234e-06, "loss": 0.2481, "step": 158 }, { "epoch": 0.06389776357827476, "grad_norm": 9.719375176261458, "learning_rate": 2.1276595744680853e-06, "loss": 0.24, "step": 160 }, { "epoch": 0.06469648562300319, "grad_norm": 5.994224591000894, "learning_rate": 2.1542553191489364e-06, "loss": 0.245, "step": 162 }, { "epoch": 0.06549520766773163, "grad_norm": 5.4415526514767025, "learning_rate": 2.1808510638297876e-06, "loss": 0.2653, "step": 164 }, { "epoch": 0.06629392971246006, "grad_norm": 5.939080958451638, "learning_rate": 2.2074468085106387e-06, "loss": 0.2472, "step": 166 }, { "epoch": 0.0670926517571885, "grad_norm": 5.160493886252985, "learning_rate": 2.2340425531914894e-06, "loss": 0.2471, "step": 168 }, { "epoch": 0.06789137380191693, "grad_norm": 3.652646803021308, "learning_rate": 2.2606382978723405e-06, "loss": 0.2324, "step": 170 }, { "epoch": 0.06869009584664537, "grad_norm": 3.8466095536871343, "learning_rate": 2.2872340425531916e-06, "loss": 0.2452, "step": 172 }, { "epoch": 0.0694888178913738, "grad_norm": 3.3542675856152537, "learning_rate": 2.3138297872340428e-06, "loss": 0.2419, "step": 174 }, { "epoch": 0.07028753993610223, "grad_norm": 4.481843921006337, "learning_rate": 2.340425531914894e-06, "loss": 0.2451, "step": 176 }, { "epoch": 0.07108626198083066, "grad_norm": 3.03610682279202, "learning_rate": 2.367021276595745e-06, "loss": 0.2183, "step": 178 }, { "epoch": 0.07188498402555911, "grad_norm": 3.1175806494594482, "learning_rate": 2.393617021276596e-06, "loss": 0.2349, "step": 180 }, { "epoch": 0.07268370607028754, "grad_norm": 3.4236470265390033, "learning_rate": 2.420212765957447e-06, "loss": 0.2406, "step": 182 }, { "epoch": 0.07348242811501597, "grad_norm": 3.604740428294787, "learning_rate": 2.446808510638298e-06, "loss": 0.2349, "step": 184 }, { "epoch": 0.0742811501597444, "grad_norm": 3.255303266074429, "learning_rate": 2.473404255319149e-06, "loss": 0.2402, "step": 186 }, { "epoch": 0.07507987220447285, "grad_norm": 3.030059130242592, "learning_rate": 2.5e-06, "loss": 0.2233, "step": 188 }, { "epoch": 0.07587859424920128, "grad_norm": 5.396885260460128, "learning_rate": 2.5265957446808513e-06, "loss": 0.2264, "step": 190 }, { "epoch": 0.07667731629392971, "grad_norm": 3.4037128475892384, "learning_rate": 2.553191489361702e-06, "loss": 0.2266, "step": 192 }, { "epoch": 0.07747603833865814, "grad_norm": 3.540697871576238, "learning_rate": 2.5797872340425536e-06, "loss": 0.2302, "step": 194 }, { "epoch": 0.07827476038338659, "grad_norm": 2.8155061709596616, "learning_rate": 2.6063829787234047e-06, "loss": 0.2442, "step": 196 }, { "epoch": 0.07907348242811502, "grad_norm": 2.9188721708463885, "learning_rate": 2.6329787234042554e-06, "loss": 0.2261, "step": 198 }, { "epoch": 0.07987220447284345, "grad_norm": 2.9114746100952598, "learning_rate": 2.6595744680851065e-06, "loss": 0.2287, "step": 200 }, { "epoch": 0.08067092651757188, "grad_norm": 3.2180252275052257, "learning_rate": 2.6861702127659577e-06, "loss": 0.2187, "step": 202 }, { "epoch": 0.08146964856230032, "grad_norm": 2.8873639429163314, "learning_rate": 2.7127659574468084e-06, "loss": 0.2158, "step": 204 }, { "epoch": 0.08226837060702875, "grad_norm": 3.489228955679207, "learning_rate": 2.73936170212766e-06, "loss": 0.2258, "step": 206 }, { "epoch": 0.08306709265175719, "grad_norm": 2.981624517929962, "learning_rate": 2.765957446808511e-06, "loss": 0.2108, "step": 208 }, { "epoch": 0.08386581469648563, "grad_norm": 3.21155859005065, "learning_rate": 2.7925531914893617e-06, "loss": 0.216, "step": 210 }, { "epoch": 0.08466453674121406, "grad_norm": 3.0096285120757935, "learning_rate": 2.819148936170213e-06, "loss": 0.2248, "step": 212 }, { "epoch": 0.08546325878594249, "grad_norm": 2.9629033272723735, "learning_rate": 2.845744680851064e-06, "loss": 0.2319, "step": 214 }, { "epoch": 0.08626198083067092, "grad_norm": 3.1407766370777646, "learning_rate": 2.8723404255319155e-06, "loss": 0.2083, "step": 216 }, { "epoch": 0.08706070287539937, "grad_norm": 2.987122511363789, "learning_rate": 2.8989361702127662e-06, "loss": 0.2169, "step": 218 }, { "epoch": 0.0878594249201278, "grad_norm": 3.0880793923727063, "learning_rate": 2.9255319148936174e-06, "loss": 0.2103, "step": 220 }, { "epoch": 0.08865814696485623, "grad_norm": 2.7280690541697075, "learning_rate": 2.9521276595744685e-06, "loss": 0.1997, "step": 222 }, { "epoch": 0.08945686900958466, "grad_norm": 4.457072529565402, "learning_rate": 2.978723404255319e-06, "loss": 0.212, "step": 224 }, { "epoch": 0.0902555910543131, "grad_norm": 2.6269741757814176, "learning_rate": 3.0053191489361703e-06, "loss": 0.2092, "step": 226 }, { "epoch": 0.09105431309904154, "grad_norm": 6.22452274606642, "learning_rate": 3.031914893617022e-06, "loss": 0.2049, "step": 228 }, { "epoch": 0.09185303514376997, "grad_norm": 2.877291306172186, "learning_rate": 3.0585106382978726e-06, "loss": 0.2083, "step": 230 }, { "epoch": 0.0926517571884984, "grad_norm": 5.1194444563249935, "learning_rate": 3.0851063829787237e-06, "loss": 0.2054, "step": 232 }, { "epoch": 0.09345047923322684, "grad_norm": 6.2843799815917745, "learning_rate": 3.111702127659575e-06, "loss": 0.2194, "step": 234 }, { "epoch": 0.09424920127795527, "grad_norm": 5.861687233187676, "learning_rate": 3.1382978723404255e-06, "loss": 0.2284, "step": 236 }, { "epoch": 0.0950479233226837, "grad_norm": 3.213879356151013, "learning_rate": 3.164893617021277e-06, "loss": 0.2207, "step": 238 }, { "epoch": 0.09584664536741214, "grad_norm": 3.1630850214772255, "learning_rate": 3.191489361702128e-06, "loss": 0.2066, "step": 240 }, { "epoch": 0.09664536741214058, "grad_norm": 3.148239375316538, "learning_rate": 3.218085106382979e-06, "loss": 0.2212, "step": 242 }, { "epoch": 0.09744408945686901, "grad_norm": 2.508025815693479, "learning_rate": 3.24468085106383e-06, "loss": 0.2054, "step": 244 }, { "epoch": 0.09824281150159744, "grad_norm": 2.909768854428482, "learning_rate": 3.271276595744681e-06, "loss": 0.2186, "step": 246 }, { "epoch": 0.09904153354632587, "grad_norm": 7.665173774329105, "learning_rate": 3.297872340425532e-06, "loss": 0.2228, "step": 248 }, { "epoch": 0.09984025559105432, "grad_norm": 2.253729407128243, "learning_rate": 3.3244680851063834e-06, "loss": 0.2049, "step": 250 }, { "epoch": 0.10063897763578275, "grad_norm": 3.8706889996285336, "learning_rate": 3.3510638297872345e-06, "loss": 0.2259, "step": 252 }, { "epoch": 0.10143769968051118, "grad_norm": 3.5646371999914357, "learning_rate": 3.377659574468085e-06, "loss": 0.1859, "step": 254 }, { "epoch": 0.10223642172523961, "grad_norm": 2.7753062530577597, "learning_rate": 3.4042553191489363e-06, "loss": 0.2166, "step": 256 }, { "epoch": 0.10303514376996806, "grad_norm": 3.0863522747098022, "learning_rate": 3.4308510638297874e-06, "loss": 0.2346, "step": 258 }, { "epoch": 0.10383386581469649, "grad_norm": 2.7019568800833897, "learning_rate": 3.457446808510639e-06, "loss": 0.2031, "step": 260 }, { "epoch": 0.10463258785942492, "grad_norm": 2.297946892414017, "learning_rate": 3.4840425531914897e-06, "loss": 0.1934, "step": 262 }, { "epoch": 0.10543130990415335, "grad_norm": 2.714689444342462, "learning_rate": 3.510638297872341e-06, "loss": 0.2191, "step": 264 }, { "epoch": 0.1062300319488818, "grad_norm": 2.2968600529073098, "learning_rate": 3.537234042553192e-06, "loss": 0.2078, "step": 266 }, { "epoch": 0.10702875399361023, "grad_norm": 2.740039735249897, "learning_rate": 3.5638297872340426e-06, "loss": 0.2196, "step": 268 }, { "epoch": 0.10782747603833866, "grad_norm": 2.4769261724935423, "learning_rate": 3.5904255319148938e-06, "loss": 0.2217, "step": 270 }, { "epoch": 0.10862619808306709, "grad_norm": 2.3980460329324713, "learning_rate": 3.6170212765957453e-06, "loss": 0.2174, "step": 272 }, { "epoch": 0.10942492012779553, "grad_norm": 2.350186883116406, "learning_rate": 3.643617021276596e-06, "loss": 0.2164, "step": 274 }, { "epoch": 0.11022364217252396, "grad_norm": 2.3070632701092, "learning_rate": 3.670212765957447e-06, "loss": 0.2058, "step": 276 }, { "epoch": 0.1110223642172524, "grad_norm": 2.5876090442770914, "learning_rate": 3.6968085106382983e-06, "loss": 0.2102, "step": 278 }, { "epoch": 0.11182108626198083, "grad_norm": 2.527498495500664, "learning_rate": 3.723404255319149e-06, "loss": 0.1878, "step": 280 }, { "epoch": 0.11261980830670927, "grad_norm": 3.1408931185668734, "learning_rate": 3.7500000000000005e-06, "loss": 0.2028, "step": 282 }, { "epoch": 0.1134185303514377, "grad_norm": 2.438660009024771, "learning_rate": 3.7765957446808516e-06, "loss": 0.198, "step": 284 }, { "epoch": 0.11421725239616613, "grad_norm": 3.163227332011823, "learning_rate": 3.8031914893617023e-06, "loss": 0.2143, "step": 286 }, { "epoch": 0.11501597444089456, "grad_norm": 2.62548659237651, "learning_rate": 3.8297872340425535e-06, "loss": 0.2129, "step": 288 }, { "epoch": 0.11581469648562301, "grad_norm": 4.68249530525775, "learning_rate": 3.856382978723405e-06, "loss": 0.211, "step": 290 }, { "epoch": 0.11661341853035144, "grad_norm": 2.5096724167776387, "learning_rate": 3.882978723404256e-06, "loss": 0.1901, "step": 292 }, { "epoch": 0.11741214057507987, "grad_norm": 2.172894520215211, "learning_rate": 3.909574468085106e-06, "loss": 0.1936, "step": 294 }, { "epoch": 0.1182108626198083, "grad_norm": 5.3196819233421495, "learning_rate": 3.936170212765958e-06, "loss": 0.2104, "step": 296 }, { "epoch": 0.11900958466453675, "grad_norm": 2.4110105936218904, "learning_rate": 3.962765957446809e-06, "loss": 0.2143, "step": 298 }, { "epoch": 0.11980830670926518, "grad_norm": 2.4033618282750724, "learning_rate": 3.98936170212766e-06, "loss": 0.2164, "step": 300 }, { "epoch": 0.12060702875399361, "grad_norm": 3.457118213267144, "learning_rate": 4.015957446808511e-06, "loss": 0.2032, "step": 302 }, { "epoch": 0.12140575079872204, "grad_norm": 2.408758592662987, "learning_rate": 4.042553191489362e-06, "loss": 0.2115, "step": 304 }, { "epoch": 0.12220447284345048, "grad_norm": 2.59506801145671, "learning_rate": 4.069148936170213e-06, "loss": 0.2214, "step": 306 }, { "epoch": 0.12300319488817892, "grad_norm": 3.0414661937486933, "learning_rate": 4.095744680851064e-06, "loss": 0.2088, "step": 308 }, { "epoch": 0.12380191693290735, "grad_norm": 3.647758244704823, "learning_rate": 4.122340425531915e-06, "loss": 0.2198, "step": 310 }, { "epoch": 0.12460063897763578, "grad_norm": 2.67698633677868, "learning_rate": 4.148936170212766e-06, "loss": 0.2252, "step": 312 }, { "epoch": 0.1253993610223642, "grad_norm": 2.898397178237324, "learning_rate": 4.175531914893618e-06, "loss": 0.2181, "step": 314 }, { "epoch": 0.12619808306709265, "grad_norm": 2.6840679225522526, "learning_rate": 4.202127659574468e-06, "loss": 0.1992, "step": 316 }, { "epoch": 0.1269968051118211, "grad_norm": 3.1572391222603886, "learning_rate": 4.228723404255319e-06, "loss": 0.1952, "step": 318 }, { "epoch": 0.12779552715654952, "grad_norm": 3.5678297945143873, "learning_rate": 4.255319148936171e-06, "loss": 0.2083, "step": 320 }, { "epoch": 0.12859424920127796, "grad_norm": 2.645675379463557, "learning_rate": 4.281914893617022e-06, "loss": 0.2196, "step": 322 }, { "epoch": 0.12939297124600638, "grad_norm": 3.2806373380609672, "learning_rate": 4.308510638297873e-06, "loss": 0.2044, "step": 324 }, { "epoch": 0.13019169329073482, "grad_norm": 3.877339337802895, "learning_rate": 4.3351063829787236e-06, "loss": 0.2188, "step": 326 }, { "epoch": 0.13099041533546327, "grad_norm": 2.855122134235383, "learning_rate": 4.361702127659575e-06, "loss": 0.2178, "step": 328 }, { "epoch": 0.13178913738019168, "grad_norm": 2.2492003663786764, "learning_rate": 4.388297872340426e-06, "loss": 0.2113, "step": 330 }, { "epoch": 0.13258785942492013, "grad_norm": 2.3120507711054166, "learning_rate": 4.414893617021277e-06, "loss": 0.2162, "step": 332 }, { "epoch": 0.13338658146964857, "grad_norm": 2.306824843021891, "learning_rate": 4.441489361702128e-06, "loss": 0.1951, "step": 334 }, { "epoch": 0.134185303514377, "grad_norm": 2.4562848392795424, "learning_rate": 4.468085106382979e-06, "loss": 0.2091, "step": 336 }, { "epoch": 0.13498402555910544, "grad_norm": 2.327058395343108, "learning_rate": 4.49468085106383e-06, "loss": 0.1936, "step": 338 }, { "epoch": 0.13578274760383385, "grad_norm": 3.2365595394106377, "learning_rate": 4.521276595744681e-06, "loss": 0.2076, "step": 340 }, { "epoch": 0.1365814696485623, "grad_norm": 2.3773207333074797, "learning_rate": 4.547872340425532e-06, "loss": 0.215, "step": 342 }, { "epoch": 0.13738019169329074, "grad_norm": 2.588667676878435, "learning_rate": 4.574468085106383e-06, "loss": 0.2142, "step": 344 }, { "epoch": 0.13817891373801916, "grad_norm": 1.9922157018157862, "learning_rate": 4.601063829787235e-06, "loss": 0.188, "step": 346 }, { "epoch": 0.1389776357827476, "grad_norm": 1.9986841603432328, "learning_rate": 4.6276595744680855e-06, "loss": 0.2123, "step": 348 }, { "epoch": 0.13977635782747605, "grad_norm": 3.8191377619872346, "learning_rate": 4.654255319148936e-06, "loss": 0.1909, "step": 350 }, { "epoch": 0.14057507987220447, "grad_norm": 2.707801265704851, "learning_rate": 4.680851063829788e-06, "loss": 0.1804, "step": 352 }, { "epoch": 0.1413738019169329, "grad_norm": 2.3382091629044264, "learning_rate": 4.707446808510639e-06, "loss": 0.2268, "step": 354 }, { "epoch": 0.14217252396166133, "grad_norm": 2.4107325950799963, "learning_rate": 4.73404255319149e-06, "loss": 0.204, "step": 356 }, { "epoch": 0.14297124600638977, "grad_norm": 2.1678512268790944, "learning_rate": 4.760638297872341e-06, "loss": 0.2078, "step": 358 }, { "epoch": 0.14376996805111822, "grad_norm": 2.028736424354992, "learning_rate": 4.787234042553192e-06, "loss": 0.2091, "step": 360 }, { "epoch": 0.14456869009584664, "grad_norm": 2.396579988473481, "learning_rate": 4.813829787234043e-06, "loss": 0.2045, "step": 362 }, { "epoch": 0.14536741214057508, "grad_norm": 2.3056223101352162, "learning_rate": 4.840425531914894e-06, "loss": 0.2012, "step": 364 }, { "epoch": 0.14616613418530353, "grad_norm": 2.1325793827510964, "learning_rate": 4.867021276595745e-06, "loss": 0.1997, "step": 366 }, { "epoch": 0.14696485623003194, "grad_norm": 2.2181283979891213, "learning_rate": 4.893617021276596e-06, "loss": 0.2216, "step": 368 }, { "epoch": 0.1477635782747604, "grad_norm": 2.2520136230085988, "learning_rate": 4.9202127659574475e-06, "loss": 0.2093, "step": 370 }, { "epoch": 0.1485623003194888, "grad_norm": 2.3640436529911817, "learning_rate": 4.946808510638298e-06, "loss": 0.2169, "step": 372 }, { "epoch": 0.14936102236421725, "grad_norm": 2.3535553790924952, "learning_rate": 4.973404255319149e-06, "loss": 0.2098, "step": 374 }, { "epoch": 0.1501597444089457, "grad_norm": 1.8756114301592517, "learning_rate": 5e-06, "loss": 0.1986, "step": 376 }, { "epoch": 0.1509584664536741, "grad_norm": 2.292659355977137, "learning_rate": 5.026595744680851e-06, "loss": 0.2046, "step": 378 }, { "epoch": 0.15175718849840256, "grad_norm": 2.6775913216468994, "learning_rate": 5.053191489361703e-06, "loss": 0.2376, "step": 380 }, { "epoch": 0.152555910543131, "grad_norm": 2.140742604355663, "learning_rate": 5.079787234042553e-06, "loss": 0.2076, "step": 382 }, { "epoch": 0.15335463258785942, "grad_norm": 2.2577220617248983, "learning_rate": 5.106382978723404e-06, "loss": 0.1857, "step": 384 }, { "epoch": 0.15415335463258786, "grad_norm": 2.322529114683112, "learning_rate": 5.1329787234042565e-06, "loss": 0.2252, "step": 386 }, { "epoch": 0.15495207667731628, "grad_norm": 2.174339578110931, "learning_rate": 5.159574468085107e-06, "loss": 0.1859, "step": 388 }, { "epoch": 0.15575079872204473, "grad_norm": 2.6754457483199996, "learning_rate": 5.186170212765958e-06, "loss": 0.2134, "step": 390 }, { "epoch": 0.15654952076677317, "grad_norm": 2.5869207840676287, "learning_rate": 5.212765957446809e-06, "loss": 0.2313, "step": 392 }, { "epoch": 0.1573482428115016, "grad_norm": 2.2594133874239555, "learning_rate": 5.23936170212766e-06, "loss": 0.2063, "step": 394 }, { "epoch": 0.15814696485623003, "grad_norm": 2.3216671883603532, "learning_rate": 5.265957446808511e-06, "loss": 0.2068, "step": 396 }, { "epoch": 0.15894568690095848, "grad_norm": 2.2695053056973014, "learning_rate": 5.292553191489362e-06, "loss": 0.1911, "step": 398 }, { "epoch": 0.1597444089456869, "grad_norm": 2.2393947266924474, "learning_rate": 5.319148936170213e-06, "loss": 0.2134, "step": 400 }, { "epoch": 0.16054313099041534, "grad_norm": 2.3780064899990223, "learning_rate": 5.345744680851064e-06, "loss": 0.1861, "step": 402 }, { "epoch": 0.16134185303514376, "grad_norm": 2.0639877092575456, "learning_rate": 5.372340425531915e-06, "loss": 0.1934, "step": 404 }, { "epoch": 0.1621405750798722, "grad_norm": 2.3834910258697883, "learning_rate": 5.398936170212766e-06, "loss": 0.2268, "step": 406 }, { "epoch": 0.16293929712460065, "grad_norm": 2.1443375139668253, "learning_rate": 5.425531914893617e-06, "loss": 0.1974, "step": 408 }, { "epoch": 0.16373801916932906, "grad_norm": 2.1691183471396323, "learning_rate": 5.452127659574469e-06, "loss": 0.1985, "step": 410 }, { "epoch": 0.1645367412140575, "grad_norm": 2.4848574597065904, "learning_rate": 5.47872340425532e-06, "loss": 0.2127, "step": 412 }, { "epoch": 0.16533546325878595, "grad_norm": 2.3579622949897723, "learning_rate": 5.5053191489361705e-06, "loss": 0.2174, "step": 414 }, { "epoch": 0.16613418530351437, "grad_norm": 2.070124617271312, "learning_rate": 5.531914893617022e-06, "loss": 0.1862, "step": 416 }, { "epoch": 0.16693290734824281, "grad_norm": 2.1267517696375213, "learning_rate": 5.558510638297873e-06, "loss": 0.2201, "step": 418 }, { "epoch": 0.16773162939297126, "grad_norm": 2.7521587042179134, "learning_rate": 5.5851063829787235e-06, "loss": 0.1829, "step": 420 }, { "epoch": 0.16853035143769968, "grad_norm": 2.4493518149473106, "learning_rate": 5.611702127659575e-06, "loss": 0.2229, "step": 422 }, { "epoch": 0.16932907348242812, "grad_norm": 2.248345922944704, "learning_rate": 5.638297872340426e-06, "loss": 0.1946, "step": 424 }, { "epoch": 0.17012779552715654, "grad_norm": 2.0487939416257763, "learning_rate": 5.664893617021277e-06, "loss": 0.2045, "step": 426 }, { "epoch": 0.17092651757188498, "grad_norm": 2.2646018633860696, "learning_rate": 5.691489361702128e-06, "loss": 0.2146, "step": 428 }, { "epoch": 0.17172523961661343, "grad_norm": 1.8960573449253009, "learning_rate": 5.718085106382979e-06, "loss": 0.2079, "step": 430 }, { "epoch": 0.17252396166134185, "grad_norm": 2.176740909319673, "learning_rate": 5.744680851063831e-06, "loss": 0.21, "step": 432 }, { "epoch": 0.1733226837060703, "grad_norm": 2.1430161168845987, "learning_rate": 5.771276595744682e-06, "loss": 0.2119, "step": 434 }, { "epoch": 0.17412140575079874, "grad_norm": 1.9672794318914277, "learning_rate": 5.7978723404255325e-06, "loss": 0.2103, "step": 436 }, { "epoch": 0.17492012779552715, "grad_norm": 2.1725129238295904, "learning_rate": 5.824468085106384e-06, "loss": 0.2097, "step": 438 }, { "epoch": 0.1757188498402556, "grad_norm": 1.9320076776263606, "learning_rate": 5.851063829787235e-06, "loss": 0.2141, "step": 440 }, { "epoch": 0.17651757188498401, "grad_norm": 2.0107708483969544, "learning_rate": 5.877659574468085e-06, "loss": 0.2138, "step": 442 }, { "epoch": 0.17731629392971246, "grad_norm": 1.9692652691440204, "learning_rate": 5.904255319148937e-06, "loss": 0.1874, "step": 444 }, { "epoch": 0.1781150159744409, "grad_norm": 2.1222862804558305, "learning_rate": 5.930851063829788e-06, "loss": 0.1875, "step": 446 }, { "epoch": 0.17891373801916932, "grad_norm": 2.0073449512509414, "learning_rate": 5.957446808510638e-06, "loss": 0.2073, "step": 448 }, { "epoch": 0.17971246006389777, "grad_norm": 2.0189150793812027, "learning_rate": 5.98404255319149e-06, "loss": 0.1886, "step": 450 }, { "epoch": 0.1805111821086262, "grad_norm": 2.327500057066851, "learning_rate": 6.010638297872341e-06, "loss": 0.2026, "step": 452 }, { "epoch": 0.18130990415335463, "grad_norm": 2.2501232755144183, "learning_rate": 6.037234042553191e-06, "loss": 0.2132, "step": 454 }, { "epoch": 0.18210862619808307, "grad_norm": 2.317281078409067, "learning_rate": 6.063829787234044e-06, "loss": 0.2112, "step": 456 }, { "epoch": 0.1829073482428115, "grad_norm": 1.9924372480322023, "learning_rate": 6.090425531914894e-06, "loss": 0.2092, "step": 458 }, { "epoch": 0.18370607028753994, "grad_norm": 2.2346624173451963, "learning_rate": 6.117021276595745e-06, "loss": 0.2144, "step": 460 }, { "epoch": 0.18450479233226838, "grad_norm": 2.175934261679526, "learning_rate": 6.143617021276597e-06, "loss": 0.1849, "step": 462 }, { "epoch": 0.1853035143769968, "grad_norm": 2.266384026702202, "learning_rate": 6.170212765957447e-06, "loss": 0.2113, "step": 464 }, { "epoch": 0.18610223642172524, "grad_norm": 1.9034790451109018, "learning_rate": 6.196808510638298e-06, "loss": 0.2115, "step": 466 }, { "epoch": 0.1869009584664537, "grad_norm": 2.2484524626124087, "learning_rate": 6.22340425531915e-06, "loss": 0.2069, "step": 468 }, { "epoch": 0.1876996805111821, "grad_norm": 2.0092685848128102, "learning_rate": 6.25e-06, "loss": 0.2031, "step": 470 }, { "epoch": 0.18849840255591055, "grad_norm": 1.870835955521368, "learning_rate": 6.276595744680851e-06, "loss": 0.1903, "step": 472 }, { "epoch": 0.18929712460063897, "grad_norm": 1.9836945986391599, "learning_rate": 6.303191489361703e-06, "loss": 0.2256, "step": 474 }, { "epoch": 0.1900958466453674, "grad_norm": 1.8732225695197424, "learning_rate": 6.329787234042554e-06, "loss": 0.2, "step": 476 }, { "epoch": 0.19089456869009586, "grad_norm": 2.0073497811760856, "learning_rate": 6.356382978723404e-06, "loss": 0.1996, "step": 478 }, { "epoch": 0.19169329073482427, "grad_norm": 2.1089028896634785, "learning_rate": 6.382978723404256e-06, "loss": 0.1705, "step": 480 }, { "epoch": 0.19249201277955272, "grad_norm": 2.3296857896927032, "learning_rate": 6.409574468085107e-06, "loss": 0.2127, "step": 482 }, { "epoch": 0.19329073482428116, "grad_norm": 1.9240064591019737, "learning_rate": 6.436170212765958e-06, "loss": 0.2044, "step": 484 }, { "epoch": 0.19408945686900958, "grad_norm": 2.0124517600701757, "learning_rate": 6.462765957446809e-06, "loss": 0.2149, "step": 486 }, { "epoch": 0.19488817891373802, "grad_norm": 2.1954071243853, "learning_rate": 6.48936170212766e-06, "loss": 0.2202, "step": 488 }, { "epoch": 0.19568690095846644, "grad_norm": 1.9656201249731728, "learning_rate": 6.515957446808511e-06, "loss": 0.1946, "step": 490 }, { "epoch": 0.1964856230031949, "grad_norm": 1.989066037021515, "learning_rate": 6.542553191489362e-06, "loss": 0.2004, "step": 492 }, { "epoch": 0.19728434504792333, "grad_norm": 1.872072162374712, "learning_rate": 6.569148936170213e-06, "loss": 0.2077, "step": 494 }, { "epoch": 0.19808306709265175, "grad_norm": 1.9921049486714641, "learning_rate": 6.595744680851064e-06, "loss": 0.1874, "step": 496 }, { "epoch": 0.1988817891373802, "grad_norm": 1.7219513015277055, "learning_rate": 6.622340425531916e-06, "loss": 0.1821, "step": 498 }, { "epoch": 0.19968051118210864, "grad_norm": 2.0749672908831616, "learning_rate": 6.648936170212767e-06, "loss": 0.2053, "step": 500 }, { "epoch": 0.19968051118210864, "eval_loss": 0.19043219089508057, "eval_runtime": 420.1871, "eval_samples_per_second": 42.379, "eval_steps_per_second": 5.298, "step": 500 }, { "epoch": 0.20047923322683706, "grad_norm": 2.037306801330803, "learning_rate": 6.6755319148936174e-06, "loss": 0.2176, "step": 502 }, { "epoch": 0.2012779552715655, "grad_norm": 1.9203371381017114, "learning_rate": 6.702127659574469e-06, "loss": 0.2125, "step": 504 }, { "epoch": 0.20207667731629392, "grad_norm": 1.985111227444487, "learning_rate": 6.72872340425532e-06, "loss": 0.1973, "step": 506 }, { "epoch": 0.20287539936102236, "grad_norm": 1.8951174304875817, "learning_rate": 6.75531914893617e-06, "loss": 0.2149, "step": 508 }, { "epoch": 0.2036741214057508, "grad_norm": 2.0497514964752748, "learning_rate": 6.781914893617022e-06, "loss": 0.1978, "step": 510 }, { "epoch": 0.20447284345047922, "grad_norm": 2.0766753850394175, "learning_rate": 6.808510638297873e-06, "loss": 0.2375, "step": 512 }, { "epoch": 0.20527156549520767, "grad_norm": 2.346429408581163, "learning_rate": 6.835106382978723e-06, "loss": 0.2194, "step": 514 }, { "epoch": 0.20607028753993611, "grad_norm": 2.0826174062149647, "learning_rate": 6.861702127659575e-06, "loss": 0.2056, "step": 516 }, { "epoch": 0.20686900958466453, "grad_norm": 1.99670732186987, "learning_rate": 6.888297872340426e-06, "loss": 0.221, "step": 518 }, { "epoch": 0.20766773162939298, "grad_norm": 2.036868735781395, "learning_rate": 6.914893617021278e-06, "loss": 0.2165, "step": 520 }, { "epoch": 0.2084664536741214, "grad_norm": 1.8657555178307903, "learning_rate": 6.941489361702129e-06, "loss": 0.2146, "step": 522 }, { "epoch": 0.20926517571884984, "grad_norm": 1.9998386354651805, "learning_rate": 6.968085106382979e-06, "loss": 0.2165, "step": 524 }, { "epoch": 0.21006389776357828, "grad_norm": 1.890648331838507, "learning_rate": 6.994680851063831e-06, "loss": 0.2215, "step": 526 }, { "epoch": 0.2108626198083067, "grad_norm": 1.8453249030357137, "learning_rate": 7.021276595744682e-06, "loss": 0.227, "step": 528 }, { "epoch": 0.21166134185303515, "grad_norm": 1.8696171981456626, "learning_rate": 7.047872340425532e-06, "loss": 0.202, "step": 530 }, { "epoch": 0.2124600638977636, "grad_norm": 1.9343681296268649, "learning_rate": 7.074468085106384e-06, "loss": 0.2207, "step": 532 }, { "epoch": 0.213258785942492, "grad_norm": 2.004530964431762, "learning_rate": 7.101063829787235e-06, "loss": 0.2024, "step": 534 }, { "epoch": 0.21405750798722045, "grad_norm": 1.9649699331241137, "learning_rate": 7.127659574468085e-06, "loss": 0.201, "step": 536 }, { "epoch": 0.21485623003194887, "grad_norm": 1.998147156306178, "learning_rate": 7.154255319148937e-06, "loss": 0.1925, "step": 538 }, { "epoch": 0.21565495207667731, "grad_norm": 1.872474740663219, "learning_rate": 7.1808510638297875e-06, "loss": 0.1979, "step": 540 }, { "epoch": 0.21645367412140576, "grad_norm": 2.0711681755729634, "learning_rate": 7.207446808510638e-06, "loss": 0.2468, "step": 542 }, { "epoch": 0.21725239616613418, "grad_norm": 1.9701565676595827, "learning_rate": 7.234042553191491e-06, "loss": 0.2092, "step": 544 }, { "epoch": 0.21805111821086262, "grad_norm": 2.0764550423720927, "learning_rate": 7.260638297872341e-06, "loss": 0.2148, "step": 546 }, { "epoch": 0.21884984025559107, "grad_norm": 3.213288011368614, "learning_rate": 7.287234042553192e-06, "loss": 0.2287, "step": 548 }, { "epoch": 0.21964856230031948, "grad_norm": 1.8177331310953289, "learning_rate": 7.313829787234044e-06, "loss": 0.207, "step": 550 }, { "epoch": 0.22044728434504793, "grad_norm": 1.7655366549601414, "learning_rate": 7.340425531914894e-06, "loss": 0.2064, "step": 552 }, { "epoch": 0.22124600638977635, "grad_norm": 1.9594454321675279, "learning_rate": 7.367021276595745e-06, "loss": 0.1922, "step": 554 }, { "epoch": 0.2220447284345048, "grad_norm": 2.1733754411582407, "learning_rate": 7.3936170212765965e-06, "loss": 0.2266, "step": 556 }, { "epoch": 0.22284345047923323, "grad_norm": 1.8248091216623237, "learning_rate": 7.420212765957447e-06, "loss": 0.2221, "step": 558 }, { "epoch": 0.22364217252396165, "grad_norm": 2.058872918004827, "learning_rate": 7.446808510638298e-06, "loss": 0.2225, "step": 560 }, { "epoch": 0.2244408945686901, "grad_norm": 1.909102118780366, "learning_rate": 7.47340425531915e-06, "loss": 0.1983, "step": 562 }, { "epoch": 0.22523961661341854, "grad_norm": 2.0278745632782584, "learning_rate": 7.500000000000001e-06, "loss": 0.2076, "step": 564 }, { "epoch": 0.22603833865814696, "grad_norm": 1.8175943014340494, "learning_rate": 7.526595744680851e-06, "loss": 0.1953, "step": 566 }, { "epoch": 0.2268370607028754, "grad_norm": 2.025268083189413, "learning_rate": 7.553191489361703e-06, "loss": 0.2338, "step": 568 }, { "epoch": 0.22763578274760382, "grad_norm": 1.8856154811157113, "learning_rate": 7.579787234042554e-06, "loss": 0.2077, "step": 570 }, { "epoch": 0.22843450479233227, "grad_norm": 2.59965148545968, "learning_rate": 7.606382978723405e-06, "loss": 0.2057, "step": 572 }, { "epoch": 0.2292332268370607, "grad_norm": 1.977278774789199, "learning_rate": 7.632978723404256e-06, "loss": 0.2021, "step": 574 }, { "epoch": 0.23003194888178913, "grad_norm": 2.1109133002264735, "learning_rate": 7.659574468085107e-06, "loss": 0.2156, "step": 576 }, { "epoch": 0.23083067092651757, "grad_norm": 2.017717675854075, "learning_rate": 7.686170212765958e-06, "loss": 0.2235, "step": 578 }, { "epoch": 0.23162939297124602, "grad_norm": 1.8309604789907734, "learning_rate": 7.71276595744681e-06, "loss": 0.1801, "step": 580 }, { "epoch": 0.23242811501597443, "grad_norm": 1.7699335955775852, "learning_rate": 7.73936170212766e-06, "loss": 0.1943, "step": 582 }, { "epoch": 0.23322683706070288, "grad_norm": 1.9194267663124567, "learning_rate": 7.765957446808511e-06, "loss": 0.2153, "step": 584 }, { "epoch": 0.2340255591054313, "grad_norm": 2.097345382192139, "learning_rate": 7.792553191489362e-06, "loss": 0.2338, "step": 586 }, { "epoch": 0.23482428115015974, "grad_norm": 1.8940063065960397, "learning_rate": 7.819148936170213e-06, "loss": 0.2031, "step": 588 }, { "epoch": 0.2356230031948882, "grad_norm": 1.9690164598346551, "learning_rate": 7.845744680851064e-06, "loss": 0.2118, "step": 590 }, { "epoch": 0.2364217252396166, "grad_norm": 3.228651225314339, "learning_rate": 7.872340425531916e-06, "loss": 0.2039, "step": 592 }, { "epoch": 0.23722044728434505, "grad_norm": 1.9156226006051282, "learning_rate": 7.898936170212767e-06, "loss": 0.2252, "step": 594 }, { "epoch": 0.2380191693290735, "grad_norm": 1.7768150211461646, "learning_rate": 7.925531914893617e-06, "loss": 0.225, "step": 596 }, { "epoch": 0.2388178913738019, "grad_norm": 1.9652050222591573, "learning_rate": 7.95212765957447e-06, "loss": 0.1965, "step": 598 }, { "epoch": 0.23961661341853036, "grad_norm": 2.0572778537541323, "learning_rate": 7.97872340425532e-06, "loss": 0.203, "step": 600 }, { "epoch": 0.24041533546325877, "grad_norm": 1.9770278124409975, "learning_rate": 8.005319148936171e-06, "loss": 0.2106, "step": 602 }, { "epoch": 0.24121405750798722, "grad_norm": 1.8557771165334138, "learning_rate": 8.031914893617022e-06, "loss": 0.2075, "step": 604 }, { "epoch": 0.24201277955271566, "grad_norm": 1.770990487744795, "learning_rate": 8.058510638297873e-06, "loss": 0.2138, "step": 606 }, { "epoch": 0.24281150159744408, "grad_norm": 1.6450172966793275, "learning_rate": 8.085106382978723e-06, "loss": 0.1858, "step": 608 }, { "epoch": 0.24361022364217252, "grad_norm": 1.6531257153455905, "learning_rate": 8.111702127659576e-06, "loss": 0.2181, "step": 610 }, { "epoch": 0.24440894568690097, "grad_norm": 1.7953441983194227, "learning_rate": 8.138297872340426e-06, "loss": 0.2168, "step": 612 }, { "epoch": 0.2452076677316294, "grad_norm": 1.9624853486169551, "learning_rate": 8.164893617021277e-06, "loss": 0.2025, "step": 614 }, { "epoch": 0.24600638977635783, "grad_norm": 1.9202333243211456, "learning_rate": 8.191489361702128e-06, "loss": 0.2094, "step": 616 }, { "epoch": 0.24680511182108625, "grad_norm": 1.9054794785418179, "learning_rate": 8.218085106382978e-06, "loss": 0.2287, "step": 618 }, { "epoch": 0.2476038338658147, "grad_norm": 1.6738673715101495, "learning_rate": 8.24468085106383e-06, "loss": 0.2113, "step": 620 }, { "epoch": 0.24840255591054314, "grad_norm": 1.6070602027792467, "learning_rate": 8.271276595744682e-06, "loss": 0.1973, "step": 622 }, { "epoch": 0.24920127795527156, "grad_norm": 2.0785351155835134, "learning_rate": 8.297872340425532e-06, "loss": 0.2166, "step": 624 }, { "epoch": 0.25, "grad_norm": 1.7980372006314775, "learning_rate": 8.324468085106385e-06, "loss": 0.2113, "step": 626 }, { "epoch": 0.2507987220447284, "grad_norm": 2.0745759191840096, "learning_rate": 8.351063829787235e-06, "loss": 0.2359, "step": 628 }, { "epoch": 0.2515974440894569, "grad_norm": 1.9864922897804826, "learning_rate": 8.377659574468086e-06, "loss": 0.2062, "step": 630 }, { "epoch": 0.2523961661341853, "grad_norm": 2.030788677329014, "learning_rate": 8.404255319148937e-06, "loss": 0.2217, "step": 632 }, { "epoch": 0.2531948881789137, "grad_norm": 1.7547740269944636, "learning_rate": 8.430851063829787e-06, "loss": 0.2203, "step": 634 }, { "epoch": 0.2539936102236422, "grad_norm": 1.7889762015226203, "learning_rate": 8.457446808510638e-06, "loss": 0.2165, "step": 636 }, { "epoch": 0.2547923322683706, "grad_norm": 1.6288012322878784, "learning_rate": 8.48404255319149e-06, "loss": 0.1979, "step": 638 }, { "epoch": 0.25559105431309903, "grad_norm": 1.7348141744007637, "learning_rate": 8.510638297872341e-06, "loss": 0.1996, "step": 640 }, { "epoch": 0.2563897763578275, "grad_norm": 1.7262432044292573, "learning_rate": 8.537234042553192e-06, "loss": 0.1901, "step": 642 }, { "epoch": 0.2571884984025559, "grad_norm": 1.9327837811354087, "learning_rate": 8.563829787234044e-06, "loss": 0.2001, "step": 644 }, { "epoch": 0.25798722044728434, "grad_norm": 2.211386389180962, "learning_rate": 8.590425531914895e-06, "loss": 0.2213, "step": 646 }, { "epoch": 0.25878594249201275, "grad_norm": 1.8243601975607768, "learning_rate": 8.617021276595746e-06, "loss": 0.224, "step": 648 }, { "epoch": 0.2595846645367412, "grad_norm": 1.932439055129622, "learning_rate": 8.643617021276596e-06, "loss": 0.1924, "step": 650 }, { "epoch": 0.26038338658146964, "grad_norm": 1.746238022604233, "learning_rate": 8.670212765957447e-06, "loss": 0.2149, "step": 652 }, { "epoch": 0.26118210862619806, "grad_norm": 1.7613718087296124, "learning_rate": 8.696808510638298e-06, "loss": 0.2085, "step": 654 }, { "epoch": 0.26198083067092653, "grad_norm": 1.7993776636313457, "learning_rate": 8.72340425531915e-06, "loss": 0.2287, "step": 656 }, { "epoch": 0.26277955271565495, "grad_norm": 2.6631359329733972, "learning_rate": 8.750000000000001e-06, "loss": 0.1972, "step": 658 }, { "epoch": 0.26357827476038337, "grad_norm": 2.0244414755737146, "learning_rate": 8.776595744680852e-06, "loss": 0.2113, "step": 660 }, { "epoch": 0.26437699680511184, "grad_norm": 2.635052210402534, "learning_rate": 8.803191489361704e-06, "loss": 0.1862, "step": 662 }, { "epoch": 0.26517571884984026, "grad_norm": 5.482936732062825, "learning_rate": 8.829787234042555e-06, "loss": 0.2109, "step": 664 }, { "epoch": 0.2659744408945687, "grad_norm": 1.8743456888792152, "learning_rate": 8.856382978723404e-06, "loss": 0.2045, "step": 666 }, { "epoch": 0.26677316293929715, "grad_norm": 1.4845513209949641, "learning_rate": 8.882978723404256e-06, "loss": 0.1779, "step": 668 }, { "epoch": 0.26757188498402557, "grad_norm": 1.7833087776873404, "learning_rate": 8.909574468085107e-06, "loss": 0.1949, "step": 670 }, { "epoch": 0.268370607028754, "grad_norm": 1.7237851857753832, "learning_rate": 8.936170212765958e-06, "loss": 0.2309, "step": 672 }, { "epoch": 0.26916932907348246, "grad_norm": 1.7943976011736698, "learning_rate": 8.96276595744681e-06, "loss": 0.2246, "step": 674 }, { "epoch": 0.26996805111821087, "grad_norm": 1.6645946104715852, "learning_rate": 8.98936170212766e-06, "loss": 0.213, "step": 676 }, { "epoch": 0.2707667731629393, "grad_norm": 1.6878179409944127, "learning_rate": 9.015957446808511e-06, "loss": 0.2177, "step": 678 }, { "epoch": 0.2715654952076677, "grad_norm": 1.6507645045148651, "learning_rate": 9.042553191489362e-06, "loss": 0.1924, "step": 680 }, { "epoch": 0.2723642172523962, "grad_norm": 1.9095635971119895, "learning_rate": 9.069148936170213e-06, "loss": 0.2165, "step": 682 }, { "epoch": 0.2731629392971246, "grad_norm": 1.630380756355513, "learning_rate": 9.095744680851063e-06, "loss": 0.2158, "step": 684 }, { "epoch": 0.273961661341853, "grad_norm": 1.7077429176964645, "learning_rate": 9.122340425531916e-06, "loss": 0.2023, "step": 686 }, { "epoch": 0.2747603833865815, "grad_norm": 1.7776755211491289, "learning_rate": 9.148936170212767e-06, "loss": 0.2179, "step": 688 }, { "epoch": 0.2755591054313099, "grad_norm": 1.6954241240154826, "learning_rate": 9.175531914893617e-06, "loss": 0.2137, "step": 690 }, { "epoch": 0.2763578274760383, "grad_norm": 1.7891304751555133, "learning_rate": 9.20212765957447e-06, "loss": 0.2117, "step": 692 }, { "epoch": 0.2771565495207668, "grad_norm": 1.9639021184922467, "learning_rate": 9.22872340425532e-06, "loss": 0.2372, "step": 694 }, { "epoch": 0.2779552715654952, "grad_norm": 1.5797174817631339, "learning_rate": 9.255319148936171e-06, "loss": 0.2063, "step": 696 }, { "epoch": 0.2787539936102236, "grad_norm": 1.8249112869928803, "learning_rate": 9.281914893617022e-06, "loss": 0.2095, "step": 698 }, { "epoch": 0.2795527156549521, "grad_norm": 1.7706586811506164, "learning_rate": 9.308510638297872e-06, "loss": 0.2073, "step": 700 }, { "epoch": 0.2803514376996805, "grad_norm": 1.7271382999203275, "learning_rate": 9.335106382978723e-06, "loss": 0.2099, "step": 702 }, { "epoch": 0.28115015974440893, "grad_norm": 1.6484304946170987, "learning_rate": 9.361702127659576e-06, "loss": 0.2133, "step": 704 }, { "epoch": 0.2819488817891374, "grad_norm": 1.724151909486214, "learning_rate": 9.388297872340426e-06, "loss": 0.212, "step": 706 }, { "epoch": 0.2827476038338658, "grad_norm": 1.5897126511168378, "learning_rate": 9.414893617021279e-06, "loss": 0.2, "step": 708 }, { "epoch": 0.28354632587859424, "grad_norm": 1.7835429984815048, "learning_rate": 9.44148936170213e-06, "loss": 0.2318, "step": 710 }, { "epoch": 0.28434504792332266, "grad_norm": 1.7751330791980955, "learning_rate": 9.46808510638298e-06, "loss": 0.2316, "step": 712 }, { "epoch": 0.28514376996805113, "grad_norm": 1.8933530962247043, "learning_rate": 9.49468085106383e-06, "loss": 0.2265, "step": 714 }, { "epoch": 0.28594249201277955, "grad_norm": 1.561209719510769, "learning_rate": 9.521276595744681e-06, "loss": 0.199, "step": 716 }, { "epoch": 0.28674121405750796, "grad_norm": 1.8127384648554525, "learning_rate": 9.547872340425532e-06, "loss": 0.213, "step": 718 }, { "epoch": 0.28753993610223644, "grad_norm": 3.645955534528657, "learning_rate": 9.574468085106385e-06, "loss": 0.2213, "step": 720 }, { "epoch": 0.28833865814696485, "grad_norm": 2.2672006586352476, "learning_rate": 9.601063829787235e-06, "loss": 0.2139, "step": 722 }, { "epoch": 0.28913738019169327, "grad_norm": 1.8519024796984893, "learning_rate": 9.627659574468086e-06, "loss": 0.2153, "step": 724 }, { "epoch": 0.28993610223642174, "grad_norm": 1.560446824326613, "learning_rate": 9.654255319148937e-06, "loss": 0.2023, "step": 726 }, { "epoch": 0.29073482428115016, "grad_norm": 1.7699204671345776, "learning_rate": 9.680851063829787e-06, "loss": 0.2238, "step": 728 }, { "epoch": 0.2915335463258786, "grad_norm": 1.6865554967271952, "learning_rate": 9.707446808510638e-06, "loss": 0.2029, "step": 730 }, { "epoch": 0.29233226837060705, "grad_norm": 1.6482995869085524, "learning_rate": 9.73404255319149e-06, "loss": 0.189, "step": 732 }, { "epoch": 0.29313099041533547, "grad_norm": 1.7007866856749894, "learning_rate": 9.760638297872341e-06, "loss": 0.2203, "step": 734 }, { "epoch": 0.2939297124600639, "grad_norm": 2.0612744928368585, "learning_rate": 9.787234042553192e-06, "loss": 0.2011, "step": 736 }, { "epoch": 0.29472843450479236, "grad_norm": 1.6349743417964269, "learning_rate": 9.813829787234044e-06, "loss": 0.217, "step": 738 }, { "epoch": 0.2955271565495208, "grad_norm": 1.6647666932854406, "learning_rate": 9.840425531914895e-06, "loss": 0.2083, "step": 740 }, { "epoch": 0.2963258785942492, "grad_norm": 1.6704980371536418, "learning_rate": 9.867021276595746e-06, "loss": 0.1969, "step": 742 }, { "epoch": 0.2971246006389776, "grad_norm": 1.6928260652626874, "learning_rate": 9.893617021276596e-06, "loss": 0.1927, "step": 744 }, { "epoch": 0.2979233226837061, "grad_norm": 1.6889079212558915, "learning_rate": 9.920212765957447e-06, "loss": 0.2108, "step": 746 }, { "epoch": 0.2987220447284345, "grad_norm": 1.6008195623226404, "learning_rate": 9.946808510638298e-06, "loss": 0.2079, "step": 748 }, { "epoch": 0.2995207667731629, "grad_norm": 1.6391899308564928, "learning_rate": 9.97340425531915e-06, "loss": 0.224, "step": 750 }, { "epoch": 0.3003194888178914, "grad_norm": 1.6579897287838672, "learning_rate": 1e-05, "loss": 0.2007, "step": 752 }, { "epoch": 0.3011182108626198, "grad_norm": 1.6060266288421388, "learning_rate": 9.99999784023588e-06, "loss": 0.1831, "step": 754 }, { "epoch": 0.3019169329073482, "grad_norm": 1.5755540572254816, "learning_rate": 9.999991360945382e-06, "loss": 0.2366, "step": 756 }, { "epoch": 0.3027156549520767, "grad_norm": 1.7099166011335876, "learning_rate": 9.999980562134104e-06, "loss": 0.2231, "step": 758 }, { "epoch": 0.3035143769968051, "grad_norm": 1.5689663504063338, "learning_rate": 9.999965443811378e-06, "loss": 0.1979, "step": 760 }, { "epoch": 0.30431309904153353, "grad_norm": 2.1289064244731657, "learning_rate": 9.999946005990262e-06, "loss": 0.2284, "step": 762 }, { "epoch": 0.305111821086262, "grad_norm": 1.530947535684416, "learning_rate": 9.99992224868755e-06, "loss": 0.2067, "step": 764 }, { "epoch": 0.3059105431309904, "grad_norm": 1.666888953841592, "learning_rate": 9.999894171923764e-06, "loss": 0.2061, "step": 766 }, { "epoch": 0.30670926517571884, "grad_norm": 1.5905369733593537, "learning_rate": 9.999861775723162e-06, "loss": 0.2047, "step": 768 }, { "epoch": 0.3075079872204473, "grad_norm": 1.9351094836921137, "learning_rate": 9.99982506011373e-06, "loss": 0.2111, "step": 770 }, { "epoch": 0.3083067092651757, "grad_norm": 1.6775809786308205, "learning_rate": 9.999784025127187e-06, "loss": 0.2009, "step": 772 }, { "epoch": 0.30910543130990414, "grad_norm": 1.6337987957663225, "learning_rate": 9.999738670798983e-06, "loss": 0.1965, "step": 774 }, { "epoch": 0.30990415335463256, "grad_norm": 1.6853294588250392, "learning_rate": 9.999688997168301e-06, "loss": 0.2094, "step": 776 }, { "epoch": 0.31070287539936103, "grad_norm": 2.380951431702259, "learning_rate": 9.999635004278054e-06, "loss": 0.2131, "step": 778 }, { "epoch": 0.31150159744408945, "grad_norm": 1.6821758607705601, "learning_rate": 9.999576692174887e-06, "loss": 0.2148, "step": 780 }, { "epoch": 0.31230031948881787, "grad_norm": 1.5362646502228101, "learning_rate": 9.999514060909175e-06, "loss": 0.2348, "step": 782 }, { "epoch": 0.31309904153354634, "grad_norm": 1.6226459616330424, "learning_rate": 9.999447110535026e-06, "loss": 0.2031, "step": 784 }, { "epoch": 0.31389776357827476, "grad_norm": 1.494547748123698, "learning_rate": 9.999375841110277e-06, "loss": 0.2004, "step": 786 }, { "epoch": 0.3146964856230032, "grad_norm": 1.800819253841243, "learning_rate": 9.999300252696502e-06, "loss": 0.2446, "step": 788 }, { "epoch": 0.31549520766773165, "grad_norm": 1.60492245047213, "learning_rate": 9.999220345359e-06, "loss": 0.2044, "step": 790 }, { "epoch": 0.31629392971246006, "grad_norm": 1.8934326067851825, "learning_rate": 9.999136119166803e-06, "loss": 0.2043, "step": 792 }, { "epoch": 0.3170926517571885, "grad_norm": 1.5648068073113917, "learning_rate": 9.999047574192677e-06, "loss": 0.2089, "step": 794 }, { "epoch": 0.31789137380191695, "grad_norm": 1.6312196131131216, "learning_rate": 9.998954710513113e-06, "loss": 0.1983, "step": 796 }, { "epoch": 0.31869009584664537, "grad_norm": 1.4933485598542207, "learning_rate": 9.998857528208337e-06, "loss": 0.2149, "step": 798 }, { "epoch": 0.3194888178913738, "grad_norm": 1.7284519491739558, "learning_rate": 9.998756027362308e-06, "loss": 0.2049, "step": 800 }, { "epoch": 0.32028753993610226, "grad_norm": 1.6062264896600003, "learning_rate": 9.998650208062713e-06, "loss": 0.2302, "step": 802 }, { "epoch": 0.3210862619808307, "grad_norm": 1.541668593359599, "learning_rate": 9.998540070400966e-06, "loss": 0.214, "step": 804 }, { "epoch": 0.3218849840255591, "grad_norm": 1.5744953156084605, "learning_rate": 9.998425614472217e-06, "loss": 0.2209, "step": 806 }, { "epoch": 0.3226837060702875, "grad_norm": 1.5946944416094957, "learning_rate": 9.99830684037535e-06, "loss": 0.2209, "step": 808 }, { "epoch": 0.323482428115016, "grad_norm": 1.6547243881770919, "learning_rate": 9.998183748212968e-06, "loss": 0.2291, "step": 810 }, { "epoch": 0.3242811501597444, "grad_norm": 1.5020701296905943, "learning_rate": 9.998056338091415e-06, "loss": 0.1947, "step": 812 }, { "epoch": 0.3250798722044728, "grad_norm": 1.775646858056913, "learning_rate": 9.997924610120758e-06, "loss": 0.2122, "step": 814 }, { "epoch": 0.3258785942492013, "grad_norm": 1.9643173710094557, "learning_rate": 9.9977885644148e-06, "loss": 0.2304, "step": 816 }, { "epoch": 0.3266773162939297, "grad_norm": 2.2694487421961305, "learning_rate": 9.997648201091073e-06, "loss": 0.1922, "step": 818 }, { "epoch": 0.3274760383386581, "grad_norm": 1.6657139292549983, "learning_rate": 9.997503520270837e-06, "loss": 0.2166, "step": 820 }, { "epoch": 0.3282747603833866, "grad_norm": 1.419957033481622, "learning_rate": 9.997354522079078e-06, "loss": 0.2025, "step": 822 }, { "epoch": 0.329073482428115, "grad_norm": 1.499108476748773, "learning_rate": 9.997201206644522e-06, "loss": 0.2159, "step": 824 }, { "epoch": 0.32987220447284343, "grad_norm": 1.624603961236652, "learning_rate": 9.997043574099616e-06, "loss": 0.2147, "step": 826 }, { "epoch": 0.3306709265175719, "grad_norm": 1.5313961974893107, "learning_rate": 9.996881624580542e-06, "loss": 0.2063, "step": 828 }, { "epoch": 0.3314696485623003, "grad_norm": 1.5963165181678973, "learning_rate": 9.996715358227208e-06, "loss": 0.2163, "step": 830 }, { "epoch": 0.33226837060702874, "grad_norm": 2.5507679267995793, "learning_rate": 9.99654477518325e-06, "loss": 0.1961, "step": 832 }, { "epoch": 0.3330670926517572, "grad_norm": 1.729152523099962, "learning_rate": 9.99636987559604e-06, "loss": 0.2024, "step": 834 }, { "epoch": 0.33386581469648563, "grad_norm": 1.7210992603515631, "learning_rate": 9.99619065961667e-06, "loss": 0.2252, "step": 836 }, { "epoch": 0.33466453674121405, "grad_norm": 1.560277624031283, "learning_rate": 9.99600712739997e-06, "loss": 0.2267, "step": 838 }, { "epoch": 0.3354632587859425, "grad_norm": 1.4591188372864774, "learning_rate": 9.995819279104494e-06, "loss": 0.2011, "step": 840 }, { "epoch": 0.33626198083067094, "grad_norm": 1.5041613555668587, "learning_rate": 9.995627114892522e-06, "loss": 0.2274, "step": 842 }, { "epoch": 0.33706070287539935, "grad_norm": 1.6583699889454577, "learning_rate": 9.995430634930068e-06, "loss": 0.1911, "step": 844 }, { "epoch": 0.33785942492012777, "grad_norm": 1.4587897948416573, "learning_rate": 9.99522983938687e-06, "loss": 0.2237, "step": 846 }, { "epoch": 0.33865814696485624, "grad_norm": 1.503331285695136, "learning_rate": 9.995024728436402e-06, "loss": 0.2094, "step": 848 }, { "epoch": 0.33945686900958466, "grad_norm": 1.535943980164657, "learning_rate": 9.994815302255854e-06, "loss": 0.2166, "step": 850 }, { "epoch": 0.3402555910543131, "grad_norm": 1.6504775369785527, "learning_rate": 9.994601561026156e-06, "loss": 0.2002, "step": 852 }, { "epoch": 0.34105431309904155, "grad_norm": 1.4302740240626306, "learning_rate": 9.994383504931955e-06, "loss": 0.2082, "step": 854 }, { "epoch": 0.34185303514376997, "grad_norm": 1.593656408563002, "learning_rate": 9.994161134161635e-06, "loss": 0.2094, "step": 856 }, { "epoch": 0.3426517571884984, "grad_norm": 1.3695207582393358, "learning_rate": 9.9939344489073e-06, "loss": 0.1919, "step": 858 }, { "epoch": 0.34345047923322686, "grad_norm": 1.6060661840602912, "learning_rate": 9.993703449364787e-06, "loss": 0.206, "step": 860 }, { "epoch": 0.3442492012779553, "grad_norm": 1.4773210324662711, "learning_rate": 9.993468135733658e-06, "loss": 0.2192, "step": 862 }, { "epoch": 0.3450479233226837, "grad_norm": 1.406665980225023, "learning_rate": 9.993228508217201e-06, "loss": 0.1907, "step": 864 }, { "epoch": 0.34584664536741216, "grad_norm": 1.5457381119375846, "learning_rate": 9.99298456702243e-06, "loss": 0.188, "step": 866 }, { "epoch": 0.3466453674121406, "grad_norm": 1.392632813313601, "learning_rate": 9.99273631236009e-06, "loss": 0.1814, "step": 868 }, { "epoch": 0.347444089456869, "grad_norm": 1.541510486431838, "learning_rate": 9.992483744444648e-06, "loss": 0.202, "step": 870 }, { "epoch": 0.34824281150159747, "grad_norm": 1.416164840329261, "learning_rate": 9.9922268634943e-06, "loss": 0.2034, "step": 872 }, { "epoch": 0.3490415335463259, "grad_norm": 1.52233433092975, "learning_rate": 9.991965669730965e-06, "loss": 0.2186, "step": 874 }, { "epoch": 0.3498402555910543, "grad_norm": 1.59873194009716, "learning_rate": 9.991700163380292e-06, "loss": 0.2353, "step": 876 }, { "epoch": 0.3506389776357827, "grad_norm": 1.7195541732829867, "learning_rate": 9.991430344671653e-06, "loss": 0.2121, "step": 878 }, { "epoch": 0.3514376996805112, "grad_norm": 1.6835655527767803, "learning_rate": 9.991156213838143e-06, "loss": 0.2048, "step": 880 }, { "epoch": 0.3522364217252396, "grad_norm": 1.4417934095264713, "learning_rate": 9.990877771116588e-06, "loss": 0.1967, "step": 882 }, { "epoch": 0.35303514376996803, "grad_norm": 1.6176474246414694, "learning_rate": 9.990595016747536e-06, "loss": 0.203, "step": 884 }, { "epoch": 0.3538338658146965, "grad_norm": 1.5007198389997798, "learning_rate": 9.99030795097526e-06, "loss": 0.2201, "step": 886 }, { "epoch": 0.3546325878594249, "grad_norm": 1.628516285198201, "learning_rate": 9.990016574047757e-06, "loss": 0.208, "step": 888 }, { "epoch": 0.35543130990415334, "grad_norm": 1.5813428579068682, "learning_rate": 9.989720886216749e-06, "loss": 0.2196, "step": 890 }, { "epoch": 0.3562300319488818, "grad_norm": 1.5508584713469626, "learning_rate": 9.989420887737684e-06, "loss": 0.2145, "step": 892 }, { "epoch": 0.3570287539936102, "grad_norm": 1.7163590462789873, "learning_rate": 9.989116578869732e-06, "loss": 0.2393, "step": 894 }, { "epoch": 0.35782747603833864, "grad_norm": 1.361569590542389, "learning_rate": 9.988807959875785e-06, "loss": 0.2064, "step": 896 }, { "epoch": 0.3586261980830671, "grad_norm": 1.451447508537209, "learning_rate": 9.988495031022465e-06, "loss": 0.204, "step": 898 }, { "epoch": 0.35942492012779553, "grad_norm": 1.4487071973493906, "learning_rate": 9.988177792580107e-06, "loss": 0.2009, "step": 900 }, { "epoch": 0.36022364217252395, "grad_norm": 1.419806864642736, "learning_rate": 9.98785624482278e-06, "loss": 0.2024, "step": 902 }, { "epoch": 0.3610223642172524, "grad_norm": 1.578737310761318, "learning_rate": 9.987530388028269e-06, "loss": 0.2092, "step": 904 }, { "epoch": 0.36182108626198084, "grad_norm": 1.6200056976952275, "learning_rate": 9.987200222478084e-06, "loss": 0.209, "step": 906 }, { "epoch": 0.36261980830670926, "grad_norm": 1.6251607361656684, "learning_rate": 9.986865748457457e-06, "loss": 0.2102, "step": 908 }, { "epoch": 0.3634185303514377, "grad_norm": 1.60473197937151, "learning_rate": 9.986526966255341e-06, "loss": 0.2188, "step": 910 }, { "epoch": 0.36421725239616615, "grad_norm": 1.4451319185877254, "learning_rate": 9.986183876164412e-06, "loss": 0.211, "step": 912 }, { "epoch": 0.36501597444089456, "grad_norm": 1.7222368755921156, "learning_rate": 9.985836478481069e-06, "loss": 0.2093, "step": 914 }, { "epoch": 0.365814696485623, "grad_norm": 1.5858620117161173, "learning_rate": 9.98548477350543e-06, "loss": 0.2128, "step": 916 }, { "epoch": 0.36661341853035145, "grad_norm": 1.5923894300594037, "learning_rate": 9.985128761541334e-06, "loss": 0.2185, "step": 918 }, { "epoch": 0.36741214057507987, "grad_norm": 1.5815229483720405, "learning_rate": 9.984768442896342e-06, "loss": 0.232, "step": 920 }, { "epoch": 0.3682108626198083, "grad_norm": 1.3902654600389392, "learning_rate": 9.984403817881736e-06, "loss": 0.1875, "step": 922 }, { "epoch": 0.36900958466453676, "grad_norm": 1.434425802911243, "learning_rate": 9.984034886812519e-06, "loss": 0.1891, "step": 924 }, { "epoch": 0.3698083067092652, "grad_norm": 1.4965650327837223, "learning_rate": 9.98366165000741e-06, "loss": 0.212, "step": 926 }, { "epoch": 0.3706070287539936, "grad_norm": 1.5934149301565348, "learning_rate": 9.983284107788852e-06, "loss": 0.2081, "step": 928 }, { "epoch": 0.37140575079872207, "grad_norm": 1.5649065798161081, "learning_rate": 9.982902260483003e-06, "loss": 0.2148, "step": 930 }, { "epoch": 0.3722044728434505, "grad_norm": 1.5919056980501831, "learning_rate": 9.982516108419746e-06, "loss": 0.2088, "step": 932 }, { "epoch": 0.3730031948881789, "grad_norm": 1.3802136089446853, "learning_rate": 9.982125651932681e-06, "loss": 0.1826, "step": 934 }, { "epoch": 0.3738019169329074, "grad_norm": 1.4882986051122178, "learning_rate": 9.981730891359123e-06, "loss": 0.2278, "step": 936 }, { "epoch": 0.3746006389776358, "grad_norm": 1.377902572686802, "learning_rate": 9.981331827040109e-06, "loss": 0.2011, "step": 938 }, { "epoch": 0.3753993610223642, "grad_norm": 1.574021407706636, "learning_rate": 9.980928459320393e-06, "loss": 0.2101, "step": 940 }, { "epoch": 0.3761980830670926, "grad_norm": 1.4461584644500969, "learning_rate": 9.980520788548445e-06, "loss": 0.1986, "step": 942 }, { "epoch": 0.3769968051118211, "grad_norm": 1.5726540162240885, "learning_rate": 9.980108815076456e-06, "loss": 0.2079, "step": 944 }, { "epoch": 0.3777955271565495, "grad_norm": 1.631975663438154, "learning_rate": 9.979692539260331e-06, "loss": 0.2071, "step": 946 }, { "epoch": 0.37859424920127793, "grad_norm": 1.3678329735675085, "learning_rate": 9.979271961459696e-06, "loss": 0.2109, "step": 948 }, { "epoch": 0.3793929712460064, "grad_norm": 1.641531450859251, "learning_rate": 9.978847082037886e-06, "loss": 0.1935, "step": 950 }, { "epoch": 0.3801916932907348, "grad_norm": 1.5492312533670054, "learning_rate": 9.978417901361958e-06, "loss": 0.2046, "step": 952 }, { "epoch": 0.38099041533546324, "grad_norm": 1.7625259362995305, "learning_rate": 9.977984419802686e-06, "loss": 0.2085, "step": 954 }, { "epoch": 0.3817891373801917, "grad_norm": 1.3629858192513753, "learning_rate": 9.977546637734557e-06, "loss": 0.2026, "step": 956 }, { "epoch": 0.38258785942492013, "grad_norm": 1.6553817872981966, "learning_rate": 9.97710455553577e-06, "loss": 0.1986, "step": 958 }, { "epoch": 0.38338658146964855, "grad_norm": 1.6709973514366674, "learning_rate": 9.976658173588244e-06, "loss": 0.1885, "step": 960 }, { "epoch": 0.384185303514377, "grad_norm": 1.5781155572362893, "learning_rate": 9.976207492277612e-06, "loss": 0.1978, "step": 962 }, { "epoch": 0.38498402555910544, "grad_norm": 1.5471644306914103, "learning_rate": 9.97575251199322e-06, "loss": 0.2062, "step": 964 }, { "epoch": 0.38578274760383385, "grad_norm": 1.6210596494320104, "learning_rate": 9.975293233128128e-06, "loss": 0.211, "step": 966 }, { "epoch": 0.3865814696485623, "grad_norm": 1.4763420559956302, "learning_rate": 9.974829656079106e-06, "loss": 0.2018, "step": 968 }, { "epoch": 0.38738019169329074, "grad_norm": 1.6383352001216414, "learning_rate": 9.974361781246647e-06, "loss": 0.2094, "step": 970 }, { "epoch": 0.38817891373801916, "grad_norm": 1.7933627710025113, "learning_rate": 9.973889609034945e-06, "loss": 0.2163, "step": 972 }, { "epoch": 0.3889776357827476, "grad_norm": 1.5400540527111912, "learning_rate": 9.973413139851918e-06, "loss": 0.1863, "step": 974 }, { "epoch": 0.38977635782747605, "grad_norm": 1.3560103646758506, "learning_rate": 9.972932374109184e-06, "loss": 0.1959, "step": 976 }, { "epoch": 0.39057507987220447, "grad_norm": 1.9026289683010842, "learning_rate": 9.972447312222084e-06, "loss": 0.1825, "step": 978 }, { "epoch": 0.3913738019169329, "grad_norm": 1.5573821099296685, "learning_rate": 9.971957954609663e-06, "loss": 0.206, "step": 980 }, { "epoch": 0.39217252396166136, "grad_norm": 1.463153557338603, "learning_rate": 9.971464301694683e-06, "loss": 0.2139, "step": 982 }, { "epoch": 0.3929712460063898, "grad_norm": 1.4361640616504048, "learning_rate": 9.97096635390361e-06, "loss": 0.2039, "step": 984 }, { "epoch": 0.3937699680511182, "grad_norm": 1.3944298003112567, "learning_rate": 9.970464111666627e-06, "loss": 0.2078, "step": 986 }, { "epoch": 0.39456869009584666, "grad_norm": 1.427105562837558, "learning_rate": 9.969957575417621e-06, "loss": 0.1837, "step": 988 }, { "epoch": 0.3953674121405751, "grad_norm": 1.520588415638903, "learning_rate": 9.969446745594193e-06, "loss": 0.2239, "step": 990 }, { "epoch": 0.3961661341853035, "grad_norm": 2.079367024655261, "learning_rate": 9.968931622637652e-06, "loss": 0.1997, "step": 992 }, { "epoch": 0.39696485623003197, "grad_norm": 1.5129459484138021, "learning_rate": 9.968412206993015e-06, "loss": 0.1953, "step": 994 }, { "epoch": 0.3977635782747604, "grad_norm": 1.478216675491551, "learning_rate": 9.967888499109008e-06, "loss": 0.2081, "step": 996 }, { "epoch": 0.3985623003194888, "grad_norm": 1.5236800584109207, "learning_rate": 9.967360499438067e-06, "loss": 0.2043, "step": 998 }, { "epoch": 0.3993610223642173, "grad_norm": 1.4639393772316662, "learning_rate": 9.966828208436332e-06, "loss": 0.2035, "step": 1000 }, { "epoch": 0.3993610223642173, "eval_loss": 0.1855674833059311, "eval_runtime": 418.9403, "eval_samples_per_second": 42.505, "eval_steps_per_second": 5.313, "step": 1000 }, { "epoch": 0.4001597444089457, "grad_norm": 1.925447602175885, "learning_rate": 9.966291626563651e-06, "loss": 0.2141, "step": 1002 }, { "epoch": 0.4009584664536741, "grad_norm": 1.5342974238522853, "learning_rate": 9.965750754283583e-06, "loss": 0.1967, "step": 1004 }, { "epoch": 0.40175718849840253, "grad_norm": 1.365762174319038, "learning_rate": 9.96520559206339e-06, "loss": 0.1787, "step": 1006 }, { "epoch": 0.402555910543131, "grad_norm": 1.6415643245446048, "learning_rate": 9.96465614037404e-06, "loss": 0.2094, "step": 1008 }, { "epoch": 0.4033546325878594, "grad_norm": 1.487881148105119, "learning_rate": 9.964102399690206e-06, "loss": 0.2296, "step": 1010 }, { "epoch": 0.40415335463258784, "grad_norm": 1.3885469945101532, "learning_rate": 9.96354437049027e-06, "loss": 0.1953, "step": 1012 }, { "epoch": 0.4049520766773163, "grad_norm": 1.5376182595234218, "learning_rate": 9.962982053256317e-06, "loss": 0.2067, "step": 1014 }, { "epoch": 0.4057507987220447, "grad_norm": 1.5925253053184176, "learning_rate": 9.962415448474134e-06, "loss": 0.2034, "step": 1016 }, { "epoch": 0.40654952076677314, "grad_norm": 1.5707593570512013, "learning_rate": 9.961844556633216e-06, "loss": 0.2107, "step": 1018 }, { "epoch": 0.4073482428115016, "grad_norm": 1.538254717520436, "learning_rate": 9.961269378226756e-06, "loss": 0.2014, "step": 1020 }, { "epoch": 0.40814696485623003, "grad_norm": 1.5245585221099645, "learning_rate": 9.960689913751658e-06, "loss": 0.2002, "step": 1022 }, { "epoch": 0.40894568690095845, "grad_norm": 1.4248623479875928, "learning_rate": 9.960106163708522e-06, "loss": 0.1883, "step": 1024 }, { "epoch": 0.4097444089456869, "grad_norm": 1.7412362384789994, "learning_rate": 9.959518128601657e-06, "loss": 0.218, "step": 1026 }, { "epoch": 0.41054313099041534, "grad_norm": 1.4925072692951855, "learning_rate": 9.958925808939066e-06, "loss": 0.2111, "step": 1028 }, { "epoch": 0.41134185303514376, "grad_norm": 1.4973115296085746, "learning_rate": 9.958329205232456e-06, "loss": 0.2059, "step": 1030 }, { "epoch": 0.41214057507987223, "grad_norm": 1.468856647822638, "learning_rate": 9.95772831799724e-06, "loss": 0.2093, "step": 1032 }, { "epoch": 0.41293929712460065, "grad_norm": 1.4085028520516685, "learning_rate": 9.957123147752527e-06, "loss": 0.2116, "step": 1034 }, { "epoch": 0.41373801916932906, "grad_norm": 1.3478745438364825, "learning_rate": 9.956513695021126e-06, "loss": 0.2144, "step": 1036 }, { "epoch": 0.4145367412140575, "grad_norm": 1.414787678483826, "learning_rate": 9.955899960329546e-06, "loss": 0.2114, "step": 1038 }, { "epoch": 0.41533546325878595, "grad_norm": 1.4758641440286802, "learning_rate": 9.955281944207998e-06, "loss": 0.2114, "step": 1040 }, { "epoch": 0.41613418530351437, "grad_norm": 1.3808751418519412, "learning_rate": 9.95465964719039e-06, "loss": 0.1996, "step": 1042 }, { "epoch": 0.4169329073482428, "grad_norm": 1.3552203407425432, "learning_rate": 9.954033069814324e-06, "loss": 0.2197, "step": 1044 }, { "epoch": 0.41773162939297126, "grad_norm": 1.7431939971291783, "learning_rate": 9.953402212621107e-06, "loss": 0.1766, "step": 1046 }, { "epoch": 0.4185303514376997, "grad_norm": 1.2999138749816357, "learning_rate": 9.95276707615574e-06, "loss": 0.1742, "step": 1048 }, { "epoch": 0.4193290734824281, "grad_norm": 1.4945842460673373, "learning_rate": 9.952127660966919e-06, "loss": 0.2246, "step": 1050 }, { "epoch": 0.42012779552715657, "grad_norm": 1.5152322746280356, "learning_rate": 9.95148396760704e-06, "loss": 0.2192, "step": 1052 }, { "epoch": 0.420926517571885, "grad_norm": 1.4230040236709558, "learning_rate": 9.950835996632193e-06, "loss": 0.1982, "step": 1054 }, { "epoch": 0.4217252396166134, "grad_norm": 1.3823627265224154, "learning_rate": 9.950183748602164e-06, "loss": 0.2143, "step": 1056 }, { "epoch": 0.4225239616613419, "grad_norm": 1.6379038373994437, "learning_rate": 9.949527224080434e-06, "loss": 0.2006, "step": 1058 }, { "epoch": 0.4233226837060703, "grad_norm": 1.4280283780098375, "learning_rate": 9.948866423634178e-06, "loss": 0.1841, "step": 1060 }, { "epoch": 0.4241214057507987, "grad_norm": 1.417933520198746, "learning_rate": 9.948201347834265e-06, "loss": 0.2032, "step": 1062 }, { "epoch": 0.4249201277955272, "grad_norm": 1.3378469760405771, "learning_rate": 9.947531997255256e-06, "loss": 0.1926, "step": 1064 }, { "epoch": 0.4257188498402556, "grad_norm": 1.5459222264184573, "learning_rate": 9.94685837247541e-06, "loss": 0.2052, "step": 1066 }, { "epoch": 0.426517571884984, "grad_norm": 1.501440385232651, "learning_rate": 9.946180474076675e-06, "loss": 0.2052, "step": 1068 }, { "epoch": 0.4273162939297125, "grad_norm": 1.5358561082091893, "learning_rate": 9.945498302644687e-06, "loss": 0.2034, "step": 1070 }, { "epoch": 0.4281150159744409, "grad_norm": 1.6969353624664514, "learning_rate": 9.944811858768782e-06, "loss": 0.2141, "step": 1072 }, { "epoch": 0.4289137380191693, "grad_norm": 1.4361329940984802, "learning_rate": 9.944121143041982e-06, "loss": 0.1955, "step": 1074 }, { "epoch": 0.42971246006389774, "grad_norm": 1.4944093870984845, "learning_rate": 9.943426156061e-06, "loss": 0.1968, "step": 1076 }, { "epoch": 0.4305111821086262, "grad_norm": 1.265035184804792, "learning_rate": 9.942726898426238e-06, "loss": 0.2113, "step": 1078 }, { "epoch": 0.43130990415335463, "grad_norm": 1.4364726173362738, "learning_rate": 9.94202337074179e-06, "loss": 0.2034, "step": 1080 }, { "epoch": 0.43210862619808305, "grad_norm": 1.4034504271470043, "learning_rate": 9.941315573615437e-06, "loss": 0.1783, "step": 1082 }, { "epoch": 0.4329073482428115, "grad_norm": 1.4068580877161003, "learning_rate": 9.940603507658649e-06, "loss": 0.2071, "step": 1084 }, { "epoch": 0.43370607028753994, "grad_norm": 1.4617297171517643, "learning_rate": 9.939887173486583e-06, "loss": 0.2086, "step": 1086 }, { "epoch": 0.43450479233226835, "grad_norm": 1.4870587568723221, "learning_rate": 9.939166571718086e-06, "loss": 0.2191, "step": 1088 }, { "epoch": 0.4353035143769968, "grad_norm": 1.3330964975081798, "learning_rate": 9.938441702975689e-06, "loss": 0.2105, "step": 1090 }, { "epoch": 0.43610223642172524, "grad_norm": 1.4071934617523967, "learning_rate": 9.93771256788561e-06, "loss": 0.1986, "step": 1092 }, { "epoch": 0.43690095846645366, "grad_norm": 1.4956330891164953, "learning_rate": 9.936979167077754e-06, "loss": 0.2242, "step": 1094 }, { "epoch": 0.43769968051118213, "grad_norm": 1.500694287898403, "learning_rate": 9.936241501185706e-06, "loss": 0.213, "step": 1096 }, { "epoch": 0.43849840255591055, "grad_norm": 1.431648588389163, "learning_rate": 9.935499570846746e-06, "loss": 0.1994, "step": 1098 }, { "epoch": 0.43929712460063897, "grad_norm": 1.3419204436535992, "learning_rate": 9.934753376701827e-06, "loss": 0.1917, "step": 1100 }, { "epoch": 0.44009584664536744, "grad_norm": 1.2901653329852891, "learning_rate": 9.934002919395593e-06, "loss": 0.1923, "step": 1102 }, { "epoch": 0.44089456869009586, "grad_norm": 1.6677348530796035, "learning_rate": 9.933248199576366e-06, "loss": 0.2037, "step": 1104 }, { "epoch": 0.4416932907348243, "grad_norm": 1.380349679623337, "learning_rate": 9.932489217896154e-06, "loss": 0.1909, "step": 1106 }, { "epoch": 0.4424920127795527, "grad_norm": 1.4203485606081072, "learning_rate": 9.931725975010647e-06, "loss": 0.1934, "step": 1108 }, { "epoch": 0.44329073482428116, "grad_norm": 1.4039574653298539, "learning_rate": 9.930958471579212e-06, "loss": 0.2137, "step": 1110 }, { "epoch": 0.4440894568690096, "grad_norm": 1.4554296915268317, "learning_rate": 9.930186708264902e-06, "loss": 0.2081, "step": 1112 }, { "epoch": 0.444888178913738, "grad_norm": 1.5862456773052678, "learning_rate": 9.929410685734446e-06, "loss": 0.2074, "step": 1114 }, { "epoch": 0.44568690095846647, "grad_norm": 1.371290557948402, "learning_rate": 9.928630404658255e-06, "loss": 0.1934, "step": 1116 }, { "epoch": 0.4464856230031949, "grad_norm": 1.5523688713967079, "learning_rate": 9.92784586571042e-06, "loss": 0.2114, "step": 1118 }, { "epoch": 0.4472843450479233, "grad_norm": 1.5913885443768627, "learning_rate": 9.927057069568704e-06, "loss": 0.197, "step": 1120 }, { "epoch": 0.4480830670926518, "grad_norm": 1.376586499393768, "learning_rate": 9.926264016914555e-06, "loss": 0.2114, "step": 1122 }, { "epoch": 0.4488817891373802, "grad_norm": 1.3604215439899947, "learning_rate": 9.925466708433097e-06, "loss": 0.1949, "step": 1124 }, { "epoch": 0.4496805111821086, "grad_norm": 1.4064368128907085, "learning_rate": 9.924665144813128e-06, "loss": 0.1876, "step": 1126 }, { "epoch": 0.4504792332268371, "grad_norm": 1.4493555350796723, "learning_rate": 9.923859326747125e-06, "loss": 0.166, "step": 1128 }, { "epoch": 0.4512779552715655, "grad_norm": 1.3005030779658915, "learning_rate": 9.923049254931235e-06, "loss": 0.179, "step": 1130 }, { "epoch": 0.4520766773162939, "grad_norm": 1.301603097338869, "learning_rate": 9.922234930065286e-06, "loss": 0.1828, "step": 1132 }, { "epoch": 0.4528753993610224, "grad_norm": 1.424132465075419, "learning_rate": 9.921416352852779e-06, "loss": 0.2027, "step": 1134 }, { "epoch": 0.4536741214057508, "grad_norm": 1.5184245441350142, "learning_rate": 9.920593524000887e-06, "loss": 0.2163, "step": 1136 }, { "epoch": 0.4544728434504792, "grad_norm": 1.2649380282890057, "learning_rate": 9.919766444220454e-06, "loss": 0.1952, "step": 1138 }, { "epoch": 0.45527156549520764, "grad_norm": 1.3192846529035582, "learning_rate": 9.918935114226001e-06, "loss": 0.2147, "step": 1140 }, { "epoch": 0.4560702875399361, "grad_norm": 1.6014388375199928, "learning_rate": 9.91809953473572e-06, "loss": 0.209, "step": 1142 }, { "epoch": 0.45686900958466453, "grad_norm": 1.2763233267206437, "learning_rate": 9.917259706471469e-06, "loss": 0.1894, "step": 1144 }, { "epoch": 0.45766773162939295, "grad_norm": 1.355011355722016, "learning_rate": 9.916415630158782e-06, "loss": 0.2019, "step": 1146 }, { "epoch": 0.4584664536741214, "grad_norm": 1.420940628734508, "learning_rate": 9.915567306526863e-06, "loss": 0.2176, "step": 1148 }, { "epoch": 0.45926517571884984, "grad_norm": 1.3488193076357031, "learning_rate": 9.914714736308582e-06, "loss": 0.2032, "step": 1150 }, { "epoch": 0.46006389776357826, "grad_norm": 1.288799255340392, "learning_rate": 9.913857920240481e-06, "loss": 0.2077, "step": 1152 }, { "epoch": 0.46086261980830673, "grad_norm": 1.3786756620906035, "learning_rate": 9.912996859062764e-06, "loss": 0.2113, "step": 1154 }, { "epoch": 0.46166134185303515, "grad_norm": 1.50469580343903, "learning_rate": 9.912131553519313e-06, "loss": 0.2073, "step": 1156 }, { "epoch": 0.46246006389776356, "grad_norm": 1.4700159822947163, "learning_rate": 9.911262004357665e-06, "loss": 0.2053, "step": 1158 }, { "epoch": 0.46325878594249204, "grad_norm": 1.519205743263198, "learning_rate": 9.91038821232903e-06, "loss": 0.2217, "step": 1160 }, { "epoch": 0.46405750798722045, "grad_norm": 1.3708099691931215, "learning_rate": 9.909510178188281e-06, "loss": 0.1891, "step": 1162 }, { "epoch": 0.46485623003194887, "grad_norm": 1.2711919877902211, "learning_rate": 9.90862790269396e-06, "loss": 0.2003, "step": 1164 }, { "epoch": 0.46565495207667734, "grad_norm": 1.296959216697176, "learning_rate": 9.907741386608267e-06, "loss": 0.1895, "step": 1166 }, { "epoch": 0.46645367412140576, "grad_norm": 1.4988072467804332, "learning_rate": 9.906850630697068e-06, "loss": 0.229, "step": 1168 }, { "epoch": 0.4672523961661342, "grad_norm": 1.357985179422065, "learning_rate": 9.905955635729894e-06, "loss": 0.208, "step": 1170 }, { "epoch": 0.4680511182108626, "grad_norm": 1.3438998637885513, "learning_rate": 9.905056402479933e-06, "loss": 0.1809, "step": 1172 }, { "epoch": 0.46884984025559107, "grad_norm": 1.2383109161845713, "learning_rate": 9.904152931724043e-06, "loss": 0.1971, "step": 1174 }, { "epoch": 0.4696485623003195, "grad_norm": 1.5560174102928908, "learning_rate": 9.903245224242732e-06, "loss": 0.2032, "step": 1176 }, { "epoch": 0.4704472843450479, "grad_norm": 1.38347824637013, "learning_rate": 9.902333280820176e-06, "loss": 0.198, "step": 1178 }, { "epoch": 0.4712460063897764, "grad_norm": 1.4051889306019656, "learning_rate": 9.901417102244208e-06, "loss": 0.1987, "step": 1180 }, { "epoch": 0.4720447284345048, "grad_norm": 1.6159750486401823, "learning_rate": 9.90049668930632e-06, "loss": 0.2029, "step": 1182 }, { "epoch": 0.4728434504792332, "grad_norm": 1.5547809497795395, "learning_rate": 9.899572042801662e-06, "loss": 0.1937, "step": 1184 }, { "epoch": 0.4736421725239617, "grad_norm": 1.392059461645592, "learning_rate": 9.898643163529041e-06, "loss": 0.1783, "step": 1186 }, { "epoch": 0.4744408945686901, "grad_norm": 1.4227792657414569, "learning_rate": 9.89771005229092e-06, "loss": 0.1942, "step": 1188 }, { "epoch": 0.4752396166134185, "grad_norm": 1.525020829545986, "learning_rate": 9.89677270989342e-06, "loss": 0.2135, "step": 1190 }, { "epoch": 0.476038338658147, "grad_norm": 1.7374484802274244, "learning_rate": 9.895831137146319e-06, "loss": 0.2081, "step": 1192 }, { "epoch": 0.4768370607028754, "grad_norm": 1.3661453654617517, "learning_rate": 9.894885334863044e-06, "loss": 0.2055, "step": 1194 }, { "epoch": 0.4776357827476038, "grad_norm": 1.5602720293300305, "learning_rate": 9.893935303860677e-06, "loss": 0.2171, "step": 1196 }, { "epoch": 0.4784345047923323, "grad_norm": 3.1025231784672216, "learning_rate": 9.892981044959961e-06, "loss": 0.193, "step": 1198 }, { "epoch": 0.4792332268370607, "grad_norm": 1.262885402820455, "learning_rate": 9.89202255898528e-06, "loss": 0.1916, "step": 1200 }, { "epoch": 0.48003194888178913, "grad_norm": 1.381018034591307, "learning_rate": 9.891059846764679e-06, "loss": 0.1878, "step": 1202 }, { "epoch": 0.48083067092651754, "grad_norm": 1.5768051172665962, "learning_rate": 9.89009290912985e-06, "loss": 0.1967, "step": 1204 }, { "epoch": 0.481629392971246, "grad_norm": 1.463151710058678, "learning_rate": 9.889121746916132e-06, "loss": 0.2269, "step": 1206 }, { "epoch": 0.48242811501597443, "grad_norm": 1.34065053745843, "learning_rate": 9.888146360962523e-06, "loss": 0.1865, "step": 1208 }, { "epoch": 0.48322683706070285, "grad_norm": 1.6723543690494453, "learning_rate": 9.887166752111663e-06, "loss": 0.2129, "step": 1210 }, { "epoch": 0.4840255591054313, "grad_norm": 1.5794398784564423, "learning_rate": 9.88618292120984e-06, "loss": 0.2224, "step": 1212 }, { "epoch": 0.48482428115015974, "grad_norm": 1.450388456580997, "learning_rate": 9.88519486910699e-06, "loss": 0.1966, "step": 1214 }, { "epoch": 0.48562300319488816, "grad_norm": 1.7814239798658509, "learning_rate": 9.8842025966567e-06, "loss": 0.1904, "step": 1216 }, { "epoch": 0.48642172523961663, "grad_norm": 1.7391567788714228, "learning_rate": 9.883206104716198e-06, "loss": 0.2133, "step": 1218 }, { "epoch": 0.48722044728434505, "grad_norm": 1.442571651767735, "learning_rate": 9.882205394146362e-06, "loss": 0.1989, "step": 1220 }, { "epoch": 0.48801916932907347, "grad_norm": 1.4088522689829923, "learning_rate": 9.881200465811706e-06, "loss": 0.2175, "step": 1222 }, { "epoch": 0.48881789137380194, "grad_norm": 1.2413815231676442, "learning_rate": 9.880191320580396e-06, "loss": 0.1705, "step": 1224 }, { "epoch": 0.48961661341853036, "grad_norm": 1.4154117503900958, "learning_rate": 9.87917795932424e-06, "loss": 0.1817, "step": 1226 }, { "epoch": 0.4904153354632588, "grad_norm": 1.3393213535614317, "learning_rate": 9.878160382918685e-06, "loss": 0.2009, "step": 1228 }, { "epoch": 0.49121405750798725, "grad_norm": 1.5366573602302467, "learning_rate": 9.87713859224282e-06, "loss": 0.2339, "step": 1230 }, { "epoch": 0.49201277955271566, "grad_norm": 1.2826491297833846, "learning_rate": 9.876112588179378e-06, "loss": 0.1837, "step": 1232 }, { "epoch": 0.4928115015974441, "grad_norm": 1.4232335556523423, "learning_rate": 9.875082371614728e-06, "loss": 0.1985, "step": 1234 }, { "epoch": 0.4936102236421725, "grad_norm": 1.4944078372691827, "learning_rate": 9.874047943438879e-06, "loss": 0.1777, "step": 1236 }, { "epoch": 0.49440894568690097, "grad_norm": 1.3263567228958557, "learning_rate": 9.873009304545482e-06, "loss": 0.2043, "step": 1238 }, { "epoch": 0.4952076677316294, "grad_norm": 1.491665028254892, "learning_rate": 9.87196645583182e-06, "loss": 0.1995, "step": 1240 }, { "epoch": 0.4960063897763578, "grad_norm": 1.4711866133685396, "learning_rate": 9.870919398198819e-06, "loss": 0.2151, "step": 1242 }, { "epoch": 0.4968051118210863, "grad_norm": 1.3471791986545025, "learning_rate": 9.869868132551037e-06, "loss": 0.2301, "step": 1244 }, { "epoch": 0.4976038338658147, "grad_norm": 1.30305885026678, "learning_rate": 9.868812659796669e-06, "loss": 0.1955, "step": 1246 }, { "epoch": 0.4984025559105431, "grad_norm": 1.3754860324491927, "learning_rate": 9.86775298084754e-06, "loss": 0.2245, "step": 1248 }, { "epoch": 0.4992012779552716, "grad_norm": 1.2464730265182025, "learning_rate": 9.866689096619117e-06, "loss": 0.1811, "step": 1250 }, { "epoch": 0.5, "grad_norm": 1.3277366908866204, "learning_rate": 9.865621008030492e-06, "loss": 0.1916, "step": 1252 }, { "epoch": 0.5007987220447284, "grad_norm": 1.4195649241005022, "learning_rate": 9.864548716004399e-06, "loss": 0.1932, "step": 1254 }, { "epoch": 0.5015974440894568, "grad_norm": 1.4925132735569981, "learning_rate": 9.863472221467189e-06, "loss": 0.1938, "step": 1256 }, { "epoch": 0.5023961661341853, "grad_norm": 1.2979288287029658, "learning_rate": 9.862391525348856e-06, "loss": 0.1877, "step": 1258 }, { "epoch": 0.5031948881789138, "grad_norm": 1.3864119857396864, "learning_rate": 9.861306628583021e-06, "loss": 0.1958, "step": 1260 }, { "epoch": 0.5039936102236422, "grad_norm": 1.5118691478458204, "learning_rate": 9.86021753210693e-06, "loss": 0.2145, "step": 1262 }, { "epoch": 0.5047923322683706, "grad_norm": 1.3753508468091324, "learning_rate": 9.85912423686146e-06, "loss": 0.202, "step": 1264 }, { "epoch": 0.505591054313099, "grad_norm": 1.4184249679671, "learning_rate": 9.858026743791114e-06, "loss": 0.2002, "step": 1266 }, { "epoch": 0.5063897763578274, "grad_norm": 1.423875732058238, "learning_rate": 9.856925053844025e-06, "loss": 0.2118, "step": 1268 }, { "epoch": 0.5071884984025559, "grad_norm": 1.4222433141825608, "learning_rate": 9.855819167971946e-06, "loss": 0.1782, "step": 1270 }, { "epoch": 0.5079872204472844, "grad_norm": 1.3607705696064714, "learning_rate": 9.854709087130261e-06, "loss": 0.2206, "step": 1272 }, { "epoch": 0.5087859424920128, "grad_norm": 1.4114769617291796, "learning_rate": 9.853594812277973e-06, "loss": 0.214, "step": 1274 }, { "epoch": 0.5095846645367412, "grad_norm": 1.5913052917009076, "learning_rate": 9.852476344377713e-06, "loss": 0.2109, "step": 1276 }, { "epoch": 0.5103833865814696, "grad_norm": 1.3774331396577875, "learning_rate": 9.851353684395728e-06, "loss": 0.2027, "step": 1278 }, { "epoch": 0.5111821086261981, "grad_norm": 1.2773430420906078, "learning_rate": 9.850226833301893e-06, "loss": 0.1985, "step": 1280 }, { "epoch": 0.5119808306709265, "grad_norm": 1.6292303107969242, "learning_rate": 9.849095792069701e-06, "loss": 0.2125, "step": 1282 }, { "epoch": 0.512779552715655, "grad_norm": 1.4215039291593559, "learning_rate": 9.847960561676263e-06, "loss": 0.2037, "step": 1284 }, { "epoch": 0.5135782747603834, "grad_norm": 1.3067667391370512, "learning_rate": 9.846821143102313e-06, "loss": 0.1677, "step": 1286 }, { "epoch": 0.5143769968051118, "grad_norm": 1.2923555649063807, "learning_rate": 9.8456775373322e-06, "loss": 0.1866, "step": 1288 }, { "epoch": 0.5151757188498403, "grad_norm": 1.4943462008704147, "learning_rate": 9.844529745353892e-06, "loss": 0.2126, "step": 1290 }, { "epoch": 0.5159744408945687, "grad_norm": 1.5684625006318347, "learning_rate": 9.843377768158972e-06, "loss": 0.2084, "step": 1292 }, { "epoch": 0.5167731629392971, "grad_norm": 1.4914103834569956, "learning_rate": 9.84222160674264e-06, "loss": 0.2153, "step": 1294 }, { "epoch": 0.5175718849840255, "grad_norm": 1.328585368359251, "learning_rate": 9.841061262103713e-06, "loss": 0.1919, "step": 1296 }, { "epoch": 0.518370607028754, "grad_norm": 1.3701342649025663, "learning_rate": 9.839896735244615e-06, "loss": 0.1981, "step": 1298 }, { "epoch": 0.5191693290734825, "grad_norm": 1.5701845518180864, "learning_rate": 9.83872802717139e-06, "loss": 0.1849, "step": 1300 }, { "epoch": 0.5199680511182109, "grad_norm": 1.3075421535239862, "learning_rate": 9.83755513889369e-06, "loss": 0.1918, "step": 1302 }, { "epoch": 0.5207667731629393, "grad_norm": 1.5057769635536875, "learning_rate": 9.836378071424782e-06, "loss": 0.2218, "step": 1304 }, { "epoch": 0.5215654952076677, "grad_norm": 1.3361043392598404, "learning_rate": 9.835196825781539e-06, "loss": 0.1998, "step": 1306 }, { "epoch": 0.5223642172523961, "grad_norm": 1.3294074227646568, "learning_rate": 9.834011402984447e-06, "loss": 0.1885, "step": 1308 }, { "epoch": 0.5231629392971247, "grad_norm": 1.5233983111587897, "learning_rate": 9.8328218040576e-06, "loss": 0.2051, "step": 1310 }, { "epoch": 0.5239616613418531, "grad_norm": 1.263265485856387, "learning_rate": 9.831628030028698e-06, "loss": 0.1898, "step": 1312 }, { "epoch": 0.5247603833865815, "grad_norm": 1.3267088643171874, "learning_rate": 9.830430081929047e-06, "loss": 0.1816, "step": 1314 }, { "epoch": 0.5255591054313099, "grad_norm": 1.4118242503209621, "learning_rate": 9.829227960793566e-06, "loss": 0.2206, "step": 1316 }, { "epoch": 0.5263578274760383, "grad_norm": 1.2848755780312735, "learning_rate": 9.82802166766077e-06, "loss": 0.191, "step": 1318 }, { "epoch": 0.5271565495207667, "grad_norm": 1.6025918801894552, "learning_rate": 9.826811203572785e-06, "loss": 0.2231, "step": 1320 }, { "epoch": 0.5279552715654952, "grad_norm": 1.3694781938576936, "learning_rate": 9.82559656957534e-06, "loss": 0.2133, "step": 1322 }, { "epoch": 0.5287539936102237, "grad_norm": 1.5702346185266045, "learning_rate": 9.824377766717758e-06, "loss": 0.204, "step": 1324 }, { "epoch": 0.5295527156549521, "grad_norm": 1.4119518722483408, "learning_rate": 9.823154796052974e-06, "loss": 0.1944, "step": 1326 }, { "epoch": 0.5303514376996805, "grad_norm": 1.2510934475746587, "learning_rate": 9.821927658637518e-06, "loss": 0.1901, "step": 1328 }, { "epoch": 0.5311501597444089, "grad_norm": 1.23475379515121, "learning_rate": 9.82069635553152e-06, "loss": 0.179, "step": 1330 }, { "epoch": 0.5319488817891374, "grad_norm": 1.3831688689107673, "learning_rate": 9.819460887798714e-06, "loss": 0.1854, "step": 1332 }, { "epoch": 0.5327476038338658, "grad_norm": 1.495164379191231, "learning_rate": 9.818221256506421e-06, "loss": 0.2067, "step": 1334 }, { "epoch": 0.5335463258785943, "grad_norm": 1.4669136010120607, "learning_rate": 9.81697746272557e-06, "loss": 0.2182, "step": 1336 }, { "epoch": 0.5343450479233227, "grad_norm": 1.3093260674635532, "learning_rate": 9.81572950753068e-06, "loss": 0.2077, "step": 1338 }, { "epoch": 0.5351437699680511, "grad_norm": 1.4203515470016688, "learning_rate": 9.814477391999868e-06, "loss": 0.1832, "step": 1340 }, { "epoch": 0.5359424920127795, "grad_norm": 1.2863505009713285, "learning_rate": 9.813221117214842e-06, "loss": 0.2022, "step": 1342 }, { "epoch": 0.536741214057508, "grad_norm": 1.2084177020870481, "learning_rate": 9.811960684260907e-06, "loss": 0.1963, "step": 1344 }, { "epoch": 0.5375399361022364, "grad_norm": 1.3072664824588964, "learning_rate": 9.810696094226952e-06, "loss": 0.2205, "step": 1346 }, { "epoch": 0.5383386581469649, "grad_norm": 1.1614780192043668, "learning_rate": 9.809427348205472e-06, "loss": 0.1875, "step": 1348 }, { "epoch": 0.5391373801916933, "grad_norm": 1.3175352444594506, "learning_rate": 9.808154447292539e-06, "loss": 0.185, "step": 1350 }, { "epoch": 0.5399361022364217, "grad_norm": 1.2834953987740787, "learning_rate": 9.80687739258782e-06, "loss": 0.1924, "step": 1352 }, { "epoch": 0.5407348242811502, "grad_norm": 1.3409325351626424, "learning_rate": 9.805596185194571e-06, "loss": 0.206, "step": 1354 }, { "epoch": 0.5415335463258786, "grad_norm": 1.367209463101786, "learning_rate": 9.804310826219633e-06, "loss": 0.2099, "step": 1356 }, { "epoch": 0.542332268370607, "grad_norm": 1.2819870163156417, "learning_rate": 9.803021316773434e-06, "loss": 0.1881, "step": 1358 }, { "epoch": 0.5431309904153354, "grad_norm": 1.3214326903766342, "learning_rate": 9.801727657969988e-06, "loss": 0.1964, "step": 1360 }, { "epoch": 0.5439297124600639, "grad_norm": 1.1833472018926297, "learning_rate": 9.800429850926898e-06, "loss": 0.1836, "step": 1362 }, { "epoch": 0.5447284345047924, "grad_norm": 1.4748511681523404, "learning_rate": 9.799127896765346e-06, "loss": 0.1772, "step": 1364 }, { "epoch": 0.5455271565495208, "grad_norm": 1.2243926123531605, "learning_rate": 9.797821796610094e-06, "loss": 0.2206, "step": 1366 }, { "epoch": 0.5463258785942492, "grad_norm": 1.292732092241605, "learning_rate": 9.796511551589492e-06, "loss": 0.1888, "step": 1368 }, { "epoch": 0.5471246006389776, "grad_norm": 1.348610309652153, "learning_rate": 9.795197162835468e-06, "loss": 0.2045, "step": 1370 }, { "epoch": 0.547923322683706, "grad_norm": 1.2828128744633993, "learning_rate": 9.79387863148353e-06, "loss": 0.1785, "step": 1372 }, { "epoch": 0.5487220447284346, "grad_norm": 1.369286359401732, "learning_rate": 9.792555958672762e-06, "loss": 0.1839, "step": 1374 }, { "epoch": 0.549520766773163, "grad_norm": 1.4014364965242936, "learning_rate": 9.791229145545832e-06, "loss": 0.1972, "step": 1376 }, { "epoch": 0.5503194888178914, "grad_norm": 1.465996725744811, "learning_rate": 9.789898193248978e-06, "loss": 0.1957, "step": 1378 }, { "epoch": 0.5511182108626198, "grad_norm": 1.5171558530092926, "learning_rate": 9.788563102932023e-06, "loss": 0.1966, "step": 1380 }, { "epoch": 0.5519169329073482, "grad_norm": 1.3311267145287, "learning_rate": 9.787223875748353e-06, "loss": 0.2064, "step": 1382 }, { "epoch": 0.5527156549520766, "grad_norm": 1.3066784419896205, "learning_rate": 9.785880512854937e-06, "loss": 0.1853, "step": 1384 }, { "epoch": 0.5535143769968051, "grad_norm": 1.2390174296827288, "learning_rate": 9.784533015412311e-06, "loss": 0.1991, "step": 1386 }, { "epoch": 0.5543130990415336, "grad_norm": 1.306300904073364, "learning_rate": 9.78318138458459e-06, "loss": 0.1946, "step": 1388 }, { "epoch": 0.555111821086262, "grad_norm": 1.2680623158657511, "learning_rate": 9.781825621539451e-06, "loss": 0.1961, "step": 1390 }, { "epoch": 0.5559105431309904, "grad_norm": 1.2795990600690346, "learning_rate": 9.78046572744815e-06, "loss": 0.1852, "step": 1392 }, { "epoch": 0.5567092651757188, "grad_norm": 1.4140574609807046, "learning_rate": 9.779101703485503e-06, "loss": 0.1915, "step": 1394 }, { "epoch": 0.5575079872204473, "grad_norm": 1.1983648174489345, "learning_rate": 9.7777335508299e-06, "loss": 0.1812, "step": 1396 }, { "epoch": 0.5583067092651757, "grad_norm": 1.2309594478518777, "learning_rate": 9.776361270663295e-06, "loss": 0.1861, "step": 1398 }, { "epoch": 0.5591054313099042, "grad_norm": 1.1982821972463766, "learning_rate": 9.77498486417121e-06, "loss": 0.1713, "step": 1400 }, { "epoch": 0.5599041533546326, "grad_norm": 1.4024839662731965, "learning_rate": 9.77360433254273e-06, "loss": 0.2106, "step": 1402 }, { "epoch": 0.560702875399361, "grad_norm": 1.2213412954218998, "learning_rate": 9.772219676970502e-06, "loss": 0.1693, "step": 1404 }, { "epoch": 0.5615015974440895, "grad_norm": 1.4583671258013307, "learning_rate": 9.770830898650739e-06, "loss": 0.2006, "step": 1406 }, { "epoch": 0.5623003194888179, "grad_norm": 1.2563510565936002, "learning_rate": 9.769437998783216e-06, "loss": 0.1862, "step": 1408 }, { "epoch": 0.5630990415335463, "grad_norm": 1.314486055157245, "learning_rate": 9.768040978571265e-06, "loss": 0.1755, "step": 1410 }, { "epoch": 0.5638977635782748, "grad_norm": 1.3853554706786548, "learning_rate": 9.76663983922178e-06, "loss": 0.2031, "step": 1412 }, { "epoch": 0.5646964856230032, "grad_norm": 1.2878686521894571, "learning_rate": 9.765234581945215e-06, "loss": 0.2123, "step": 1414 }, { "epoch": 0.5654952076677316, "grad_norm": 1.252378097297008, "learning_rate": 9.763825207955577e-06, "loss": 0.1935, "step": 1416 }, { "epoch": 0.5662939297124601, "grad_norm": 1.213815586126145, "learning_rate": 9.762411718470434e-06, "loss": 0.1658, "step": 1418 }, { "epoch": 0.5670926517571885, "grad_norm": 1.3949685159556597, "learning_rate": 9.760994114710906e-06, "loss": 0.2114, "step": 1420 }, { "epoch": 0.5678913738019169, "grad_norm": 1.2817914821838783, "learning_rate": 9.759572397901671e-06, "loss": 0.1767, "step": 1422 }, { "epoch": 0.5686900958466453, "grad_norm": 1.4145245969611204, "learning_rate": 9.758146569270957e-06, "loss": 0.1986, "step": 1424 }, { "epoch": 0.5694888178913738, "grad_norm": 1.2826653603852296, "learning_rate": 9.756716630050546e-06, "loss": 0.1854, "step": 1426 }, { "epoch": 0.5702875399361023, "grad_norm": 1.2441705422611444, "learning_rate": 9.755282581475769e-06, "loss": 0.1923, "step": 1428 }, { "epoch": 0.5710862619808307, "grad_norm": 1.4151560859787753, "learning_rate": 9.75384442478551e-06, "loss": 0.2145, "step": 1430 }, { "epoch": 0.5718849840255591, "grad_norm": 1.2387933950813024, "learning_rate": 9.7524021612222e-06, "loss": 0.1804, "step": 1432 }, { "epoch": 0.5726837060702875, "grad_norm": 1.395550176337113, "learning_rate": 9.75095579203182e-06, "loss": 0.203, "step": 1434 }, { "epoch": 0.5734824281150159, "grad_norm": 1.4116044575628213, "learning_rate": 9.749505318463894e-06, "loss": 0.2111, "step": 1436 }, { "epoch": 0.5742811501597445, "grad_norm": 1.2825403647392892, "learning_rate": 9.748050741771498e-06, "loss": 0.1891, "step": 1438 }, { "epoch": 0.5750798722044729, "grad_norm": 1.3389504388761955, "learning_rate": 9.746592063211247e-06, "loss": 0.1989, "step": 1440 }, { "epoch": 0.5758785942492013, "grad_norm": 1.2944657988503898, "learning_rate": 9.7451292840433e-06, "loss": 0.1986, "step": 1442 }, { "epoch": 0.5766773162939297, "grad_norm": 1.373583553210306, "learning_rate": 9.743662405531361e-06, "loss": 0.1896, "step": 1444 }, { "epoch": 0.5774760383386581, "grad_norm": 1.3216412289151287, "learning_rate": 9.742191428942677e-06, "loss": 0.1841, "step": 1446 }, { "epoch": 0.5782747603833865, "grad_norm": 1.4646317495074423, "learning_rate": 9.74071635554803e-06, "loss": 0.1987, "step": 1448 }, { "epoch": 0.579073482428115, "grad_norm": 1.3041923646858569, "learning_rate": 9.739237186621747e-06, "loss": 0.1825, "step": 1450 }, { "epoch": 0.5798722044728435, "grad_norm": 1.3593275898591164, "learning_rate": 9.737753923441689e-06, "loss": 0.1859, "step": 1452 }, { "epoch": 0.5806709265175719, "grad_norm": 1.2580533184275235, "learning_rate": 9.736266567289255e-06, "loss": 0.1969, "step": 1454 }, { "epoch": 0.5814696485623003, "grad_norm": 1.325474938146526, "learning_rate": 9.73477511944938e-06, "loss": 0.2043, "step": 1456 }, { "epoch": 0.5822683706070287, "grad_norm": 1.3507712058043326, "learning_rate": 9.733279581210535e-06, "loss": 0.1979, "step": 1458 }, { "epoch": 0.5830670926517572, "grad_norm": 1.2160048011358895, "learning_rate": 9.731779953864725e-06, "loss": 0.197, "step": 1460 }, { "epoch": 0.5838658146964856, "grad_norm": 1.3331470604472404, "learning_rate": 9.730276238707486e-06, "loss": 0.1808, "step": 1462 }, { "epoch": 0.5846645367412141, "grad_norm": 1.2875278440764244, "learning_rate": 9.728768437037882e-06, "loss": 0.1728, "step": 1464 }, { "epoch": 0.5854632587859425, "grad_norm": 1.3527080702439602, "learning_rate": 9.72725655015852e-06, "loss": 0.1979, "step": 1466 }, { "epoch": 0.5862619808306709, "grad_norm": 1.2097235904416872, "learning_rate": 9.725740579375518e-06, "loss": 0.1863, "step": 1468 }, { "epoch": 0.5870607028753994, "grad_norm": 1.3817835609057318, "learning_rate": 9.724220525998538e-06, "loss": 0.1966, "step": 1470 }, { "epoch": 0.5878594249201278, "grad_norm": 1.374598787526082, "learning_rate": 9.722696391340762e-06, "loss": 0.2084, "step": 1472 }, { "epoch": 0.5886581469648562, "grad_norm": 1.1682542939498242, "learning_rate": 9.721168176718896e-06, "loss": 0.1819, "step": 1474 }, { "epoch": 0.5894568690095847, "grad_norm": 1.3405797462449764, "learning_rate": 9.719635883453175e-06, "loss": 0.2133, "step": 1476 }, { "epoch": 0.5902555910543131, "grad_norm": 1.3482283589788477, "learning_rate": 9.718099512867355e-06, "loss": 0.1737, "step": 1478 }, { "epoch": 0.5910543130990416, "grad_norm": 1.3206877696300103, "learning_rate": 9.716559066288716e-06, "loss": 0.1913, "step": 1480 }, { "epoch": 0.59185303514377, "grad_norm": 1.3089066804848661, "learning_rate": 9.715014545048059e-06, "loss": 0.189, "step": 1482 }, { "epoch": 0.5926517571884984, "grad_norm": 1.1772602990825523, "learning_rate": 9.713465950479704e-06, "loss": 0.1634, "step": 1484 }, { "epoch": 0.5934504792332268, "grad_norm": 1.6415957840833397, "learning_rate": 9.711913283921488e-06, "loss": 0.1986, "step": 1486 }, { "epoch": 0.5942492012779552, "grad_norm": 1.3340911692430844, "learning_rate": 9.710356546714774e-06, "loss": 0.1762, "step": 1488 }, { "epoch": 0.5950479233226837, "grad_norm": 1.19131607954315, "learning_rate": 9.708795740204431e-06, "loss": 0.1807, "step": 1490 }, { "epoch": 0.5958466453674122, "grad_norm": 1.377988455402913, "learning_rate": 9.70723086573885e-06, "loss": 0.1939, "step": 1492 }, { "epoch": 0.5966453674121406, "grad_norm": 1.25358131516604, "learning_rate": 9.705661924669937e-06, "loss": 0.1792, "step": 1494 }, { "epoch": 0.597444089456869, "grad_norm": 1.2404137677794904, "learning_rate": 9.704088918353108e-06, "loss": 0.1687, "step": 1496 }, { "epoch": 0.5982428115015974, "grad_norm": 1.2604619975409577, "learning_rate": 9.70251184814729e-06, "loss": 0.1939, "step": 1498 }, { "epoch": 0.5990415335463258, "grad_norm": 1.363988707372948, "learning_rate": 9.700930715414923e-06, "loss": 0.1931, "step": 1500 }, { "epoch": 0.5990415335463258, "eval_loss": 0.1750628650188446, "eval_runtime": 416.5473, "eval_samples_per_second": 42.749, "eval_steps_per_second": 5.344, "step": 1500 }, { "epoch": 0.5998402555910544, "grad_norm": 1.5877690610897348, "learning_rate": 9.69934552152196e-06, "loss": 0.1771, "step": 1502 }, { "epoch": 0.6006389776357828, "grad_norm": 1.3591700844899557, "learning_rate": 9.697756267837856e-06, "loss": 0.2061, "step": 1504 }, { "epoch": 0.6014376996805112, "grad_norm": 1.7943521151008524, "learning_rate": 9.696162955735577e-06, "loss": 0.1918, "step": 1506 }, { "epoch": 0.6022364217252396, "grad_norm": 1.3757909830144823, "learning_rate": 9.694565586591595e-06, "loss": 0.1972, "step": 1508 }, { "epoch": 0.603035143769968, "grad_norm": 1.3887040239577928, "learning_rate": 9.692964161785885e-06, "loss": 0.1861, "step": 1510 }, { "epoch": 0.6038338658146964, "grad_norm": 1.1809219692093975, "learning_rate": 9.691358682701927e-06, "loss": 0.1718, "step": 1512 }, { "epoch": 0.604632587859425, "grad_norm": 1.3982270520799365, "learning_rate": 9.689749150726705e-06, "loss": 0.1747, "step": 1514 }, { "epoch": 0.6054313099041534, "grad_norm": 1.3379960842286045, "learning_rate": 9.688135567250701e-06, "loss": 0.1977, "step": 1516 }, { "epoch": 0.6062300319488818, "grad_norm": 1.3650657458967188, "learning_rate": 9.6865179336679e-06, "loss": 0.234, "step": 1518 }, { "epoch": 0.6070287539936102, "grad_norm": 1.220639149821231, "learning_rate": 9.684896251375784e-06, "loss": 0.1832, "step": 1520 }, { "epoch": 0.6078274760383386, "grad_norm": 1.3205908355697185, "learning_rate": 9.683270521775334e-06, "loss": 0.1877, "step": 1522 }, { "epoch": 0.6086261980830671, "grad_norm": 1.1383918655715137, "learning_rate": 9.681640746271026e-06, "loss": 0.1849, "step": 1524 }, { "epoch": 0.6094249201277955, "grad_norm": 1.2899856170845478, "learning_rate": 9.680006926270833e-06, "loss": 0.1879, "step": 1526 }, { "epoch": 0.610223642172524, "grad_norm": 1.3318559141238482, "learning_rate": 9.678369063186224e-06, "loss": 0.197, "step": 1528 }, { "epoch": 0.6110223642172524, "grad_norm": 1.1812228890278815, "learning_rate": 9.676727158432153e-06, "loss": 0.1913, "step": 1530 }, { "epoch": 0.6118210862619808, "grad_norm": 1.4859051313262472, "learning_rate": 9.675081213427076e-06, "loss": 0.2113, "step": 1532 }, { "epoch": 0.6126198083067093, "grad_norm": 1.297766937816727, "learning_rate": 9.673431229592928e-06, "loss": 0.1929, "step": 1534 }, { "epoch": 0.6134185303514377, "grad_norm": 1.2317306535256243, "learning_rate": 9.671777208355146e-06, "loss": 0.2108, "step": 1536 }, { "epoch": 0.6142172523961661, "grad_norm": 1.2840356619037692, "learning_rate": 9.670119151142644e-06, "loss": 0.1853, "step": 1538 }, { "epoch": 0.6150159744408946, "grad_norm": 1.2450276467812322, "learning_rate": 9.668457059387828e-06, "loss": 0.2088, "step": 1540 }, { "epoch": 0.615814696485623, "grad_norm": 1.3269269430845416, "learning_rate": 9.66679093452659e-06, "loss": 0.2082, "step": 1542 }, { "epoch": 0.6166134185303515, "grad_norm": 1.1360832482053789, "learning_rate": 9.665120777998303e-06, "loss": 0.1941, "step": 1544 }, { "epoch": 0.6174121405750799, "grad_norm": 1.2899083518509165, "learning_rate": 9.663446591245825e-06, "loss": 0.1761, "step": 1546 }, { "epoch": 0.6182108626198083, "grad_norm": 1.352813774180764, "learning_rate": 9.661768375715493e-06, "loss": 0.1922, "step": 1548 }, { "epoch": 0.6190095846645367, "grad_norm": 1.2861565795443108, "learning_rate": 9.660086132857132e-06, "loss": 0.2052, "step": 1550 }, { "epoch": 0.6198083067092651, "grad_norm": 1.5903917159408412, "learning_rate": 9.658399864124037e-06, "loss": 0.2057, "step": 1552 }, { "epoch": 0.6206070287539937, "grad_norm": 1.30967345968437, "learning_rate": 9.656709570972987e-06, "loss": 0.1925, "step": 1554 }, { "epoch": 0.6214057507987221, "grad_norm": 1.328489492989945, "learning_rate": 9.655015254864236e-06, "loss": 0.204, "step": 1556 }, { "epoch": 0.6222044728434505, "grad_norm": 1.6209499481494345, "learning_rate": 9.653316917261511e-06, "loss": 0.2084, "step": 1558 }, { "epoch": 0.6230031948881789, "grad_norm": 1.40907634938526, "learning_rate": 9.65161455963202e-06, "loss": 0.2028, "step": 1560 }, { "epoch": 0.6238019169329073, "grad_norm": 1.1340460294386678, "learning_rate": 9.649908183446432e-06, "loss": 0.1703, "step": 1562 }, { "epoch": 0.6246006389776357, "grad_norm": 1.2652621838230012, "learning_rate": 9.648197790178902e-06, "loss": 0.1775, "step": 1564 }, { "epoch": 0.6253993610223643, "grad_norm": 1.1637259399366224, "learning_rate": 9.646483381307047e-06, "loss": 0.1786, "step": 1566 }, { "epoch": 0.6261980830670927, "grad_norm": 1.3590258772922685, "learning_rate": 9.64476495831195e-06, "loss": 0.1957, "step": 1568 }, { "epoch": 0.6269968051118211, "grad_norm": 1.5133394068843713, "learning_rate": 9.643042522678172e-06, "loss": 0.2086, "step": 1570 }, { "epoch": 0.6277955271565495, "grad_norm": 1.5150111988589272, "learning_rate": 9.641316075893731e-06, "loss": 0.2025, "step": 1572 }, { "epoch": 0.6285942492012779, "grad_norm": 1.2465479067467933, "learning_rate": 9.639585619450116e-06, "loss": 0.1887, "step": 1574 }, { "epoch": 0.6293929712460063, "grad_norm": 1.2608367845246782, "learning_rate": 9.637851154842279e-06, "loss": 0.1908, "step": 1576 }, { "epoch": 0.6301916932907349, "grad_norm": 1.228578420129268, "learning_rate": 9.636112683568633e-06, "loss": 0.182, "step": 1578 }, { "epoch": 0.6309904153354633, "grad_norm": 1.2794906854021242, "learning_rate": 9.63437020713105e-06, "loss": 0.1891, "step": 1580 }, { "epoch": 0.6317891373801917, "grad_norm": 1.2079726286437402, "learning_rate": 9.632623727034868e-06, "loss": 0.1918, "step": 1582 }, { "epoch": 0.6325878594249201, "grad_norm": 1.4507684848320332, "learning_rate": 9.630873244788884e-06, "loss": 0.1928, "step": 1584 }, { "epoch": 0.6333865814696485, "grad_norm": 1.3354253708604187, "learning_rate": 9.629118761905343e-06, "loss": 0.2098, "step": 1586 }, { "epoch": 0.634185303514377, "grad_norm": 1.2516888814265885, "learning_rate": 9.627360279899958e-06, "loss": 0.1941, "step": 1588 }, { "epoch": 0.6349840255591054, "grad_norm": 1.193887849956848, "learning_rate": 9.62559780029189e-06, "loss": 0.1811, "step": 1590 }, { "epoch": 0.6357827476038339, "grad_norm": 1.2324593768790522, "learning_rate": 9.623831324603755e-06, "loss": 0.1896, "step": 1592 }, { "epoch": 0.6365814696485623, "grad_norm": 1.252327759159766, "learning_rate": 9.62206085436162e-06, "loss": 0.2004, "step": 1594 }, { "epoch": 0.6373801916932907, "grad_norm": 1.375448278876851, "learning_rate": 9.620286391095004e-06, "loss": 0.2213, "step": 1596 }, { "epoch": 0.6381789137380192, "grad_norm": 1.2928127323852008, "learning_rate": 9.618507936336878e-06, "loss": 0.184, "step": 1598 }, { "epoch": 0.6389776357827476, "grad_norm": 1.292316256258317, "learning_rate": 9.61672549162366e-06, "loss": 0.1974, "step": 1600 }, { "epoch": 0.639776357827476, "grad_norm": 1.2009672868034973, "learning_rate": 9.61493905849521e-06, "loss": 0.1926, "step": 1602 }, { "epoch": 0.6405750798722045, "grad_norm": 1.1528776664831937, "learning_rate": 9.61314863849484e-06, "loss": 0.1781, "step": 1604 }, { "epoch": 0.6413738019169329, "grad_norm": 1.228036340284955, "learning_rate": 9.611354233169305e-06, "loss": 0.1712, "step": 1606 }, { "epoch": 0.6421725239616614, "grad_norm": 1.2207441407251327, "learning_rate": 9.6095558440688e-06, "loss": 0.187, "step": 1608 }, { "epoch": 0.6429712460063898, "grad_norm": 1.2828242186591854, "learning_rate": 9.607753472746967e-06, "loss": 0.1847, "step": 1610 }, { "epoch": 0.6437699680511182, "grad_norm": 1.147124947259627, "learning_rate": 9.605947120760878e-06, "loss": 0.1555, "step": 1612 }, { "epoch": 0.6445686900958466, "grad_norm": 1.1939260367535258, "learning_rate": 9.604136789671056e-06, "loss": 0.2005, "step": 1614 }, { "epoch": 0.645367412140575, "grad_norm": 1.20685907752979, "learning_rate": 9.602322481041457e-06, "loss": 0.1927, "step": 1616 }, { "epoch": 0.6461661341853036, "grad_norm": 1.2690850828036355, "learning_rate": 9.600504196439468e-06, "loss": 0.1789, "step": 1618 }, { "epoch": 0.646964856230032, "grad_norm": 1.3543205528313849, "learning_rate": 9.59868193743592e-06, "loss": 0.22, "step": 1620 }, { "epoch": 0.6477635782747604, "grad_norm": 1.3669603475853995, "learning_rate": 9.596855705605069e-06, "loss": 0.1948, "step": 1622 }, { "epoch": 0.6485623003194888, "grad_norm": 1.3838109830343859, "learning_rate": 9.595025502524609e-06, "loss": 0.1992, "step": 1624 }, { "epoch": 0.6493610223642172, "grad_norm": 1.1502765370043713, "learning_rate": 9.593191329775663e-06, "loss": 0.1778, "step": 1626 }, { "epoch": 0.6501597444089456, "grad_norm": 1.2318516996914193, "learning_rate": 9.591353188942782e-06, "loss": 0.2073, "step": 1628 }, { "epoch": 0.6509584664536742, "grad_norm": 1.6179688640460086, "learning_rate": 9.589511081613947e-06, "loss": 0.2033, "step": 1630 }, { "epoch": 0.6517571884984026, "grad_norm": 1.1413033654758473, "learning_rate": 9.587665009380565e-06, "loss": 0.1859, "step": 1632 }, { "epoch": 0.652555910543131, "grad_norm": 1.2785775485264639, "learning_rate": 9.585814973837468e-06, "loss": 0.1959, "step": 1634 }, { "epoch": 0.6533546325878594, "grad_norm": 1.2901165952115163, "learning_rate": 9.583960976582914e-06, "loss": 0.1962, "step": 1636 }, { "epoch": 0.6541533546325878, "grad_norm": 1.2679330444702808, "learning_rate": 9.582103019218577e-06, "loss": 0.1907, "step": 1638 }, { "epoch": 0.6549520766773163, "grad_norm": 1.246571134661727, "learning_rate": 9.580241103349562e-06, "loss": 0.1712, "step": 1640 }, { "epoch": 0.6557507987220448, "grad_norm": 1.342636088102212, "learning_rate": 9.578375230584384e-06, "loss": 0.1789, "step": 1642 }, { "epoch": 0.6565495207667732, "grad_norm": 1.173038253799244, "learning_rate": 9.576505402534984e-06, "loss": 0.1717, "step": 1644 }, { "epoch": 0.6573482428115016, "grad_norm": 1.2394488250110955, "learning_rate": 9.574631620816718e-06, "loss": 0.186, "step": 1646 }, { "epoch": 0.65814696485623, "grad_norm": 1.4829342645126746, "learning_rate": 9.572753887048353e-06, "loss": 0.1965, "step": 1648 }, { "epoch": 0.6589456869009584, "grad_norm": 1.2666969584636023, "learning_rate": 9.570872202852077e-06, "loss": 0.181, "step": 1650 }, { "epoch": 0.6597444089456869, "grad_norm": 1.2265218244462477, "learning_rate": 9.568986569853487e-06, "loss": 0.1811, "step": 1652 }, { "epoch": 0.6605431309904153, "grad_norm": 1.391040746962543, "learning_rate": 9.56709698968159e-06, "loss": 0.1955, "step": 1654 }, { "epoch": 0.6613418530351438, "grad_norm": 1.2540103131383575, "learning_rate": 9.565203463968808e-06, "loss": 0.2158, "step": 1656 }, { "epoch": 0.6621405750798722, "grad_norm": 1.218580930627416, "learning_rate": 9.563305994350966e-06, "loss": 0.2075, "step": 1658 }, { "epoch": 0.6629392971246006, "grad_norm": 1.2969445843723761, "learning_rate": 9.5614045824673e-06, "loss": 0.174, "step": 1660 }, { "epoch": 0.6637380191693291, "grad_norm": 1.2080482907459895, "learning_rate": 9.55949922996045e-06, "loss": 0.1718, "step": 1662 }, { "epoch": 0.6645367412140575, "grad_norm": 1.3088014021283991, "learning_rate": 9.557589938476462e-06, "loss": 0.1958, "step": 1664 }, { "epoch": 0.6653354632587859, "grad_norm": 1.2483532241185216, "learning_rate": 9.555676709664783e-06, "loss": 0.1667, "step": 1666 }, { "epoch": 0.6661341853035144, "grad_norm": 1.4228507269979516, "learning_rate": 9.55375954517826e-06, "loss": 0.1887, "step": 1668 }, { "epoch": 0.6669329073482428, "grad_norm": 1.426968884001837, "learning_rate": 9.551838446673144e-06, "loss": 0.1733, "step": 1670 }, { "epoch": 0.6677316293929713, "grad_norm": 1.238164625762186, "learning_rate": 9.549913415809084e-06, "loss": 0.1909, "step": 1672 }, { "epoch": 0.6685303514376997, "grad_norm": 1.1947296500798315, "learning_rate": 9.547984454249125e-06, "loss": 0.19, "step": 1674 }, { "epoch": 0.6693290734824281, "grad_norm": 1.3096866424518478, "learning_rate": 9.546051563659704e-06, "loss": 0.1838, "step": 1676 }, { "epoch": 0.6701277955271565, "grad_norm": 1.2325979346576457, "learning_rate": 9.54411474571066e-06, "loss": 0.213, "step": 1678 }, { "epoch": 0.670926517571885, "grad_norm": 1.1989676471273152, "learning_rate": 9.542174002075221e-06, "loss": 0.1815, "step": 1680 }, { "epoch": 0.6717252396166135, "grad_norm": 1.3573812014117548, "learning_rate": 9.540229334430005e-06, "loss": 0.1897, "step": 1682 }, { "epoch": 0.6725239616613419, "grad_norm": 1.2474833071465166, "learning_rate": 9.53828074445502e-06, "loss": 0.1846, "step": 1684 }, { "epoch": 0.6733226837060703, "grad_norm": 1.2370853532394706, "learning_rate": 9.536328233833668e-06, "loss": 0.1926, "step": 1686 }, { "epoch": 0.6741214057507987, "grad_norm": 1.2326249171384345, "learning_rate": 9.534371804252727e-06, "loss": 0.1971, "step": 1688 }, { "epoch": 0.6749201277955271, "grad_norm": 1.2299339970845358, "learning_rate": 9.532411457402374e-06, "loss": 0.1806, "step": 1690 }, { "epoch": 0.6757188498402555, "grad_norm": 1.4027305493123479, "learning_rate": 9.530447194976164e-06, "loss": 0.1939, "step": 1692 }, { "epoch": 0.6765175718849841, "grad_norm": 1.7551361158569634, "learning_rate": 9.52847901867103e-06, "loss": 0.2109, "step": 1694 }, { "epoch": 0.6773162939297125, "grad_norm": 1.2968839868364699, "learning_rate": 9.526506930187294e-06, "loss": 0.1987, "step": 1696 }, { "epoch": 0.6781150159744409, "grad_norm": 1.4078086349831407, "learning_rate": 9.524530931228653e-06, "loss": 0.1989, "step": 1698 }, { "epoch": 0.6789137380191693, "grad_norm": 1.231887491149701, "learning_rate": 9.522551023502183e-06, "loss": 0.1718, "step": 1700 }, { "epoch": 0.6797124600638977, "grad_norm": 1.4088570515681005, "learning_rate": 9.520567208718337e-06, "loss": 0.1791, "step": 1702 }, { "epoch": 0.6805111821086262, "grad_norm": 1.4104721082557625, "learning_rate": 9.518579488590947e-06, "loss": 0.2051, "step": 1704 }, { "epoch": 0.6813099041533547, "grad_norm": 1.4204595182317563, "learning_rate": 9.516587864837213e-06, "loss": 0.2222, "step": 1706 }, { "epoch": 0.6821086261980831, "grad_norm": 1.3446005177751752, "learning_rate": 9.51459233917771e-06, "loss": 0.1923, "step": 1708 }, { "epoch": 0.6829073482428115, "grad_norm": 1.3177179723391306, "learning_rate": 9.512592913336385e-06, "loss": 0.1911, "step": 1710 }, { "epoch": 0.6837060702875399, "grad_norm": 1.3007833533147979, "learning_rate": 9.510589589040554e-06, "loss": 0.2003, "step": 1712 }, { "epoch": 0.6845047923322684, "grad_norm": 1.3084903604314289, "learning_rate": 9.508582368020897e-06, "loss": 0.2068, "step": 1714 }, { "epoch": 0.6853035143769968, "grad_norm": 1.2332676487219392, "learning_rate": 9.506571252011467e-06, "loss": 0.1809, "step": 1716 }, { "epoch": 0.6861022364217252, "grad_norm": 1.4206295803871527, "learning_rate": 9.504556242749677e-06, "loss": 0.1879, "step": 1718 }, { "epoch": 0.6869009584664537, "grad_norm": 1.4363115148587187, "learning_rate": 9.502537341976305e-06, "loss": 0.1955, "step": 1720 }, { "epoch": 0.6876996805111821, "grad_norm": 1.3048263552086317, "learning_rate": 9.500514551435491e-06, "loss": 0.2062, "step": 1722 }, { "epoch": 0.6884984025559105, "grad_norm": 1.4533257081032092, "learning_rate": 9.498487872874735e-06, "loss": 0.2065, "step": 1724 }, { "epoch": 0.689297124600639, "grad_norm": 1.2029574969239287, "learning_rate": 9.496457308044895e-06, "loss": 0.1848, "step": 1726 }, { "epoch": 0.6900958466453674, "grad_norm": 1.2074365689053979, "learning_rate": 9.494422858700188e-06, "loss": 0.1966, "step": 1728 }, { "epoch": 0.6908945686900958, "grad_norm": 1.2900526030749346, "learning_rate": 9.492384526598188e-06, "loss": 0.1982, "step": 1730 }, { "epoch": 0.6916932907348243, "grad_norm": 1.2246323016242, "learning_rate": 9.49034231349982e-06, "loss": 0.1981, "step": 1732 }, { "epoch": 0.6924920127795527, "grad_norm": 1.2646452794419145, "learning_rate": 9.488296221169363e-06, "loss": 0.1748, "step": 1734 }, { "epoch": 0.6932907348242812, "grad_norm": 1.2901816109550333, "learning_rate": 9.48624625137445e-06, "loss": 0.2003, "step": 1736 }, { "epoch": 0.6940894568690096, "grad_norm": 1.236520057555359, "learning_rate": 9.484192405886058e-06, "loss": 0.1943, "step": 1738 }, { "epoch": 0.694888178913738, "grad_norm": 1.3841072722578425, "learning_rate": 9.48213468647852e-06, "loss": 0.2006, "step": 1740 }, { "epoch": 0.6956869009584664, "grad_norm": 1.2637297258797011, "learning_rate": 9.480073094929507e-06, "loss": 0.1508, "step": 1742 }, { "epoch": 0.6964856230031949, "grad_norm": 1.3190153343742, "learning_rate": 9.478007633020043e-06, "loss": 0.1791, "step": 1744 }, { "epoch": 0.6972843450479234, "grad_norm": 1.2377929664861391, "learning_rate": 9.47593830253449e-06, "loss": 0.1919, "step": 1746 }, { "epoch": 0.6980830670926518, "grad_norm": 1.2951377520887053, "learning_rate": 9.473865105260556e-06, "loss": 0.1931, "step": 1748 }, { "epoch": 0.6988817891373802, "grad_norm": 1.203360302375175, "learning_rate": 9.471788042989285e-06, "loss": 0.1628, "step": 1750 }, { "epoch": 0.6996805111821086, "grad_norm": 1.3053684785835005, "learning_rate": 9.469707117515068e-06, "loss": 0.197, "step": 1752 }, { "epoch": 0.700479233226837, "grad_norm": 1.1639750404721112, "learning_rate": 9.467622330635622e-06, "loss": 0.1811, "step": 1754 }, { "epoch": 0.7012779552715654, "grad_norm": 1.3446647639072014, "learning_rate": 9.465533684152011e-06, "loss": 0.1931, "step": 1756 }, { "epoch": 0.702076677316294, "grad_norm": 1.336079168507431, "learning_rate": 9.463441179868626e-06, "loss": 0.1804, "step": 1758 }, { "epoch": 0.7028753993610224, "grad_norm": 1.210643436064521, "learning_rate": 9.461344819593194e-06, "loss": 0.1948, "step": 1760 }, { "epoch": 0.7036741214057508, "grad_norm": 1.2291459699844245, "learning_rate": 9.459244605136775e-06, "loss": 0.1854, "step": 1762 }, { "epoch": 0.7044728434504792, "grad_norm": 1.5941109492311214, "learning_rate": 9.45714053831375e-06, "loss": 0.2128, "step": 1764 }, { "epoch": 0.7052715654952076, "grad_norm": 1.1893124518881097, "learning_rate": 9.45503262094184e-06, "loss": 0.1735, "step": 1766 }, { "epoch": 0.7060702875399361, "grad_norm": 1.244259188061574, "learning_rate": 9.452920854842085e-06, "loss": 0.203, "step": 1768 }, { "epoch": 0.7068690095846646, "grad_norm": 1.1591101687705918, "learning_rate": 9.45080524183885e-06, "loss": 0.1931, "step": 1770 }, { "epoch": 0.707667731629393, "grad_norm": 1.185305488469298, "learning_rate": 9.448685783759825e-06, "loss": 0.1826, "step": 1772 }, { "epoch": 0.7084664536741214, "grad_norm": 1.3935670621941678, "learning_rate": 9.446562482436026e-06, "loss": 0.1882, "step": 1774 }, { "epoch": 0.7092651757188498, "grad_norm": 1.3143460322695941, "learning_rate": 9.44443533970178e-06, "loss": 0.1815, "step": 1776 }, { "epoch": 0.7100638977635783, "grad_norm": 1.3400068262690805, "learning_rate": 9.442304357394741e-06, "loss": 0.1968, "step": 1778 }, { "epoch": 0.7108626198083067, "grad_norm": 1.2627252602067203, "learning_rate": 9.440169537355874e-06, "loss": 0.1837, "step": 1780 }, { "epoch": 0.7116613418530351, "grad_norm": 1.1054955641627056, "learning_rate": 9.438030881429465e-06, "loss": 0.1603, "step": 1782 }, { "epoch": 0.7124600638977636, "grad_norm": 1.3680070238884574, "learning_rate": 9.435888391463108e-06, "loss": 0.1996, "step": 1784 }, { "epoch": 0.713258785942492, "grad_norm": 1.297397045088257, "learning_rate": 9.433742069307714e-06, "loss": 0.1987, "step": 1786 }, { "epoch": 0.7140575079872205, "grad_norm": 1.2477413014257834, "learning_rate": 9.431591916817503e-06, "loss": 0.1821, "step": 1788 }, { "epoch": 0.7148562300319489, "grad_norm": 1.3162258234912159, "learning_rate": 9.429437935850003e-06, "loss": 0.2119, "step": 1790 }, { "epoch": 0.7156549520766773, "grad_norm": 1.215098930469111, "learning_rate": 9.427280128266049e-06, "loss": 0.1842, "step": 1792 }, { "epoch": 0.7164536741214057, "grad_norm": 1.2140728281775635, "learning_rate": 9.425118495929788e-06, "loss": 0.1813, "step": 1794 }, { "epoch": 0.7172523961661342, "grad_norm": 1.0941868316938037, "learning_rate": 9.422953040708662e-06, "loss": 0.1836, "step": 1796 }, { "epoch": 0.7180511182108626, "grad_norm": 1.2213745174193513, "learning_rate": 9.420783764473418e-06, "loss": 0.1902, "step": 1798 }, { "epoch": 0.7188498402555911, "grad_norm": 1.1225238944421074, "learning_rate": 9.418610669098114e-06, "loss": 0.1924, "step": 1800 }, { "epoch": 0.7196485623003195, "grad_norm": 1.194219981117358, "learning_rate": 9.41643375646009e-06, "loss": 0.1901, "step": 1802 }, { "epoch": 0.7204472843450479, "grad_norm": 1.405501209107951, "learning_rate": 9.41425302844e-06, "loss": 0.1962, "step": 1804 }, { "epoch": 0.7212460063897763, "grad_norm": 1.2754008408470467, "learning_rate": 9.412068486921786e-06, "loss": 0.1981, "step": 1806 }, { "epoch": 0.7220447284345048, "grad_norm": 1.153012769564869, "learning_rate": 9.409880133792684e-06, "loss": 0.1601, "step": 1808 }, { "epoch": 0.7228434504792333, "grad_norm": 1.412330054040255, "learning_rate": 9.407687970943223e-06, "loss": 0.2015, "step": 1810 }, { "epoch": 0.7236421725239617, "grad_norm": 1.2512273615084581, "learning_rate": 9.405492000267228e-06, "loss": 0.162, "step": 1812 }, { "epoch": 0.7244408945686901, "grad_norm": 1.3404549541893849, "learning_rate": 9.403292223661811e-06, "loss": 0.1858, "step": 1814 }, { "epoch": 0.7252396166134185, "grad_norm": 1.2882725353355315, "learning_rate": 9.40108864302737e-06, "loss": 0.1918, "step": 1816 }, { "epoch": 0.7260383386581469, "grad_norm": 1.2513310169482343, "learning_rate": 9.398881260267589e-06, "loss": 0.1998, "step": 1818 }, { "epoch": 0.7268370607028753, "grad_norm": 1.337658331922668, "learning_rate": 9.396670077289443e-06, "loss": 0.1932, "step": 1820 }, { "epoch": 0.7276357827476039, "grad_norm": 1.2367717186372091, "learning_rate": 9.394455096003182e-06, "loss": 0.2068, "step": 1822 }, { "epoch": 0.7284345047923323, "grad_norm": 1.1564610765311962, "learning_rate": 9.392236318322339e-06, "loss": 0.1801, "step": 1824 }, { "epoch": 0.7292332268370607, "grad_norm": 1.2112589486368208, "learning_rate": 9.390013746163733e-06, "loss": 0.1837, "step": 1826 }, { "epoch": 0.7300319488817891, "grad_norm": 1.254583211491624, "learning_rate": 9.387787381447455e-06, "loss": 0.2105, "step": 1828 }, { "epoch": 0.7308306709265175, "grad_norm": 1.1680394605252384, "learning_rate": 9.385557226096873e-06, "loss": 0.1933, "step": 1830 }, { "epoch": 0.731629392971246, "grad_norm": 1.1210837547747836, "learning_rate": 9.383323282038632e-06, "loss": 0.1847, "step": 1832 }, { "epoch": 0.7324281150159745, "grad_norm": 1.2069663644377342, "learning_rate": 9.381085551202648e-06, "loss": 0.1968, "step": 1834 }, { "epoch": 0.7332268370607029, "grad_norm": 1.2372918873896177, "learning_rate": 9.378844035522112e-06, "loss": 0.1969, "step": 1836 }, { "epoch": 0.7340255591054313, "grad_norm": 1.1846330324034746, "learning_rate": 9.376598736933478e-06, "loss": 0.191, "step": 1838 }, { "epoch": 0.7348242811501597, "grad_norm": 1.1755331433603171, "learning_rate": 9.374349657376473e-06, "loss": 0.1647, "step": 1840 }, { "epoch": 0.7356230031948882, "grad_norm": 1.2996956651624354, "learning_rate": 9.372096798794093e-06, "loss": 0.182, "step": 1842 }, { "epoch": 0.7364217252396166, "grad_norm": 1.2224015710817062, "learning_rate": 9.36984016313259e-06, "loss": 0.1732, "step": 1844 }, { "epoch": 0.737220447284345, "grad_norm": 1.196928762655955, "learning_rate": 9.367579752341488e-06, "loss": 0.1863, "step": 1846 }, { "epoch": 0.7380191693290735, "grad_norm": 1.2553608491551194, "learning_rate": 9.365315568373569e-06, "loss": 0.1825, "step": 1848 }, { "epoch": 0.7388178913738019, "grad_norm": 1.1728882445017346, "learning_rate": 9.363047613184872e-06, "loss": 0.1767, "step": 1850 }, { "epoch": 0.7396166134185304, "grad_norm": 1.179558407608193, "learning_rate": 9.360775888734699e-06, "loss": 0.1862, "step": 1852 }, { "epoch": 0.7404153354632588, "grad_norm": 1.1401251188622028, "learning_rate": 9.358500396985603e-06, "loss": 0.1836, "step": 1854 }, { "epoch": 0.7412140575079872, "grad_norm": 1.1844349399914762, "learning_rate": 9.356221139903395e-06, "loss": 0.1885, "step": 1856 }, { "epoch": 0.7420127795527156, "grad_norm": 1.2044582641333985, "learning_rate": 9.353938119457137e-06, "loss": 0.1865, "step": 1858 }, { "epoch": 0.7428115015974441, "grad_norm": 1.2685669411269742, "learning_rate": 9.351651337619145e-06, "loss": 0.1754, "step": 1860 }, { "epoch": 0.7436102236421726, "grad_norm": 1.4085265142769048, "learning_rate": 9.349360796364984e-06, "loss": 0.1972, "step": 1862 }, { "epoch": 0.744408945686901, "grad_norm": 1.3358324906099626, "learning_rate": 9.347066497673462e-06, "loss": 0.1934, "step": 1864 }, { "epoch": 0.7452076677316294, "grad_norm": 1.1473591638995988, "learning_rate": 9.34476844352664e-06, "loss": 0.1756, "step": 1866 }, { "epoch": 0.7460063897763578, "grad_norm": 1.1732315132551707, "learning_rate": 9.342466635909815e-06, "loss": 0.1937, "step": 1868 }, { "epoch": 0.7468051118210862, "grad_norm": 1.0907285192894114, "learning_rate": 9.340161076811539e-06, "loss": 0.1769, "step": 1870 }, { "epoch": 0.7476038338658147, "grad_norm": 1.3140788804501988, "learning_rate": 9.337851768223589e-06, "loss": 0.2101, "step": 1872 }, { "epoch": 0.7484025559105432, "grad_norm": 1.2720702400836585, "learning_rate": 9.335538712140997e-06, "loss": 0.1755, "step": 1874 }, { "epoch": 0.7492012779552716, "grad_norm": 1.1203884095157366, "learning_rate": 9.333221910562022e-06, "loss": 0.1645, "step": 1876 }, { "epoch": 0.75, "grad_norm": 1.3590498055976206, "learning_rate": 9.330901365488163e-06, "loss": 0.1898, "step": 1878 }, { "epoch": 0.7507987220447284, "grad_norm": 1.2561659399741483, "learning_rate": 9.328577078924151e-06, "loss": 0.2081, "step": 1880 }, { "epoch": 0.7515974440894568, "grad_norm": 1.5567261987394814, "learning_rate": 9.326249052877949e-06, "loss": 0.1899, "step": 1882 }, { "epoch": 0.7523961661341853, "grad_norm": 1.10938481898858, "learning_rate": 9.323917289360755e-06, "loss": 0.1822, "step": 1884 }, { "epoch": 0.7531948881789138, "grad_norm": 1.199525742926509, "learning_rate": 9.321581790386989e-06, "loss": 0.1838, "step": 1886 }, { "epoch": 0.7539936102236422, "grad_norm": 1.2183809794152496, "learning_rate": 9.319242557974306e-06, "loss": 0.1813, "step": 1888 }, { "epoch": 0.7547923322683706, "grad_norm": 1.3361557842251508, "learning_rate": 9.316899594143581e-06, "loss": 0.1923, "step": 1890 }, { "epoch": 0.755591054313099, "grad_norm": 1.147280166061991, "learning_rate": 9.31455290091891e-06, "loss": 0.1681, "step": 1892 }, { "epoch": 0.7563897763578274, "grad_norm": 1.2035996373756572, "learning_rate": 9.31220248032762e-06, "loss": 0.1752, "step": 1894 }, { "epoch": 0.7571884984025559, "grad_norm": 1.2112026467710946, "learning_rate": 9.309848334400247e-06, "loss": 0.1862, "step": 1896 }, { "epoch": 0.7579872204472844, "grad_norm": 1.345396574129663, "learning_rate": 9.307490465170555e-06, "loss": 0.2152, "step": 1898 }, { "epoch": 0.7587859424920128, "grad_norm": 1.1731055253334703, "learning_rate": 9.30512887467552e-06, "loss": 0.1845, "step": 1900 }, { "epoch": 0.7595846645367412, "grad_norm": 1.2248021789089425, "learning_rate": 9.302763564955332e-06, "loss": 0.1896, "step": 1902 }, { "epoch": 0.7603833865814696, "grad_norm": 1.2159843127230634, "learning_rate": 9.300394538053395e-06, "loss": 0.2073, "step": 1904 }, { "epoch": 0.7611821086261981, "grad_norm": 1.0782210847536804, "learning_rate": 9.298021796016328e-06, "loss": 0.1965, "step": 1906 }, { "epoch": 0.7619808306709265, "grad_norm": 1.0674705155900905, "learning_rate": 9.295645340893954e-06, "loss": 0.1828, "step": 1908 }, { "epoch": 0.762779552715655, "grad_norm": 1.242030763933389, "learning_rate": 9.293265174739304e-06, "loss": 0.185, "step": 1910 }, { "epoch": 0.7635782747603834, "grad_norm": 1.2869300880171743, "learning_rate": 9.29088129960862e-06, "loss": 0.2072, "step": 1912 }, { "epoch": 0.7643769968051118, "grad_norm": 1.2987640167915715, "learning_rate": 9.288493717561346e-06, "loss": 0.1797, "step": 1914 }, { "epoch": 0.7651757188498403, "grad_norm": 1.360583270135005, "learning_rate": 9.286102430660124e-06, "loss": 0.1915, "step": 1916 }, { "epoch": 0.7659744408945687, "grad_norm": 1.1308395049372801, "learning_rate": 9.283707440970804e-06, "loss": 0.1732, "step": 1918 }, { "epoch": 0.7667731629392971, "grad_norm": 1.1740116097009, "learning_rate": 9.281308750562426e-06, "loss": 0.1954, "step": 1920 }, { "epoch": 0.7675718849840255, "grad_norm": 1.2021810735396423, "learning_rate": 9.278906361507238e-06, "loss": 0.1941, "step": 1922 }, { "epoch": 0.768370607028754, "grad_norm": 1.3054645933982225, "learning_rate": 9.276500275880676e-06, "loss": 0.2025, "step": 1924 }, { "epoch": 0.7691693290734825, "grad_norm": 1.2735690354152704, "learning_rate": 9.274090495761368e-06, "loss": 0.1977, "step": 1926 }, { "epoch": 0.7699680511182109, "grad_norm": 1.0912094717996301, "learning_rate": 9.271677023231137e-06, "loss": 0.1918, "step": 1928 }, { "epoch": 0.7707667731629393, "grad_norm": 1.2385601982012775, "learning_rate": 9.269259860375001e-06, "loss": 0.1853, "step": 1930 }, { "epoch": 0.7715654952076677, "grad_norm": 1.2615706597096068, "learning_rate": 9.266839009281154e-06, "loss": 0.1775, "step": 1932 }, { "epoch": 0.7723642172523961, "grad_norm": 1.2657440936466628, "learning_rate": 9.264414472040986e-06, "loss": 0.2122, "step": 1934 }, { "epoch": 0.7731629392971247, "grad_norm": 1.2434750494740934, "learning_rate": 9.261986250749068e-06, "loss": 0.1642, "step": 1936 }, { "epoch": 0.7739616613418531, "grad_norm": 1.2150467984561792, "learning_rate": 9.259554347503157e-06, "loss": 0.179, "step": 1938 }, { "epoch": 0.7747603833865815, "grad_norm": 1.0460909251749075, "learning_rate": 9.257118764404183e-06, "loss": 0.1766, "step": 1940 }, { "epoch": 0.7755591054313099, "grad_norm": 1.2648684414877398, "learning_rate": 9.254679503556261e-06, "loss": 0.1873, "step": 1942 }, { "epoch": 0.7763578274760383, "grad_norm": 1.0629676893025566, "learning_rate": 9.252236567066686e-06, "loss": 0.1765, "step": 1944 }, { "epoch": 0.7771565495207667, "grad_norm": 1.1476890935760726, "learning_rate": 9.249789957045921e-06, "loss": 0.1828, "step": 1946 }, { "epoch": 0.7779552715654952, "grad_norm": 1.255044114071295, "learning_rate": 9.247339675607606e-06, "loss": 0.1962, "step": 1948 }, { "epoch": 0.7787539936102237, "grad_norm": 1.1772586864322332, "learning_rate": 9.244885724868556e-06, "loss": 0.1948, "step": 1950 }, { "epoch": 0.7795527156549521, "grad_norm": 1.134211924482269, "learning_rate": 9.242428106948748e-06, "loss": 0.1797, "step": 1952 }, { "epoch": 0.7803514376996805, "grad_norm": 1.1580694174114012, "learning_rate": 9.239966823971339e-06, "loss": 0.1818, "step": 1954 }, { "epoch": 0.7811501597444089, "grad_norm": 1.3621884578061603, "learning_rate": 9.23750187806264e-06, "loss": 0.2012, "step": 1956 }, { "epoch": 0.7819488817891374, "grad_norm": 1.107597745587062, "learning_rate": 9.235033271352132e-06, "loss": 0.1816, "step": 1958 }, { "epoch": 0.7827476038338658, "grad_norm": 1.1104284440920194, "learning_rate": 9.23256100597246e-06, "loss": 0.1748, "step": 1960 }, { "epoch": 0.7835463258785943, "grad_norm": 1.0772833846504462, "learning_rate": 9.230085084059428e-06, "loss": 0.1577, "step": 1962 }, { "epoch": 0.7843450479233227, "grad_norm": 1.5348516969940666, "learning_rate": 9.227605507751998e-06, "loss": 0.1797, "step": 1964 }, { "epoch": 0.7851437699680511, "grad_norm": 1.1556035368347786, "learning_rate": 9.22512227919229e-06, "loss": 0.1904, "step": 1966 }, { "epoch": 0.7859424920127795, "grad_norm": 1.1541070739348673, "learning_rate": 9.22263540052558e-06, "loss": 0.1629, "step": 1968 }, { "epoch": 0.786741214057508, "grad_norm": 1.1977607582272027, "learning_rate": 9.220144873900294e-06, "loss": 0.1852, "step": 1970 }, { "epoch": 0.7875399361022364, "grad_norm": 1.2010602313619996, "learning_rate": 9.217650701468016e-06, "loss": 0.167, "step": 1972 }, { "epoch": 0.7883386581469649, "grad_norm": 1.2177543309257506, "learning_rate": 9.215152885383473e-06, "loss": 0.1785, "step": 1974 }, { "epoch": 0.7891373801916933, "grad_norm": 1.3217713389862689, "learning_rate": 9.212651427804544e-06, "loss": 0.2005, "step": 1976 }, { "epoch": 0.7899361022364217, "grad_norm": 1.2318521295090967, "learning_rate": 9.210146330892251e-06, "loss": 0.1804, "step": 1978 }, { "epoch": 0.7907348242811502, "grad_norm": 1.2625115330731185, "learning_rate": 9.20763759681076e-06, "loss": 0.1648, "step": 1980 }, { "epoch": 0.7915335463258786, "grad_norm": 1.1952075781633953, "learning_rate": 9.205125227727386e-06, "loss": 0.1753, "step": 1982 }, { "epoch": 0.792332268370607, "grad_norm": 1.2767418757560534, "learning_rate": 9.202609225812572e-06, "loss": 0.1694, "step": 1984 }, { "epoch": 0.7931309904153354, "grad_norm": 1.2452947191655441, "learning_rate": 9.200089593239911e-06, "loss": 0.2021, "step": 1986 }, { "epoch": 0.7939297124600639, "grad_norm": 1.2365293461357185, "learning_rate": 9.197566332186125e-06, "loss": 0.1858, "step": 1988 }, { "epoch": 0.7947284345047924, "grad_norm": 1.2036375025069872, "learning_rate": 9.195039444831076e-06, "loss": 0.1957, "step": 1990 }, { "epoch": 0.7955271565495208, "grad_norm": 1.2642281456239493, "learning_rate": 9.192508933357753e-06, "loss": 0.1764, "step": 1992 }, { "epoch": 0.7963258785942492, "grad_norm": 1.1075926832298097, "learning_rate": 9.189974799952283e-06, "loss": 0.1696, "step": 1994 }, { "epoch": 0.7971246006389776, "grad_norm": 1.1441930453048488, "learning_rate": 9.187437046803916e-06, "loss": 0.1882, "step": 1996 }, { "epoch": 0.797923322683706, "grad_norm": 1.1610488689052545, "learning_rate": 9.184895676105033e-06, "loss": 0.1844, "step": 1998 }, { "epoch": 0.7987220447284346, "grad_norm": 1.2457291872936878, "learning_rate": 9.182350690051134e-06, "loss": 0.187, "step": 2000 }, { "epoch": 0.7987220447284346, "eval_loss": 0.16647948324680328, "eval_runtime": 417.9228, "eval_samples_per_second": 42.608, "eval_steps_per_second": 5.326, "step": 2000 }, { "epoch": 0.799520766773163, "grad_norm": 1.1216999386193873, "learning_rate": 9.179802090840852e-06, "loss": 0.1554, "step": 2002 }, { "epoch": 0.8003194888178914, "grad_norm": 1.2327864947476759, "learning_rate": 9.177249880675934e-06, "loss": 0.19, "step": 2004 }, { "epoch": 0.8011182108626198, "grad_norm": 1.1765118852291059, "learning_rate": 9.174694061761249e-06, "loss": 0.1752, "step": 2006 }, { "epoch": 0.8019169329073482, "grad_norm": 1.3929323793396107, "learning_rate": 9.172134636304783e-06, "loss": 0.197, "step": 2008 }, { "epoch": 0.8027156549520766, "grad_norm": 1.1116097957166795, "learning_rate": 9.169571606517637e-06, "loss": 0.1821, "step": 2010 }, { "epoch": 0.8035143769968051, "grad_norm": 1.1758254974846156, "learning_rate": 9.16700497461403e-06, "loss": 0.1801, "step": 2012 }, { "epoch": 0.8043130990415336, "grad_norm": 1.2306371246505314, "learning_rate": 9.164434742811287e-06, "loss": 0.1888, "step": 2014 }, { "epoch": 0.805111821086262, "grad_norm": 8.822301554752972, "learning_rate": 9.16186091332985e-06, "loss": 0.2123, "step": 2016 }, { "epoch": 0.8059105431309904, "grad_norm": 3.5201171269663365, "learning_rate": 9.159283488393259e-06, "loss": 0.1897, "step": 2018 }, { "epoch": 0.8067092651757188, "grad_norm": 1.5889480805348766, "learning_rate": 9.15670247022817e-06, "loss": 0.1948, "step": 2020 }, { "epoch": 0.8075079872204473, "grad_norm": 1.2452616237787566, "learning_rate": 9.154117861064337e-06, "loss": 0.1736, "step": 2022 }, { "epoch": 0.8083067092651757, "grad_norm": 1.1779378285841855, "learning_rate": 9.15152966313462e-06, "loss": 0.1949, "step": 2024 }, { "epoch": 0.8091054313099042, "grad_norm": 1.1749802960279865, "learning_rate": 9.148937878674975e-06, "loss": 0.1931, "step": 2026 }, { "epoch": 0.8099041533546326, "grad_norm": 1.1577483816469516, "learning_rate": 9.146342509924464e-06, "loss": 0.1761, "step": 2028 }, { "epoch": 0.810702875399361, "grad_norm": 1.1493255720503592, "learning_rate": 9.143743559125238e-06, "loss": 0.2112, "step": 2030 }, { "epoch": 0.8115015974440895, "grad_norm": 1.1867113395597348, "learning_rate": 9.141141028522544e-06, "loss": 0.1871, "step": 2032 }, { "epoch": 0.8123003194888179, "grad_norm": 1.2097756605247039, "learning_rate": 9.138534920364725e-06, "loss": 0.1895, "step": 2034 }, { "epoch": 0.8130990415335463, "grad_norm": 1.2656565196236773, "learning_rate": 9.135925236903213e-06, "loss": 0.1931, "step": 2036 }, { "epoch": 0.8138977635782748, "grad_norm": 1.1498073348190923, "learning_rate": 9.133311980392525e-06, "loss": 0.1622, "step": 2038 }, { "epoch": 0.8146964856230032, "grad_norm": 1.1313101324027541, "learning_rate": 9.130695153090272e-06, "loss": 0.1909, "step": 2040 }, { "epoch": 0.8154952076677316, "grad_norm": 1.2312685403966457, "learning_rate": 9.128074757257142e-06, "loss": 0.2096, "step": 2042 }, { "epoch": 0.8162939297124601, "grad_norm": 1.1681617587331703, "learning_rate": 9.125450795156913e-06, "loss": 0.168, "step": 2044 }, { "epoch": 0.8170926517571885, "grad_norm": 1.1768723977340287, "learning_rate": 9.12282326905644e-06, "loss": 0.1889, "step": 2046 }, { "epoch": 0.8178913738019169, "grad_norm": 1.0800467032397891, "learning_rate": 9.120192181225658e-06, "loss": 0.1948, "step": 2048 }, { "epoch": 0.8186900958466453, "grad_norm": 1.1247061859219583, "learning_rate": 9.117557533937575e-06, "loss": 0.1713, "step": 2050 }, { "epoch": 0.8194888178913738, "grad_norm": 1.172438969339138, "learning_rate": 9.114919329468283e-06, "loss": 0.162, "step": 2052 }, { "epoch": 0.8202875399361023, "grad_norm": 1.2278975843705813, "learning_rate": 9.112277570096938e-06, "loss": 0.1865, "step": 2054 }, { "epoch": 0.8210862619808307, "grad_norm": 1.2992348324926213, "learning_rate": 9.109632258105771e-06, "loss": 0.2055, "step": 2056 }, { "epoch": 0.8218849840255591, "grad_norm": 1.251504510998654, "learning_rate": 9.106983395780086e-06, "loss": 0.1843, "step": 2058 }, { "epoch": 0.8226837060702875, "grad_norm": 1.276208869165608, "learning_rate": 9.104330985408245e-06, "loss": 0.1886, "step": 2060 }, { "epoch": 0.8234824281150159, "grad_norm": 1.356357122339326, "learning_rate": 9.101675029281683e-06, "loss": 0.1981, "step": 2062 }, { "epoch": 0.8242811501597445, "grad_norm": 1.2399736943434136, "learning_rate": 9.099015529694894e-06, "loss": 0.1897, "step": 2064 }, { "epoch": 0.8250798722044729, "grad_norm": 1.2876220526804583, "learning_rate": 9.096352488945437e-06, "loss": 0.1796, "step": 2066 }, { "epoch": 0.8258785942492013, "grad_norm": 1.2532320467708493, "learning_rate": 9.093685909333926e-06, "loss": 0.1788, "step": 2068 }, { "epoch": 0.8266773162939297, "grad_norm": 1.2586518550619221, "learning_rate": 9.091015793164035e-06, "loss": 0.1768, "step": 2070 }, { "epoch": 0.8274760383386581, "grad_norm": 1.1394590760675556, "learning_rate": 9.088342142742493e-06, "loss": 0.1741, "step": 2072 }, { "epoch": 0.8282747603833865, "grad_norm": 1.09902223120881, "learning_rate": 9.08566496037908e-06, "loss": 0.1749, "step": 2074 }, { "epoch": 0.829073482428115, "grad_norm": 1.094791391856433, "learning_rate": 9.08298424838663e-06, "loss": 0.1785, "step": 2076 }, { "epoch": 0.8298722044728435, "grad_norm": 1.3016717542317755, "learning_rate": 9.080300009081025e-06, "loss": 0.2058, "step": 2078 }, { "epoch": 0.8306709265175719, "grad_norm": 1.2437203627775957, "learning_rate": 9.077612244781196e-06, "loss": 0.18, "step": 2080 }, { "epoch": 0.8314696485623003, "grad_norm": 1.1494190321851014, "learning_rate": 9.074920957809115e-06, "loss": 0.1993, "step": 2082 }, { "epoch": 0.8322683706070287, "grad_norm": 1.197846354885409, "learning_rate": 9.0722261504898e-06, "loss": 0.1924, "step": 2084 }, { "epoch": 0.8330670926517572, "grad_norm": 1.2305171514008204, "learning_rate": 9.069527825151314e-06, "loss": 0.2045, "step": 2086 }, { "epoch": 0.8338658146964856, "grad_norm": 1.1363998111576086, "learning_rate": 9.066825984124751e-06, "loss": 0.1878, "step": 2088 }, { "epoch": 0.8346645367412141, "grad_norm": 1.2416863327291112, "learning_rate": 9.064120629744253e-06, "loss": 0.187, "step": 2090 }, { "epoch": 0.8354632587859425, "grad_norm": 1.1003807898549165, "learning_rate": 9.061411764346983e-06, "loss": 0.1727, "step": 2092 }, { "epoch": 0.8362619808306709, "grad_norm": 1.1391607571000586, "learning_rate": 9.05869939027315e-06, "loss": 0.1792, "step": 2094 }, { "epoch": 0.8370607028753994, "grad_norm": 1.191740022915474, "learning_rate": 9.055983509865988e-06, "loss": 0.1852, "step": 2096 }, { "epoch": 0.8378594249201278, "grad_norm": 1.263318830688807, "learning_rate": 9.053264125471763e-06, "loss": 0.1794, "step": 2098 }, { "epoch": 0.8386581469648562, "grad_norm": 1.2749000778835136, "learning_rate": 9.050541239439764e-06, "loss": 0.1683, "step": 2100 }, { "epoch": 0.8394568690095847, "grad_norm": 1.264926687502538, "learning_rate": 9.04781485412231e-06, "loss": 0.1811, "step": 2102 }, { "epoch": 0.8402555910543131, "grad_norm": 1.218176101735685, "learning_rate": 9.045084971874738e-06, "loss": 0.1826, "step": 2104 }, { "epoch": 0.8410543130990416, "grad_norm": 1.1339821842224997, "learning_rate": 9.04235159505541e-06, "loss": 0.1748, "step": 2106 }, { "epoch": 0.84185303514377, "grad_norm": 1.228556794787821, "learning_rate": 9.039614726025708e-06, "loss": 0.1816, "step": 2108 }, { "epoch": 0.8426517571884984, "grad_norm": 1.1590750565644556, "learning_rate": 9.036874367150024e-06, "loss": 0.1919, "step": 2110 }, { "epoch": 0.8434504792332268, "grad_norm": 1.1992254533267062, "learning_rate": 9.034130520795774e-06, "loss": 0.1786, "step": 2112 }, { "epoch": 0.8442492012779552, "grad_norm": 1.2866570327279305, "learning_rate": 9.03138318933338e-06, "loss": 0.1779, "step": 2114 }, { "epoch": 0.8450479233226837, "grad_norm": 1.1304382720588986, "learning_rate": 9.028632375136277e-06, "loss": 0.1916, "step": 2116 }, { "epoch": 0.8458466453674122, "grad_norm": 1.2026241441378607, "learning_rate": 9.025878080580908e-06, "loss": 0.1865, "step": 2118 }, { "epoch": 0.8466453674121406, "grad_norm": 1.2239810455485836, "learning_rate": 9.023120308046726e-06, "loss": 0.2069, "step": 2120 }, { "epoch": 0.847444089456869, "grad_norm": 1.1593857182783416, "learning_rate": 9.020359059916189e-06, "loss": 0.1802, "step": 2122 }, { "epoch": 0.8482428115015974, "grad_norm": 1.1851101442311125, "learning_rate": 9.017594338574746e-06, "loss": 0.1718, "step": 2124 }, { "epoch": 0.8490415335463258, "grad_norm": 1.3424377790206583, "learning_rate": 9.014826146410863e-06, "loss": 0.187, "step": 2126 }, { "epoch": 0.8498402555910544, "grad_norm": 1.2211268818753362, "learning_rate": 9.012054485815995e-06, "loss": 0.2054, "step": 2128 }, { "epoch": 0.8506389776357828, "grad_norm": 1.2093558338280859, "learning_rate": 9.009279359184594e-06, "loss": 0.1853, "step": 2130 }, { "epoch": 0.8514376996805112, "grad_norm": 1.3142251663193079, "learning_rate": 9.006500768914106e-06, "loss": 0.182, "step": 2132 }, { "epoch": 0.8522364217252396, "grad_norm": 1.1390165214451813, "learning_rate": 9.003718717404977e-06, "loss": 0.1714, "step": 2134 }, { "epoch": 0.853035143769968, "grad_norm": 1.2833538623348046, "learning_rate": 9.00093320706063e-06, "loss": 0.1829, "step": 2136 }, { "epoch": 0.8538338658146964, "grad_norm": 1.201198063387828, "learning_rate": 8.998144240287487e-06, "loss": 0.1836, "step": 2138 }, { "epoch": 0.854632587859425, "grad_norm": 1.26048990415032, "learning_rate": 8.995351819494954e-06, "loss": 0.1823, "step": 2140 }, { "epoch": 0.8554313099041534, "grad_norm": 1.1311889575435519, "learning_rate": 8.992555947095414e-06, "loss": 0.1692, "step": 2142 }, { "epoch": 0.8562300319488818, "grad_norm": 1.351907102192122, "learning_rate": 8.989756625504237e-06, "loss": 0.1979, "step": 2144 }, { "epoch": 0.8570287539936102, "grad_norm": 1.3577115649569347, "learning_rate": 8.98695385713978e-06, "loss": 0.1888, "step": 2146 }, { "epoch": 0.8578274760383386, "grad_norm": 1.1332848107317308, "learning_rate": 8.984147644423362e-06, "loss": 0.1737, "step": 2148 }, { "epoch": 0.8586261980830671, "grad_norm": 1.276000255217281, "learning_rate": 8.981337989779291e-06, "loss": 0.1904, "step": 2150 }, { "epoch": 0.8594249201277955, "grad_norm": 1.2196944395263352, "learning_rate": 8.978524895634842e-06, "loss": 0.1976, "step": 2152 }, { "epoch": 0.860223642172524, "grad_norm": 1.0522481436377091, "learning_rate": 8.975708364420264e-06, "loss": 0.1529, "step": 2154 }, { "epoch": 0.8610223642172524, "grad_norm": 1.2862973570919587, "learning_rate": 8.972888398568772e-06, "loss": 0.1941, "step": 2156 }, { "epoch": 0.8618210862619808, "grad_norm": 1.1472291142227702, "learning_rate": 8.970065000516553e-06, "loss": 0.1884, "step": 2158 }, { "epoch": 0.8626198083067093, "grad_norm": 1.2425852591025766, "learning_rate": 8.967238172702754e-06, "loss": 0.1758, "step": 2160 }, { "epoch": 0.8634185303514377, "grad_norm": 1.1523178078837133, "learning_rate": 8.964407917569488e-06, "loss": 0.1773, "step": 2162 }, { "epoch": 0.8642172523961661, "grad_norm": 1.0544506917154455, "learning_rate": 8.96157423756183e-06, "loss": 0.1782, "step": 2164 }, { "epoch": 0.8650159744408946, "grad_norm": 1.398137403531195, "learning_rate": 8.958737135127812e-06, "loss": 0.1888, "step": 2166 }, { "epoch": 0.865814696485623, "grad_norm": 1.1006386353076576, "learning_rate": 8.95589661271842e-06, "loss": 0.1613, "step": 2168 }, { "epoch": 0.8666134185303515, "grad_norm": 1.1816019658476726, "learning_rate": 8.953052672787602e-06, "loss": 0.1828, "step": 2170 }, { "epoch": 0.8674121405750799, "grad_norm": 1.160479928900505, "learning_rate": 8.95020531779225e-06, "loss": 0.1816, "step": 2172 }, { "epoch": 0.8682108626198083, "grad_norm": 1.1134898684750454, "learning_rate": 8.94735455019221e-06, "loss": 0.1806, "step": 2174 }, { "epoch": 0.8690095846645367, "grad_norm": 1.1953510040643958, "learning_rate": 8.94450037245028e-06, "loss": 0.1759, "step": 2176 }, { "epoch": 0.8698083067092651, "grad_norm": 1.2112582987106082, "learning_rate": 8.941642787032197e-06, "loss": 0.1684, "step": 2178 }, { "epoch": 0.8706070287539937, "grad_norm": 1.3031673371016932, "learning_rate": 8.938781796406646e-06, "loss": 0.1846, "step": 2180 }, { "epoch": 0.8714057507987221, "grad_norm": 1.247581011761579, "learning_rate": 8.935917403045251e-06, "loss": 0.1849, "step": 2182 }, { "epoch": 0.8722044728434505, "grad_norm": 1.2073797176687306, "learning_rate": 8.933049609422582e-06, "loss": 0.17, "step": 2184 }, { "epoch": 0.8730031948881789, "grad_norm": 1.1534987114988338, "learning_rate": 8.930178418016138e-06, "loss": 0.185, "step": 2186 }, { "epoch": 0.8738019169329073, "grad_norm": 1.1820635621691253, "learning_rate": 8.92730383130636e-06, "loss": 0.1632, "step": 2188 }, { "epoch": 0.8746006389776357, "grad_norm": 1.1918298750579541, "learning_rate": 8.924425851776619e-06, "loss": 0.1789, "step": 2190 }, { "epoch": 0.8753993610223643, "grad_norm": 1.2772477046804016, "learning_rate": 8.921544481913218e-06, "loss": 0.1932, "step": 2192 }, { "epoch": 0.8761980830670927, "grad_norm": 1.1382431039382104, "learning_rate": 8.918659724205387e-06, "loss": 0.1829, "step": 2194 }, { "epoch": 0.8769968051118211, "grad_norm": 1.124034930363523, "learning_rate": 8.915771581145286e-06, "loss": 0.157, "step": 2196 }, { "epoch": 0.8777955271565495, "grad_norm": 1.1938500101023577, "learning_rate": 8.912880055227998e-06, "loss": 0.1873, "step": 2198 }, { "epoch": 0.8785942492012779, "grad_norm": 1.247886644845545, "learning_rate": 8.909985148951528e-06, "loss": 0.1893, "step": 2200 }, { "epoch": 0.8793929712460063, "grad_norm": 1.2882483087755097, "learning_rate": 8.907086864816804e-06, "loss": 0.1986, "step": 2202 }, { "epoch": 0.8801916932907349, "grad_norm": 1.1916502807322977, "learning_rate": 8.904185205327667e-06, "loss": 0.1585, "step": 2204 }, { "epoch": 0.8809904153354633, "grad_norm": 1.1609680993035214, "learning_rate": 8.901280172990878e-06, "loss": 0.1756, "step": 2206 }, { "epoch": 0.8817891373801917, "grad_norm": 1.0458575482646773, "learning_rate": 8.898371770316113e-06, "loss": 0.1545, "step": 2208 }, { "epoch": 0.8825878594249201, "grad_norm": 1.3354091660753635, "learning_rate": 8.895459999815954e-06, "loss": 0.1997, "step": 2210 }, { "epoch": 0.8833865814696485, "grad_norm": 1.1262191292997419, "learning_rate": 8.892544864005899e-06, "loss": 0.1743, "step": 2212 }, { "epoch": 0.884185303514377, "grad_norm": 1.3566110237290778, "learning_rate": 8.889626365404348e-06, "loss": 0.1922, "step": 2214 }, { "epoch": 0.8849840255591054, "grad_norm": 1.376910187006041, "learning_rate": 8.886704506532611e-06, "loss": 0.1913, "step": 2216 }, { "epoch": 0.8857827476038339, "grad_norm": 1.2397186314872872, "learning_rate": 8.883779289914894e-06, "loss": 0.1732, "step": 2218 }, { "epoch": 0.8865814696485623, "grad_norm": 1.2887051577849917, "learning_rate": 8.880850718078313e-06, "loss": 0.2123, "step": 2220 }, { "epoch": 0.8873801916932907, "grad_norm": 1.2786244843973318, "learning_rate": 8.877918793552875e-06, "loss": 0.1828, "step": 2222 }, { "epoch": 0.8881789137380192, "grad_norm": 1.0958110453050904, "learning_rate": 8.874983518871488e-06, "loss": 0.1746, "step": 2224 }, { "epoch": 0.8889776357827476, "grad_norm": 1.2263512727265042, "learning_rate": 8.87204489656995e-06, "loss": 0.2076, "step": 2226 }, { "epoch": 0.889776357827476, "grad_norm": 1.2566470262879932, "learning_rate": 8.869102929186954e-06, "loss": 0.1942, "step": 2228 }, { "epoch": 0.8905750798722045, "grad_norm": 1.313910249745457, "learning_rate": 8.866157619264086e-06, "loss": 0.2064, "step": 2230 }, { "epoch": 0.8913738019169329, "grad_norm": 1.2130369145657738, "learning_rate": 8.86320896934581e-06, "loss": 0.176, "step": 2232 }, { "epoch": 0.8921725239616614, "grad_norm": 1.1816578044938435, "learning_rate": 8.860256981979485e-06, "loss": 0.1932, "step": 2234 }, { "epoch": 0.8929712460063898, "grad_norm": 1.095754287469498, "learning_rate": 8.857301659715348e-06, "loss": 0.1808, "step": 2236 }, { "epoch": 0.8937699680511182, "grad_norm": 1.2405828782550468, "learning_rate": 8.854343005106521e-06, "loss": 0.1975, "step": 2238 }, { "epoch": 0.8945686900958466, "grad_norm": 1.139128509869952, "learning_rate": 8.851381020709e-06, "loss": 0.1638, "step": 2240 }, { "epoch": 0.895367412140575, "grad_norm": 1.1346192386483716, "learning_rate": 8.848415709081659e-06, "loss": 0.1621, "step": 2242 }, { "epoch": 0.8961661341853036, "grad_norm": 1.2827068653023486, "learning_rate": 8.845447072786251e-06, "loss": 0.19, "step": 2244 }, { "epoch": 0.896964856230032, "grad_norm": 1.1215489197999844, "learning_rate": 8.842475114387394e-06, "loss": 0.1732, "step": 2246 }, { "epoch": 0.8977635782747604, "grad_norm": 1.1479343886524802, "learning_rate": 8.839499836452584e-06, "loss": 0.1727, "step": 2248 }, { "epoch": 0.8985623003194888, "grad_norm": 1.1534484096320634, "learning_rate": 8.836521241552177e-06, "loss": 0.1885, "step": 2250 }, { "epoch": 0.8993610223642172, "grad_norm": 1.234578241869321, "learning_rate": 8.833539332259398e-06, "loss": 0.1672, "step": 2252 }, { "epoch": 0.9001597444089456, "grad_norm": 1.2103918137414391, "learning_rate": 8.830554111150337e-06, "loss": 0.1825, "step": 2254 }, { "epoch": 0.9009584664536742, "grad_norm": 1.0351256185130608, "learning_rate": 8.827565580803944e-06, "loss": 0.1624, "step": 2256 }, { "epoch": 0.9017571884984026, "grad_norm": 1.1562824946118846, "learning_rate": 8.824573743802023e-06, "loss": 0.1724, "step": 2258 }, { "epoch": 0.902555910543131, "grad_norm": 1.352880063666687, "learning_rate": 8.821578602729242e-06, "loss": 0.1709, "step": 2260 }, { "epoch": 0.9033546325878594, "grad_norm": 1.2118411341735884, "learning_rate": 8.81858016017312e-06, "loss": 0.172, "step": 2262 }, { "epoch": 0.9041533546325878, "grad_norm": 1.2657731749208432, "learning_rate": 8.815578418724031e-06, "loss": 0.1944, "step": 2264 }, { "epoch": 0.9049520766773163, "grad_norm": 1.2696811826921244, "learning_rate": 8.812573380975191e-06, "loss": 0.1945, "step": 2266 }, { "epoch": 0.9057507987220448, "grad_norm": 1.260789859526146, "learning_rate": 8.809565049522673e-06, "loss": 0.187, "step": 2268 }, { "epoch": 0.9065495207667732, "grad_norm": 1.1561924099805676, "learning_rate": 8.806553426965391e-06, "loss": 0.184, "step": 2270 }, { "epoch": 0.9073482428115016, "grad_norm": 1.2184507876835764, "learning_rate": 8.803538515905102e-06, "loss": 0.181, "step": 2272 }, { "epoch": 0.90814696485623, "grad_norm": 1.1489816284226748, "learning_rate": 8.800520318946404e-06, "loss": 0.1601, "step": 2274 }, { "epoch": 0.9089456869009584, "grad_norm": 1.150382635417185, "learning_rate": 8.797498838696737e-06, "loss": 0.173, "step": 2276 }, { "epoch": 0.9097444089456869, "grad_norm": 1.2396571356927486, "learning_rate": 8.79447407776637e-06, "loss": 0.1775, "step": 2278 }, { "epoch": 0.9105431309904153, "grad_norm": 1.1258691868543116, "learning_rate": 8.791446038768416e-06, "loss": 0.1796, "step": 2280 }, { "epoch": 0.9113418530351438, "grad_norm": 1.1837850225751487, "learning_rate": 8.788414724318814e-06, "loss": 0.186, "step": 2282 }, { "epoch": 0.9121405750798722, "grad_norm": 1.1771823649393418, "learning_rate": 8.785380137036332e-06, "loss": 0.1774, "step": 2284 }, { "epoch": 0.9129392971246006, "grad_norm": 1.195664396595768, "learning_rate": 8.782342279542569e-06, "loss": 0.1715, "step": 2286 }, { "epoch": 0.9137380191693291, "grad_norm": 1.394493043461634, "learning_rate": 8.779301154461945e-06, "loss": 0.1954, "step": 2288 }, { "epoch": 0.9145367412140575, "grad_norm": 1.484773906372258, "learning_rate": 8.776256764421706e-06, "loss": 0.1649, "step": 2290 }, { "epoch": 0.9153354632587859, "grad_norm": 1.1713356392013368, "learning_rate": 8.773209112051919e-06, "loss": 0.1676, "step": 2292 }, { "epoch": 0.9161341853035144, "grad_norm": 1.2492572619724682, "learning_rate": 8.770158199985466e-06, "loss": 0.203, "step": 2294 }, { "epoch": 0.9169329073482428, "grad_norm": 1.112250308815286, "learning_rate": 8.76710403085805e-06, "loss": 0.1675, "step": 2296 }, { "epoch": 0.9177316293929713, "grad_norm": 1.2119980335045586, "learning_rate": 8.764046607308183e-06, "loss": 0.1822, "step": 2298 }, { "epoch": 0.9185303514376997, "grad_norm": 1.2003429295335617, "learning_rate": 8.760985931977191e-06, "loss": 0.1648, "step": 2300 }, { "epoch": 0.9193290734824281, "grad_norm": 1.2923004205685706, "learning_rate": 8.757922007509208e-06, "loss": 0.1672, "step": 2302 }, { "epoch": 0.9201277955271565, "grad_norm": 1.2602401218318184, "learning_rate": 8.754854836551174e-06, "loss": 0.1697, "step": 2304 }, { "epoch": 0.920926517571885, "grad_norm": 1.3225061494713877, "learning_rate": 8.75178442175284e-06, "loss": 0.2173, "step": 2306 }, { "epoch": 0.9217252396166135, "grad_norm": 1.236225325668187, "learning_rate": 8.748710765766752e-06, "loss": 0.1748, "step": 2308 }, { "epoch": 0.9225239616613419, "grad_norm": 1.5668493173725866, "learning_rate": 8.745633871248257e-06, "loss": 0.1792, "step": 2310 }, { "epoch": 0.9233226837060703, "grad_norm": 1.2818950876816324, "learning_rate": 8.742553740855507e-06, "loss": 0.1701, "step": 2312 }, { "epoch": 0.9241214057507987, "grad_norm": 1.0563021739777534, "learning_rate": 8.739470377249436e-06, "loss": 0.1673, "step": 2314 }, { "epoch": 0.9249201277955271, "grad_norm": 1.1015536336953309, "learning_rate": 8.736383783093788e-06, "loss": 0.1768, "step": 2316 }, { "epoch": 0.9257188498402555, "grad_norm": 1.2209081010826341, "learning_rate": 8.733293961055082e-06, "loss": 0.1893, "step": 2318 }, { "epoch": 0.9265175718849841, "grad_norm": 1.1147003675746905, "learning_rate": 8.730200913802638e-06, "loss": 0.1727, "step": 2320 }, { "epoch": 0.9273162939297125, "grad_norm": 1.1772795182600777, "learning_rate": 8.727104644008553e-06, "loss": 0.1777, "step": 2322 }, { "epoch": 0.9281150159744409, "grad_norm": 1.106532804822251, "learning_rate": 8.724005154347714e-06, "loss": 0.1781, "step": 2324 }, { "epoch": 0.9289137380191693, "grad_norm": 1.15574880894598, "learning_rate": 8.720902447497788e-06, "loss": 0.1777, "step": 2326 }, { "epoch": 0.9297124600638977, "grad_norm": 1.2312851048264675, "learning_rate": 8.717796526139218e-06, "loss": 0.204, "step": 2328 }, { "epoch": 0.9305111821086262, "grad_norm": 1.261983053853919, "learning_rate": 8.71468739295523e-06, "loss": 0.1993, "step": 2330 }, { "epoch": 0.9313099041533547, "grad_norm": 1.1982722648050632, "learning_rate": 8.711575050631823e-06, "loss": 0.1968, "step": 2332 }, { "epoch": 0.9321086261980831, "grad_norm": 1.1624739320562267, "learning_rate": 8.708459501857762e-06, "loss": 0.1864, "step": 2334 }, { "epoch": 0.9329073482428115, "grad_norm": 1.027366394484867, "learning_rate": 8.70534074932459e-06, "loss": 0.1687, "step": 2336 }, { "epoch": 0.9337060702875399, "grad_norm": 1.1021491198370645, "learning_rate": 8.702218795726619e-06, "loss": 0.1684, "step": 2338 }, { "epoch": 0.9345047923322684, "grad_norm": 1.1040575428181885, "learning_rate": 8.699093643760914e-06, "loss": 0.1739, "step": 2340 }, { "epoch": 0.9353035143769968, "grad_norm": 1.195647306811607, "learning_rate": 8.695965296127318e-06, "loss": 0.1816, "step": 2342 }, { "epoch": 0.9361022364217252, "grad_norm": 1.0812569337947433, "learning_rate": 8.692833755528426e-06, "loss": 0.1766, "step": 2344 }, { "epoch": 0.9369009584664537, "grad_norm": 1.3202981154225082, "learning_rate": 8.689699024669594e-06, "loss": 0.1879, "step": 2346 }, { "epoch": 0.9376996805111821, "grad_norm": 1.1970245625047584, "learning_rate": 8.686561106258932e-06, "loss": 0.1803, "step": 2348 }, { "epoch": 0.9384984025559105, "grad_norm": 1.2213363903695773, "learning_rate": 8.683420003007308e-06, "loss": 0.1947, "step": 2350 }, { "epoch": 0.939297124600639, "grad_norm": 1.168137561259312, "learning_rate": 8.680275717628336e-06, "loss": 0.1683, "step": 2352 }, { "epoch": 0.9400958466453674, "grad_norm": 1.1140547555618086, "learning_rate": 8.677128252838386e-06, "loss": 0.1808, "step": 2354 }, { "epoch": 0.9408945686900958, "grad_norm": 1.0913344127324964, "learning_rate": 8.673977611356567e-06, "loss": 0.1768, "step": 2356 }, { "epoch": 0.9416932907348243, "grad_norm": 1.2470605938854136, "learning_rate": 8.670823795904737e-06, "loss": 0.1881, "step": 2358 }, { "epoch": 0.9424920127795527, "grad_norm": 1.1432903930033393, "learning_rate": 8.667666809207495e-06, "loss": 0.1814, "step": 2360 }, { "epoch": 0.9432907348242812, "grad_norm": 1.1254326884247754, "learning_rate": 8.664506653992181e-06, "loss": 0.1721, "step": 2362 }, { "epoch": 0.9440894568690096, "grad_norm": 1.1774199109472463, "learning_rate": 8.661343332988869e-06, "loss": 0.1644, "step": 2364 }, { "epoch": 0.944888178913738, "grad_norm": 1.1966632218098605, "learning_rate": 8.65817684893037e-06, "loss": 0.1735, "step": 2366 }, { "epoch": 0.9456869009584664, "grad_norm": 1.0407093351327132, "learning_rate": 8.655007204552228e-06, "loss": 0.1544, "step": 2368 }, { "epoch": 0.9464856230031949, "grad_norm": 1.2274155096905306, "learning_rate": 8.651834402592719e-06, "loss": 0.1798, "step": 2370 }, { "epoch": 0.9472843450479234, "grad_norm": 1.267776362466128, "learning_rate": 8.64865844579284e-06, "loss": 0.1751, "step": 2372 }, { "epoch": 0.9480830670926518, "grad_norm": 1.1768732812780864, "learning_rate": 8.64547933689632e-06, "loss": 0.1785, "step": 2374 }, { "epoch": 0.9488817891373802, "grad_norm": 1.1577676779075388, "learning_rate": 8.64229707864961e-06, "loss": 0.1947, "step": 2376 }, { "epoch": 0.9496805111821086, "grad_norm": 1.1591103988078189, "learning_rate": 8.63911167380188e-06, "loss": 0.1788, "step": 2378 }, { "epoch": 0.950479233226837, "grad_norm": 1.11481900810481, "learning_rate": 8.635923125105019e-06, "loss": 0.1676, "step": 2380 }, { "epoch": 0.9512779552715654, "grad_norm": 1.2948111421923072, "learning_rate": 8.632731435313634e-06, "loss": 0.1645, "step": 2382 }, { "epoch": 0.952076677316294, "grad_norm": 1.1733292169857867, "learning_rate": 8.629536607185042e-06, "loss": 0.1701, "step": 2384 }, { "epoch": 0.9528753993610224, "grad_norm": 1.1636636504635067, "learning_rate": 8.626338643479275e-06, "loss": 0.17, "step": 2386 }, { "epoch": 0.9536741214057508, "grad_norm": 1.2570201463967705, "learning_rate": 8.62313754695907e-06, "loss": 0.1738, "step": 2388 }, { "epoch": 0.9544728434504792, "grad_norm": 1.0902131985821306, "learning_rate": 8.619933320389872e-06, "loss": 0.1723, "step": 2390 }, { "epoch": 0.9552715654952076, "grad_norm": 1.2111908413145964, "learning_rate": 8.616725966539831e-06, "loss": 0.1926, "step": 2392 }, { "epoch": 0.9560702875399361, "grad_norm": 1.1942013221000287, "learning_rate": 8.6135154881798e-06, "loss": 0.1888, "step": 2394 }, { "epoch": 0.9568690095846646, "grad_norm": 1.1699831896669461, "learning_rate": 8.610301888083327e-06, "loss": 0.2023, "step": 2396 }, { "epoch": 0.957667731629393, "grad_norm": 1.1882929391966546, "learning_rate": 8.607085169026661e-06, "loss": 0.1852, "step": 2398 }, { "epoch": 0.9584664536741214, "grad_norm": 1.1905600662435856, "learning_rate": 8.60386533378874e-06, "loss": 0.1919, "step": 2400 }, { "epoch": 0.9592651757188498, "grad_norm": 1.2472441081594776, "learning_rate": 8.600642385151206e-06, "loss": 0.1923, "step": 2402 }, { "epoch": 0.9600638977635783, "grad_norm": 1.0759279066292031, "learning_rate": 8.597416325898373e-06, "loss": 0.1685, "step": 2404 }, { "epoch": 0.9608626198083067, "grad_norm": 1.1583330331881445, "learning_rate": 8.594187158817257e-06, "loss": 0.1749, "step": 2406 }, { "epoch": 0.9616613418530351, "grad_norm": 1.2089521342785554, "learning_rate": 8.590954886697554e-06, "loss": 0.1939, "step": 2408 }, { "epoch": 0.9624600638977636, "grad_norm": 1.2614329419101098, "learning_rate": 8.58771951233164e-06, "loss": 0.2026, "step": 2410 }, { "epoch": 0.963258785942492, "grad_norm": 1.1366241513550654, "learning_rate": 8.584481038514573e-06, "loss": 0.1639, "step": 2412 }, { "epoch": 0.9640575079872205, "grad_norm": 1.3624993080110601, "learning_rate": 8.581239468044093e-06, "loss": 0.185, "step": 2414 }, { "epoch": 0.9648562300319489, "grad_norm": 1.0567057031046825, "learning_rate": 8.577994803720605e-06, "loss": 0.1661, "step": 2416 }, { "epoch": 0.9656549520766773, "grad_norm": 1.1539180273643035, "learning_rate": 8.574747048347199e-06, "loss": 0.1664, "step": 2418 }, { "epoch": 0.9664536741214057, "grad_norm": 1.236448054190876, "learning_rate": 8.571496204729623e-06, "loss": 0.1818, "step": 2420 }, { "epoch": 0.9672523961661342, "grad_norm": 1.0633439918324645, "learning_rate": 8.568242275676304e-06, "loss": 0.1735, "step": 2422 }, { "epoch": 0.9680511182108626, "grad_norm": 1.1561922874116346, "learning_rate": 8.564985263998327e-06, "loss": 0.1651, "step": 2424 }, { "epoch": 0.9688498402555911, "grad_norm": 1.1398950472118397, "learning_rate": 8.561725172509444e-06, "loss": 0.163, "step": 2426 }, { "epoch": 0.9696485623003195, "grad_norm": 1.1653225900272541, "learning_rate": 8.558462004026065e-06, "loss": 0.173, "step": 2428 }, { "epoch": 0.9704472843450479, "grad_norm": 1.3275231148352606, "learning_rate": 8.555195761367263e-06, "loss": 0.1819, "step": 2430 }, { "epoch": 0.9712460063897763, "grad_norm": 1.0555861455966542, "learning_rate": 8.551926447354759e-06, "loss": 0.1699, "step": 2432 }, { "epoch": 0.9720447284345048, "grad_norm": 1.2506959109380367, "learning_rate": 8.548654064812934e-06, "loss": 0.1792, "step": 2434 }, { "epoch": 0.9728434504792333, "grad_norm": 1.1541294441379601, "learning_rate": 8.54537861656882e-06, "loss": 0.1796, "step": 2436 }, { "epoch": 0.9736421725239617, "grad_norm": 1.3106265759711455, "learning_rate": 8.542100105452093e-06, "loss": 0.1823, "step": 2438 }, { "epoch": 0.9744408945686901, "grad_norm": 1.2700540275591943, "learning_rate": 8.538818534295076e-06, "loss": 0.183, "step": 2440 }, { "epoch": 0.9752396166134185, "grad_norm": 1.4250215144969003, "learning_rate": 8.535533905932739e-06, "loss": 0.1865, "step": 2442 }, { "epoch": 0.9760383386581469, "grad_norm": 1.1840076787843286, "learning_rate": 8.532246223202689e-06, "loss": 0.1773, "step": 2444 }, { "epoch": 0.9768370607028753, "grad_norm": 1.0800090683302819, "learning_rate": 8.528955488945177e-06, "loss": 0.1762, "step": 2446 }, { "epoch": 0.9776357827476039, "grad_norm": 1.056788808510418, "learning_rate": 8.525661706003083e-06, "loss": 0.1752, "step": 2448 }, { "epoch": 0.9784345047923323, "grad_norm": 1.1120115317215207, "learning_rate": 8.522364877221926e-06, "loss": 0.1827, "step": 2450 }, { "epoch": 0.9792332268370607, "grad_norm": 1.2484051846053978, "learning_rate": 8.519065005449858e-06, "loss": 0.1744, "step": 2452 }, { "epoch": 0.9800319488817891, "grad_norm": 1.2862407811422434, "learning_rate": 8.515762093537654e-06, "loss": 0.1816, "step": 2454 }, { "epoch": 0.9808306709265175, "grad_norm": 1.1176065762239906, "learning_rate": 8.512456144338717e-06, "loss": 0.1779, "step": 2456 }, { "epoch": 0.981629392971246, "grad_norm": 1.1181750039143228, "learning_rate": 8.509147160709079e-06, "loss": 0.1791, "step": 2458 }, { "epoch": 0.9824281150159745, "grad_norm": 1.1233000113043177, "learning_rate": 8.505835145507387e-06, "loss": 0.1801, "step": 2460 }, { "epoch": 0.9832268370607029, "grad_norm": 1.149372977172484, "learning_rate": 8.502520101594909e-06, "loss": 0.1621, "step": 2462 }, { "epoch": 0.9840255591054313, "grad_norm": 1.1858307935499464, "learning_rate": 8.499202031835532e-06, "loss": 0.1832, "step": 2464 }, { "epoch": 0.9848242811501597, "grad_norm": 1.290001462169469, "learning_rate": 8.495880939095754e-06, "loss": 0.1894, "step": 2466 }, { "epoch": 0.9856230031948882, "grad_norm": 1.3581022131665044, "learning_rate": 8.492556826244687e-06, "loss": 0.2, "step": 2468 }, { "epoch": 0.9864217252396166, "grad_norm": 1.1056676733741198, "learning_rate": 8.489229696154049e-06, "loss": 0.1805, "step": 2470 }, { "epoch": 0.987220447284345, "grad_norm": 1.1250903817623754, "learning_rate": 8.485899551698166e-06, "loss": 0.177, "step": 2472 }, { "epoch": 0.9880191693290735, "grad_norm": 1.148392440744409, "learning_rate": 8.482566395753975e-06, "loss": 0.1867, "step": 2474 }, { "epoch": 0.9888178913738019, "grad_norm": 1.2558757559301705, "learning_rate": 8.479230231201001e-06, "loss": 0.1964, "step": 2476 }, { "epoch": 0.9896166134185304, "grad_norm": 1.2579913217677217, "learning_rate": 8.475891060921378e-06, "loss": 0.1829, "step": 2478 }, { "epoch": 0.9904153354632588, "grad_norm": 1.316707228945154, "learning_rate": 8.472548887799833e-06, "loss": 0.1696, "step": 2480 }, { "epoch": 0.9912140575079872, "grad_norm": 1.2049638106018488, "learning_rate": 8.46920371472369e-06, "loss": 0.1887, "step": 2482 }, { "epoch": 0.9920127795527156, "grad_norm": 1.2377015394107287, "learning_rate": 8.465855544582862e-06, "loss": 0.2014, "step": 2484 }, { "epoch": 0.9928115015974441, "grad_norm": 1.1268024008456778, "learning_rate": 8.462504380269853e-06, "loss": 0.179, "step": 2486 }, { "epoch": 0.9936102236421726, "grad_norm": 1.446057306444501, "learning_rate": 8.45915022467975e-06, "loss": 0.1918, "step": 2488 }, { "epoch": 0.994408945686901, "grad_norm": 1.186023464310668, "learning_rate": 8.455793080710231e-06, "loss": 0.1706, "step": 2490 }, { "epoch": 0.9952076677316294, "grad_norm": 1.2273194439241009, "learning_rate": 8.452432951261549e-06, "loss": 0.1857, "step": 2492 }, { "epoch": 0.9960063897763578, "grad_norm": 1.0809657844755842, "learning_rate": 8.44906983923654e-06, "loss": 0.1403, "step": 2494 }, { "epoch": 0.9968051118210862, "grad_norm": 1.1417636929002628, "learning_rate": 8.445703747540614e-06, "loss": 0.1727, "step": 2496 }, { "epoch": 0.9976038338658147, "grad_norm": 1.2435186121479649, "learning_rate": 8.442334679081757e-06, "loss": 0.1651, "step": 2498 }, { "epoch": 0.9984025559105432, "grad_norm": 1.261179386652203, "learning_rate": 8.438962636770528e-06, "loss": 0.1916, "step": 2500 }, { "epoch": 0.9984025559105432, "eval_loss": 0.1609409898519516, "eval_runtime": 418.1212, "eval_samples_per_second": 42.588, "eval_steps_per_second": 5.324, "step": 2500 }, { "epoch": 0.9992012779552716, "grad_norm": 1.2177062771646232, "learning_rate": 8.43558762352005e-06, "loss": 0.1956, "step": 2502 }, { "epoch": 1.0, "grad_norm": 1.129270940930756, "learning_rate": 8.43220964224602e-06, "loss": 0.1687, "step": 2504 }, { "epoch": 1.0007987220447285, "grad_norm": 0.827421022738481, "learning_rate": 8.428828695866694e-06, "loss": 0.1127, "step": 2506 }, { "epoch": 1.0015974440894568, "grad_norm": 0.8532426168374652, "learning_rate": 8.425444787302887e-06, "loss": 0.1097, "step": 2508 }, { "epoch": 1.0023961661341854, "grad_norm": 1.0387260637251956, "learning_rate": 8.422057919477984e-06, "loss": 0.1209, "step": 2510 }, { "epoch": 1.0031948881789137, "grad_norm": 0.9532301555359793, "learning_rate": 8.418668095317912e-06, "loss": 0.1124, "step": 2512 }, { "epoch": 1.0039936102236422, "grad_norm": 0.9749882933863255, "learning_rate": 8.415275317751163e-06, "loss": 0.1033, "step": 2514 }, { "epoch": 1.0047923322683705, "grad_norm": 1.215365298510418, "learning_rate": 8.411879589708775e-06, "loss": 0.1136, "step": 2516 }, { "epoch": 1.005591054313099, "grad_norm": 1.0938388152428302, "learning_rate": 8.408480914124338e-06, "loss": 0.1126, "step": 2518 }, { "epoch": 1.0063897763578276, "grad_norm": 1.1091853030022452, "learning_rate": 8.405079293933986e-06, "loss": 0.0985, "step": 2520 }, { "epoch": 1.0071884984025559, "grad_norm": 1.1309767039321081, "learning_rate": 8.401674732076399e-06, "loss": 0.1121, "step": 2522 }, { "epoch": 1.0079872204472844, "grad_norm": 1.2204646216277761, "learning_rate": 8.398267231492797e-06, "loss": 0.1276, "step": 2524 }, { "epoch": 1.0087859424920127, "grad_norm": 1.380795556523787, "learning_rate": 8.394856795126937e-06, "loss": 0.1039, "step": 2526 }, { "epoch": 1.0095846645367412, "grad_norm": 1.0922220217499492, "learning_rate": 8.391443425925118e-06, "loss": 0.1126, "step": 2528 }, { "epoch": 1.0103833865814698, "grad_norm": 0.990994836569366, "learning_rate": 8.388027126836168e-06, "loss": 0.1109, "step": 2530 }, { "epoch": 1.011182108626198, "grad_norm": 1.160502599618174, "learning_rate": 8.384607900811442e-06, "loss": 0.11, "step": 2532 }, { "epoch": 1.0119808306709266, "grad_norm": 1.0016564935445926, "learning_rate": 8.381185750804835e-06, "loss": 0.1007, "step": 2534 }, { "epoch": 1.012779552715655, "grad_norm": 1.0190577969031376, "learning_rate": 8.37776067977276e-06, "loss": 0.1095, "step": 2536 }, { "epoch": 1.0135782747603834, "grad_norm": 1.0372954792800193, "learning_rate": 8.374332690674151e-06, "loss": 0.1087, "step": 2538 }, { "epoch": 1.0143769968051117, "grad_norm": 1.0326525572009215, "learning_rate": 8.370901786470475e-06, "loss": 0.1023, "step": 2540 }, { "epoch": 1.0151757188498403, "grad_norm": 0.9829529420381988, "learning_rate": 8.367467970125702e-06, "loss": 0.1104, "step": 2542 }, { "epoch": 1.0159744408945688, "grad_norm": 1.1974618667877126, "learning_rate": 8.36403124460633e-06, "loss": 0.1052, "step": 2544 }, { "epoch": 1.016773162939297, "grad_norm": 1.13559909487521, "learning_rate": 8.360591612881363e-06, "loss": 0.1076, "step": 2546 }, { "epoch": 1.0175718849840256, "grad_norm": 1.1337615805939383, "learning_rate": 8.357149077922322e-06, "loss": 0.1179, "step": 2548 }, { "epoch": 1.018370607028754, "grad_norm": 1.1381630091831911, "learning_rate": 8.353703642703228e-06, "loss": 0.1082, "step": 2550 }, { "epoch": 1.0191693290734825, "grad_norm": 1.1081002212560531, "learning_rate": 8.350255310200611e-06, "loss": 0.1133, "step": 2552 }, { "epoch": 1.0199680511182108, "grad_norm": 1.012013673101353, "learning_rate": 8.346804083393511e-06, "loss": 0.1133, "step": 2554 }, { "epoch": 1.0207667731629393, "grad_norm": 1.0949479826249693, "learning_rate": 8.343349965263459e-06, "loss": 0.109, "step": 2556 }, { "epoch": 1.0215654952076678, "grad_norm": 1.4697029214526798, "learning_rate": 8.339892958794487e-06, "loss": 0.1223, "step": 2558 }, { "epoch": 1.0223642172523961, "grad_norm": 1.1256837730078384, "learning_rate": 8.336433066973122e-06, "loss": 0.1106, "step": 2560 }, { "epoch": 1.0231629392971247, "grad_norm": 1.0111334067086826, "learning_rate": 8.332970292788384e-06, "loss": 0.0992, "step": 2562 }, { "epoch": 1.023961661341853, "grad_norm": 1.1457290491628267, "learning_rate": 8.329504639231784e-06, "loss": 0.1055, "step": 2564 }, { "epoch": 1.0247603833865815, "grad_norm": 1.094415886169632, "learning_rate": 8.32603610929732e-06, "loss": 0.1233, "step": 2566 }, { "epoch": 1.0255591054313098, "grad_norm": 1.115090103670034, "learning_rate": 8.322564705981476e-06, "loss": 0.1027, "step": 2568 }, { "epoch": 1.0263578274760383, "grad_norm": 1.2411016808966997, "learning_rate": 8.319090432283213e-06, "loss": 0.1251, "step": 2570 }, { "epoch": 1.0271565495207668, "grad_norm": 1.161818122354369, "learning_rate": 8.315613291203977e-06, "loss": 0.1136, "step": 2572 }, { "epoch": 1.0279552715654952, "grad_norm": 1.010109815757735, "learning_rate": 8.312133285747693e-06, "loss": 0.1116, "step": 2574 }, { "epoch": 1.0287539936102237, "grad_norm": 1.0352536009528577, "learning_rate": 8.308650418920751e-06, "loss": 0.0985, "step": 2576 }, { "epoch": 1.029552715654952, "grad_norm": 1.0385517544202987, "learning_rate": 8.305164693732026e-06, "loss": 0.1085, "step": 2578 }, { "epoch": 1.0303514376996805, "grad_norm": 1.0075596705684045, "learning_rate": 8.301676113192853e-06, "loss": 0.1092, "step": 2580 }, { "epoch": 1.031150159744409, "grad_norm": 1.0700338942744492, "learning_rate": 8.298184680317035e-06, "loss": 0.1124, "step": 2582 }, { "epoch": 1.0319488817891374, "grad_norm": 1.0149219991197933, "learning_rate": 8.294690398120843e-06, "loss": 0.1121, "step": 2584 }, { "epoch": 1.0327476038338659, "grad_norm": 1.2039242543481947, "learning_rate": 8.291193269623005e-06, "loss": 0.1222, "step": 2586 }, { "epoch": 1.0335463258785942, "grad_norm": 1.2377862042067174, "learning_rate": 8.287693297844712e-06, "loss": 0.1145, "step": 2588 }, { "epoch": 1.0343450479233227, "grad_norm": 1.091238797708431, "learning_rate": 8.28419048580961e-06, "loss": 0.1155, "step": 2590 }, { "epoch": 1.035143769968051, "grad_norm": 1.078587618975172, "learning_rate": 8.280684836543794e-06, "loss": 0.1082, "step": 2592 }, { "epoch": 1.0359424920127795, "grad_norm": 1.0454759141708203, "learning_rate": 8.277176353075818e-06, "loss": 0.1056, "step": 2594 }, { "epoch": 1.036741214057508, "grad_norm": 1.168733586560627, "learning_rate": 8.27366503843668e-06, "loss": 0.1159, "step": 2596 }, { "epoch": 1.0375399361022364, "grad_norm": 0.9639822523537996, "learning_rate": 8.270150895659824e-06, "loss": 0.1141, "step": 2598 }, { "epoch": 1.038338658146965, "grad_norm": 1.080334562027545, "learning_rate": 8.266633927781135e-06, "loss": 0.097, "step": 2600 }, { "epoch": 1.0391373801916932, "grad_norm": 1.0212322282727473, "learning_rate": 8.263114137838946e-06, "loss": 0.1141, "step": 2602 }, { "epoch": 1.0399361022364217, "grad_norm": 1.0394948988809494, "learning_rate": 8.259591528874023e-06, "loss": 0.1101, "step": 2604 }, { "epoch": 1.04073482428115, "grad_norm": 1.3623022308744146, "learning_rate": 8.256066103929566e-06, "loss": 0.1273, "step": 2606 }, { "epoch": 1.0415335463258786, "grad_norm": 1.022587558449341, "learning_rate": 8.25253786605121e-06, "loss": 0.1124, "step": 2608 }, { "epoch": 1.042332268370607, "grad_norm": 0.986241541472163, "learning_rate": 8.249006818287017e-06, "loss": 0.1017, "step": 2610 }, { "epoch": 1.0431309904153354, "grad_norm": 1.1218445133529082, "learning_rate": 8.245472963687484e-06, "loss": 0.1051, "step": 2612 }, { "epoch": 1.043929712460064, "grad_norm": 0.9853007519672737, "learning_rate": 8.241936305305526e-06, "loss": 0.0933, "step": 2614 }, { "epoch": 1.0447284345047922, "grad_norm": 0.9238979931931091, "learning_rate": 8.238396846196483e-06, "loss": 0.0964, "step": 2616 }, { "epoch": 1.0455271565495208, "grad_norm": 1.145209666797251, "learning_rate": 8.23485458941811e-06, "loss": 0.1178, "step": 2618 }, { "epoch": 1.0463258785942493, "grad_norm": 1.128182977350629, "learning_rate": 8.231309538030586e-06, "loss": 0.1141, "step": 2620 }, { "epoch": 1.0471246006389776, "grad_norm": 0.9914083611784468, "learning_rate": 8.2277616950965e-06, "loss": 0.0989, "step": 2622 }, { "epoch": 1.0479233226837061, "grad_norm": 1.1620950439007605, "learning_rate": 8.224211063680854e-06, "loss": 0.1215, "step": 2624 }, { "epoch": 1.0487220447284344, "grad_norm": 1.1508342917411967, "learning_rate": 8.220657646851059e-06, "loss": 0.1205, "step": 2626 }, { "epoch": 1.049520766773163, "grad_norm": 1.2150567330620372, "learning_rate": 8.217101447676929e-06, "loss": 0.106, "step": 2628 }, { "epoch": 1.0503194888178913, "grad_norm": 1.1152102769811438, "learning_rate": 8.213542469230688e-06, "loss": 0.1223, "step": 2630 }, { "epoch": 1.0511182108626198, "grad_norm": 1.1807098190243748, "learning_rate": 8.209980714586955e-06, "loss": 0.1122, "step": 2632 }, { "epoch": 1.0519169329073483, "grad_norm": 1.1286829181729654, "learning_rate": 8.206416186822753e-06, "loss": 0.1101, "step": 2634 }, { "epoch": 1.0527156549520766, "grad_norm": 1.0697286341076933, "learning_rate": 8.202848889017494e-06, "loss": 0.1112, "step": 2636 }, { "epoch": 1.0535143769968052, "grad_norm": 1.0611961961878171, "learning_rate": 8.19927882425299e-06, "loss": 0.1079, "step": 2638 }, { "epoch": 1.0543130990415335, "grad_norm": 1.0124139526923805, "learning_rate": 8.195705995613436e-06, "loss": 0.0996, "step": 2640 }, { "epoch": 1.055111821086262, "grad_norm": 1.108195697612016, "learning_rate": 8.192130406185425e-06, "loss": 0.1107, "step": 2642 }, { "epoch": 1.0559105431309903, "grad_norm": 1.1111577119822706, "learning_rate": 8.188552059057924e-06, "loss": 0.116, "step": 2644 }, { "epoch": 1.0567092651757188, "grad_norm": 1.0697354896619498, "learning_rate": 8.184970957322287e-06, "loss": 0.1026, "step": 2646 }, { "epoch": 1.0575079872204474, "grad_norm": 1.1184255850296823, "learning_rate": 8.181387104072252e-06, "loss": 0.1008, "step": 2648 }, { "epoch": 1.0583067092651757, "grad_norm": 1.123703636746383, "learning_rate": 8.177800502403928e-06, "loss": 0.1234, "step": 2650 }, { "epoch": 1.0591054313099042, "grad_norm": 1.1145099095973394, "learning_rate": 8.1742111554158e-06, "loss": 0.1038, "step": 2652 }, { "epoch": 1.0599041533546325, "grad_norm": 1.0924462066716556, "learning_rate": 8.170619066208723e-06, "loss": 0.1026, "step": 2654 }, { "epoch": 1.060702875399361, "grad_norm": 1.275295631090924, "learning_rate": 8.167024237885927e-06, "loss": 0.1202, "step": 2656 }, { "epoch": 1.0615015974440896, "grad_norm": 1.001711382132048, "learning_rate": 8.163426673553002e-06, "loss": 0.1028, "step": 2658 }, { "epoch": 1.0623003194888179, "grad_norm": 1.090098993511894, "learning_rate": 8.159826376317906e-06, "loss": 0.1074, "step": 2660 }, { "epoch": 1.0630990415335464, "grad_norm": 1.0661454669618209, "learning_rate": 8.156223349290957e-06, "loss": 0.1101, "step": 2662 }, { "epoch": 1.0638977635782747, "grad_norm": 1.256680601891414, "learning_rate": 8.152617595584827e-06, "loss": 0.1085, "step": 2664 }, { "epoch": 1.0646964856230032, "grad_norm": 1.0814675764007797, "learning_rate": 8.149009118314549e-06, "loss": 0.1041, "step": 2666 }, { "epoch": 1.0654952076677315, "grad_norm": 1.055396058607972, "learning_rate": 8.145397920597505e-06, "loss": 0.1012, "step": 2668 }, { "epoch": 1.06629392971246, "grad_norm": 1.0902362525938407, "learning_rate": 8.141784005553433e-06, "loss": 0.1129, "step": 2670 }, { "epoch": 1.0670926517571886, "grad_norm": 1.040946925167894, "learning_rate": 8.138167376304411e-06, "loss": 0.1032, "step": 2672 }, { "epoch": 1.067891373801917, "grad_norm": 1.1293507367739193, "learning_rate": 8.134548035974868e-06, "loss": 0.1093, "step": 2674 }, { "epoch": 1.0686900958466454, "grad_norm": 1.1475487980987538, "learning_rate": 8.13092598769157e-06, "loss": 0.1168, "step": 2676 }, { "epoch": 1.0694888178913737, "grad_norm": 1.0688926041290217, "learning_rate": 8.127301234583627e-06, "loss": 0.1209, "step": 2678 }, { "epoch": 1.0702875399361023, "grad_norm": 1.1130827659156695, "learning_rate": 8.123673779782481e-06, "loss": 0.1112, "step": 2680 }, { "epoch": 1.0710862619808306, "grad_norm": 0.999458562659492, "learning_rate": 8.120043626421915e-06, "loss": 0.1036, "step": 2682 }, { "epoch": 1.071884984025559, "grad_norm": 1.1445057181311435, "learning_rate": 8.116410777638035e-06, "loss": 0.1088, "step": 2684 }, { "epoch": 1.0726837060702876, "grad_norm": 1.060266912369435, "learning_rate": 8.112775236569282e-06, "loss": 0.1024, "step": 2686 }, { "epoch": 1.073482428115016, "grad_norm": 1.0993996765043779, "learning_rate": 8.10913700635642e-06, "loss": 0.1037, "step": 2688 }, { "epoch": 1.0742811501597445, "grad_norm": 1.039132296487209, "learning_rate": 8.105496090142535e-06, "loss": 0.1075, "step": 2690 }, { "epoch": 1.0750798722044728, "grad_norm": 1.1082486440070727, "learning_rate": 8.101852491073036e-06, "loss": 0.105, "step": 2692 }, { "epoch": 1.0758785942492013, "grad_norm": 1.1651294916681243, "learning_rate": 8.098206212295652e-06, "loss": 0.1154, "step": 2694 }, { "epoch": 1.0766773162939298, "grad_norm": 1.172460162145474, "learning_rate": 8.094557256960419e-06, "loss": 0.116, "step": 2696 }, { "epoch": 1.0774760383386581, "grad_norm": 0.9459826583314704, "learning_rate": 8.090905628219693e-06, "loss": 0.1068, "step": 2698 }, { "epoch": 1.0782747603833867, "grad_norm": 1.0709333935677605, "learning_rate": 8.087251329228136e-06, "loss": 0.0917, "step": 2700 }, { "epoch": 1.079073482428115, "grad_norm": 1.1016086096260114, "learning_rate": 8.083594363142717e-06, "loss": 0.1144, "step": 2702 }, { "epoch": 1.0798722044728435, "grad_norm": 1.0018060470975783, "learning_rate": 8.079934733122708e-06, "loss": 0.1071, "step": 2704 }, { "epoch": 1.0806709265175718, "grad_norm": 1.145863183921058, "learning_rate": 8.07627244232969e-06, "loss": 0.1177, "step": 2706 }, { "epoch": 1.0814696485623003, "grad_norm": 1.0861499113619286, "learning_rate": 8.07260749392753e-06, "loss": 0.116, "step": 2708 }, { "epoch": 1.0822683706070289, "grad_norm": 1.0911649754162966, "learning_rate": 8.068939891082401e-06, "loss": 0.1092, "step": 2710 }, { "epoch": 1.0830670926517572, "grad_norm": 1.1624721618006877, "learning_rate": 8.065269636962765e-06, "loss": 0.1035, "step": 2712 }, { "epoch": 1.0838658146964857, "grad_norm": 1.0820597055920262, "learning_rate": 8.061596734739377e-06, "loss": 0.1005, "step": 2714 }, { "epoch": 1.084664536741214, "grad_norm": 1.1521338536239825, "learning_rate": 8.057921187585274e-06, "loss": 0.1021, "step": 2716 }, { "epoch": 1.0854632587859425, "grad_norm": 1.088468478053058, "learning_rate": 8.054242998675787e-06, "loss": 0.1133, "step": 2718 }, { "epoch": 1.0862619808306708, "grad_norm": 1.139917756342621, "learning_rate": 8.05056217118852e-06, "loss": 0.108, "step": 2720 }, { "epoch": 1.0870607028753994, "grad_norm": 1.128084026950435, "learning_rate": 8.046878708303363e-06, "loss": 0.1026, "step": 2722 }, { "epoch": 1.0878594249201279, "grad_norm": 1.1291139425698837, "learning_rate": 8.04319261320248e-06, "loss": 0.1095, "step": 2724 }, { "epoch": 1.0886581469648562, "grad_norm": 1.3447212908276018, "learning_rate": 8.039503889070306e-06, "loss": 0.1194, "step": 2726 }, { "epoch": 1.0894568690095847, "grad_norm": 1.0745254740908237, "learning_rate": 8.035812539093557e-06, "loss": 0.1093, "step": 2728 }, { "epoch": 1.090255591054313, "grad_norm": 1.0828953637676413, "learning_rate": 8.032118566461206e-06, "loss": 0.1108, "step": 2730 }, { "epoch": 1.0910543130990416, "grad_norm": 1.0548110887259554, "learning_rate": 8.0284219743645e-06, "loss": 0.1062, "step": 2732 }, { "epoch": 1.09185303514377, "grad_norm": 1.1071922672380121, "learning_rate": 8.024722765996944e-06, "loss": 0.1037, "step": 2734 }, { "epoch": 1.0926517571884984, "grad_norm": 1.230092777293863, "learning_rate": 8.021020944554305e-06, "loss": 0.1154, "step": 2736 }, { "epoch": 1.093450479233227, "grad_norm": 1.0630731902655604, "learning_rate": 8.017316513234607e-06, "loss": 0.1152, "step": 2738 }, { "epoch": 1.0942492012779552, "grad_norm": 1.1292429935560224, "learning_rate": 8.013609475238132e-06, "loss": 0.11, "step": 2740 }, { "epoch": 1.0950479233226837, "grad_norm": 1.1249867737292523, "learning_rate": 8.009899833767407e-06, "loss": 0.1032, "step": 2742 }, { "epoch": 1.095846645367412, "grad_norm": 1.430619873698458, "learning_rate": 8.006187592027215e-06, "loss": 0.1178, "step": 2744 }, { "epoch": 1.0966453674121406, "grad_norm": 1.110036751210643, "learning_rate": 8.002472753224582e-06, "loss": 0.1145, "step": 2746 }, { "epoch": 1.097444089456869, "grad_norm": 0.957513352594371, "learning_rate": 7.998755320568778e-06, "loss": 0.1018, "step": 2748 }, { "epoch": 1.0982428115015974, "grad_norm": 1.042054355253303, "learning_rate": 7.995035297271313e-06, "loss": 0.1013, "step": 2750 }, { "epoch": 1.099041533546326, "grad_norm": 1.1532599431868966, "learning_rate": 7.991312686545939e-06, "loss": 0.1173, "step": 2752 }, { "epoch": 1.0998402555910542, "grad_norm": 1.2036914344298868, "learning_rate": 7.987587491608636e-06, "loss": 0.1186, "step": 2754 }, { "epoch": 1.1006389776357828, "grad_norm": 1.082895054115626, "learning_rate": 7.983859715677627e-06, "loss": 0.1006, "step": 2756 }, { "epoch": 1.101437699680511, "grad_norm": 1.1041877219800416, "learning_rate": 7.980129361973355e-06, "loss": 0.1061, "step": 2758 }, { "epoch": 1.1022364217252396, "grad_norm": 1.0649445718677086, "learning_rate": 7.976396433718492e-06, "loss": 0.1034, "step": 2760 }, { "epoch": 1.1030351437699681, "grad_norm": 1.1926141701225625, "learning_rate": 7.97266093413794e-06, "loss": 0.1158, "step": 2762 }, { "epoch": 1.1038338658146964, "grad_norm": 1.2011616729379744, "learning_rate": 7.968922866458812e-06, "loss": 0.1166, "step": 2764 }, { "epoch": 1.104632587859425, "grad_norm": 1.1032014477011283, "learning_rate": 7.965182233910453e-06, "loss": 0.1079, "step": 2766 }, { "epoch": 1.1054313099041533, "grad_norm": 1.0491357869845093, "learning_rate": 7.961439039724413e-06, "loss": 0.1185, "step": 2768 }, { "epoch": 1.1062300319488818, "grad_norm": 1.1311466555151601, "learning_rate": 7.95769328713446e-06, "loss": 0.1182, "step": 2770 }, { "epoch": 1.1070287539936103, "grad_norm": 1.2237403558053657, "learning_rate": 7.953944979376567e-06, "loss": 0.1097, "step": 2772 }, { "epoch": 1.1078274760383386, "grad_norm": 1.2431901555713765, "learning_rate": 7.950194119688922e-06, "loss": 0.1329, "step": 2774 }, { "epoch": 1.1086261980830672, "grad_norm": 1.0880599521982486, "learning_rate": 7.946440711311913e-06, "loss": 0.1162, "step": 2776 }, { "epoch": 1.1094249201277955, "grad_norm": 1.1073519932980254, "learning_rate": 7.942684757488133e-06, "loss": 0.1027, "step": 2778 }, { "epoch": 1.110223642172524, "grad_norm": 0.9960392472144209, "learning_rate": 7.938926261462366e-06, "loss": 0.1002, "step": 2780 }, { "epoch": 1.1110223642172523, "grad_norm": 1.1223224889936787, "learning_rate": 7.935165226481603e-06, "loss": 0.1138, "step": 2782 }, { "epoch": 1.1118210862619808, "grad_norm": 1.0435684073325897, "learning_rate": 7.931401655795021e-06, "loss": 0.1074, "step": 2784 }, { "epoch": 1.1126198083067094, "grad_norm": 1.0106236333934397, "learning_rate": 7.92763555265399e-06, "loss": 0.0992, "step": 2786 }, { "epoch": 1.1134185303514377, "grad_norm": 1.2351704038049207, "learning_rate": 7.923866920312069e-06, "loss": 0.1091, "step": 2788 }, { "epoch": 1.1142172523961662, "grad_norm": 1.2422399561551012, "learning_rate": 7.920095762025e-06, "loss": 0.1189, "step": 2790 }, { "epoch": 1.1150159744408945, "grad_norm": 1.0941653259345385, "learning_rate": 7.916322081050708e-06, "loss": 0.1003, "step": 2792 }, { "epoch": 1.115814696485623, "grad_norm": 1.1760490829802384, "learning_rate": 7.912545880649299e-06, "loss": 0.1267, "step": 2794 }, { "epoch": 1.1166134185303513, "grad_norm": 1.158226709859037, "learning_rate": 7.90876716408305e-06, "loss": 0.0968, "step": 2796 }, { "epoch": 1.1174121405750799, "grad_norm": 1.1375503575879282, "learning_rate": 7.904985934616419e-06, "loss": 0.1077, "step": 2798 }, { "epoch": 1.1182108626198084, "grad_norm": 0.9183058494090953, "learning_rate": 7.90120219551603e-06, "loss": 0.1032, "step": 2800 }, { "epoch": 1.1190095846645367, "grad_norm": 1.0610420482924088, "learning_rate": 7.897415950050676e-06, "loss": 0.1111, "step": 2802 }, { "epoch": 1.1198083067092652, "grad_norm": 1.1233765060725236, "learning_rate": 7.893627201491319e-06, "loss": 0.0985, "step": 2804 }, { "epoch": 1.1206070287539935, "grad_norm": 1.0398157065300724, "learning_rate": 7.889835953111075e-06, "loss": 0.1164, "step": 2806 }, { "epoch": 1.121405750798722, "grad_norm": 1.0997762546624286, "learning_rate": 7.88604220818523e-06, "loss": 0.1097, "step": 2808 }, { "epoch": 1.1222044728434506, "grad_norm": 1.0435817236551026, "learning_rate": 7.88224596999122e-06, "loss": 0.1046, "step": 2810 }, { "epoch": 1.123003194888179, "grad_norm": 1.2695278909995464, "learning_rate": 7.878447241808634e-06, "loss": 0.1102, "step": 2812 }, { "epoch": 1.1238019169329074, "grad_norm": 1.081882759084368, "learning_rate": 7.874646026919219e-06, "loss": 0.114, "step": 2814 }, { "epoch": 1.1246006389776357, "grad_norm": 1.2277213100870545, "learning_rate": 7.870842328606863e-06, "loss": 0.118, "step": 2816 }, { "epoch": 1.1253993610223643, "grad_norm": 1.0784722478718025, "learning_rate": 7.867036150157603e-06, "loss": 0.1132, "step": 2818 }, { "epoch": 1.1261980830670926, "grad_norm": 1.1255206572769305, "learning_rate": 7.86322749485962e-06, "loss": 0.1101, "step": 2820 }, { "epoch": 1.126996805111821, "grad_norm": 1.162832702222223, "learning_rate": 7.85941636600323e-06, "loss": 0.1161, "step": 2822 }, { "epoch": 1.1277955271565494, "grad_norm": 1.1447345276638636, "learning_rate": 7.85560276688089e-06, "loss": 0.1162, "step": 2824 }, { "epoch": 1.128594249201278, "grad_norm": 1.187515171501569, "learning_rate": 7.851786700787191e-06, "loss": 0.1046, "step": 2826 }, { "epoch": 1.1293929712460065, "grad_norm": 1.1412349222191334, "learning_rate": 7.847968171018851e-06, "loss": 0.1097, "step": 2828 }, { "epoch": 1.1301916932907348, "grad_norm": 0.9891747869008474, "learning_rate": 7.844147180874723e-06, "loss": 0.0948, "step": 2830 }, { "epoch": 1.1309904153354633, "grad_norm": 1.140395014058789, "learning_rate": 7.84032373365578e-06, "loss": 0.1038, "step": 2832 }, { "epoch": 1.1317891373801916, "grad_norm": 1.0508986553012427, "learning_rate": 7.836497832665119e-06, "loss": 0.1091, "step": 2834 }, { "epoch": 1.1325878594249201, "grad_norm": 1.016036817043171, "learning_rate": 7.83266948120796e-06, "loss": 0.0973, "step": 2836 }, { "epoch": 1.1333865814696487, "grad_norm": 1.130667401388963, "learning_rate": 7.828838682591635e-06, "loss": 0.1207, "step": 2838 }, { "epoch": 1.134185303514377, "grad_norm": 1.039182888114013, "learning_rate": 7.825005440125595e-06, "loss": 0.106, "step": 2840 }, { "epoch": 1.1349840255591055, "grad_norm": 1.1559863889230508, "learning_rate": 7.821169757121395e-06, "loss": 0.1113, "step": 2842 }, { "epoch": 1.1357827476038338, "grad_norm": 1.0074524350682563, "learning_rate": 7.817331636892708e-06, "loss": 0.1171, "step": 2844 }, { "epoch": 1.1365814696485623, "grad_norm": 1.1004441537401717, "learning_rate": 7.813491082755306e-06, "loss": 0.0954, "step": 2846 }, { "epoch": 1.1373801916932909, "grad_norm": 1.0467946843112212, "learning_rate": 7.809648098027067e-06, "loss": 0.1098, "step": 2848 }, { "epoch": 1.1381789137380192, "grad_norm": 1.1629386911794455, "learning_rate": 7.805802686027965e-06, "loss": 0.1186, "step": 2850 }, { "epoch": 1.1389776357827477, "grad_norm": 1.0232023211944392, "learning_rate": 7.801954850080075e-06, "loss": 0.0925, "step": 2852 }, { "epoch": 1.139776357827476, "grad_norm": 1.2284293666842674, "learning_rate": 7.798104593507562e-06, "loss": 0.1137, "step": 2854 }, { "epoch": 1.1405750798722045, "grad_norm": 1.0439656778352415, "learning_rate": 7.794251919636687e-06, "loss": 0.1086, "step": 2856 }, { "epoch": 1.1413738019169328, "grad_norm": 1.038970880098722, "learning_rate": 7.790396831795792e-06, "loss": 0.101, "step": 2858 }, { "epoch": 1.1421725239616614, "grad_norm": 1.0250967712229946, "learning_rate": 7.786539333315316e-06, "loss": 0.0997, "step": 2860 }, { "epoch": 1.1429712460063897, "grad_norm": 1.2328372564490289, "learning_rate": 7.782679427527768e-06, "loss": 0.1113, "step": 2862 }, { "epoch": 1.1437699680511182, "grad_norm": 1.0417991972771192, "learning_rate": 7.778817117767748e-06, "loss": 0.1005, "step": 2864 }, { "epoch": 1.1445686900958467, "grad_norm": 1.0282768724124047, "learning_rate": 7.77495240737192e-06, "loss": 0.1025, "step": 2866 }, { "epoch": 1.145367412140575, "grad_norm": 1.0115415567641788, "learning_rate": 7.771085299679033e-06, "loss": 0.0994, "step": 2868 }, { "epoch": 1.1461661341853036, "grad_norm": 1.122551661496836, "learning_rate": 7.767215798029906e-06, "loss": 0.1009, "step": 2870 }, { "epoch": 1.1469648562300319, "grad_norm": 1.2689706066461852, "learning_rate": 7.76334390576742e-06, "loss": 0.117, "step": 2872 }, { "epoch": 1.1477635782747604, "grad_norm": 1.1324363654209109, "learning_rate": 7.759469626236523e-06, "loss": 0.1068, "step": 2874 }, { "epoch": 1.148562300319489, "grad_norm": 1.162099881311204, "learning_rate": 7.75559296278423e-06, "loss": 0.1149, "step": 2876 }, { "epoch": 1.1493610223642172, "grad_norm": 1.1144526194493574, "learning_rate": 7.751713918759612e-06, "loss": 0.1189, "step": 2878 }, { "epoch": 1.1501597444089458, "grad_norm": 1.0366214429764016, "learning_rate": 7.747832497513797e-06, "loss": 0.1143, "step": 2880 }, { "epoch": 1.150958466453674, "grad_norm": 1.033901501799641, "learning_rate": 7.743948702399966e-06, "loss": 0.1038, "step": 2882 }, { "epoch": 1.1517571884984026, "grad_norm": 1.091707818538779, "learning_rate": 7.740062536773352e-06, "loss": 0.1018, "step": 2884 }, { "epoch": 1.1525559105431311, "grad_norm": 1.086352546060504, "learning_rate": 7.736174003991234e-06, "loss": 0.113, "step": 2886 }, { "epoch": 1.1533546325878594, "grad_norm": 1.0324848912394144, "learning_rate": 7.732283107412938e-06, "loss": 0.1012, "step": 2888 }, { "epoch": 1.154153354632588, "grad_norm": 1.180351977551197, "learning_rate": 7.728389850399834e-06, "loss": 0.107, "step": 2890 }, { "epoch": 1.1549520766773163, "grad_norm": 1.1213498513689995, "learning_rate": 7.724494236315327e-06, "loss": 0.118, "step": 2892 }, { "epoch": 1.1557507987220448, "grad_norm": 1.1297732537347598, "learning_rate": 7.72059626852486e-06, "loss": 0.1052, "step": 2894 }, { "epoch": 1.156549520766773, "grad_norm": 1.1260273445244882, "learning_rate": 7.71669595039591e-06, "loss": 0.1231, "step": 2896 }, { "epoch": 1.1573482428115016, "grad_norm": 1.0804349790876964, "learning_rate": 7.712793285297982e-06, "loss": 0.102, "step": 2898 }, { "epoch": 1.15814696485623, "grad_norm": 1.0887409006629178, "learning_rate": 7.70888827660261e-06, "loss": 0.1038, "step": 2900 }, { "epoch": 1.1589456869009584, "grad_norm": 1.3253348724046994, "learning_rate": 7.704980927683359e-06, "loss": 0.1085, "step": 2902 }, { "epoch": 1.159744408945687, "grad_norm": 1.2297063221974824, "learning_rate": 7.701071241915804e-06, "loss": 0.1114, "step": 2904 }, { "epoch": 1.1605431309904153, "grad_norm": 1.1163333185767002, "learning_rate": 7.697159222677544e-06, "loss": 0.1065, "step": 2906 }, { "epoch": 1.1613418530351438, "grad_norm": 1.108863651299088, "learning_rate": 7.693244873348197e-06, "loss": 0.1128, "step": 2908 }, { "epoch": 1.1621405750798721, "grad_norm": 1.0477128725454634, "learning_rate": 7.689328197309394e-06, "loss": 0.1014, "step": 2910 }, { "epoch": 1.1629392971246006, "grad_norm": 1.1667151644208233, "learning_rate": 7.685409197944768e-06, "loss": 0.0998, "step": 2912 }, { "epoch": 1.1637380191693292, "grad_norm": 1.2632234446787278, "learning_rate": 7.681487878639968e-06, "loss": 0.1067, "step": 2914 }, { "epoch": 1.1645367412140575, "grad_norm": 1.288703128597724, "learning_rate": 7.677564242782645e-06, "loss": 0.1093, "step": 2916 }, { "epoch": 1.165335463258786, "grad_norm": 1.151244739656633, "learning_rate": 7.673638293762447e-06, "loss": 0.1086, "step": 2918 }, { "epoch": 1.1661341853035143, "grad_norm": 1.1676995737767575, "learning_rate": 7.669710034971025e-06, "loss": 0.0907, "step": 2920 }, { "epoch": 1.1669329073482428, "grad_norm": 1.0099832716415298, "learning_rate": 7.665779469802024e-06, "loss": 0.1039, "step": 2922 }, { "epoch": 1.1677316293929714, "grad_norm": 1.2160567960293138, "learning_rate": 7.661846601651081e-06, "loss": 0.1054, "step": 2924 }, { "epoch": 1.1685303514376997, "grad_norm": 1.2002285628920948, "learning_rate": 7.657911433915824e-06, "loss": 0.1236, "step": 2926 }, { "epoch": 1.1693290734824282, "grad_norm": 1.1011958816133838, "learning_rate": 7.653973969995866e-06, "loss": 0.1043, "step": 2928 }, { "epoch": 1.1701277955271565, "grad_norm": 1.1222330810291938, "learning_rate": 7.650034213292804e-06, "loss": 0.0964, "step": 2930 }, { "epoch": 1.170926517571885, "grad_norm": 1.191923146958323, "learning_rate": 7.646092167210217e-06, "loss": 0.1247, "step": 2932 }, { "epoch": 1.1717252396166133, "grad_norm": 1.1057104845426344, "learning_rate": 7.642147835153659e-06, "loss": 0.1151, "step": 2934 }, { "epoch": 1.1725239616613419, "grad_norm": 1.1144118547772344, "learning_rate": 7.638201220530664e-06, "loss": 0.106, "step": 2936 }, { "epoch": 1.1733226837060702, "grad_norm": 1.03554140015377, "learning_rate": 7.634252326750733e-06, "loss": 0.1093, "step": 2938 }, { "epoch": 1.1741214057507987, "grad_norm": 1.067085791357077, "learning_rate": 7.630301157225336e-06, "loss": 0.106, "step": 2940 }, { "epoch": 1.1749201277955272, "grad_norm": 1.0936909170630784, "learning_rate": 7.626347715367912e-06, "loss": 0.1063, "step": 2942 }, { "epoch": 1.1757188498402555, "grad_norm": 1.3484514118083, "learning_rate": 7.622392004593862e-06, "loss": 0.0992, "step": 2944 }, { "epoch": 1.176517571884984, "grad_norm": 1.169969792758724, "learning_rate": 7.618434028320547e-06, "loss": 0.1107, "step": 2946 }, { "epoch": 1.1773162939297124, "grad_norm": 1.0742999979059649, "learning_rate": 7.614473789967285e-06, "loss": 0.1039, "step": 2948 }, { "epoch": 1.178115015974441, "grad_norm": 1.1077968599944528, "learning_rate": 7.610511292955347e-06, "loss": 0.1006, "step": 2950 }, { "epoch": 1.1789137380191694, "grad_norm": 1.1624715070814888, "learning_rate": 7.60654654070796e-06, "loss": 0.0994, "step": 2952 }, { "epoch": 1.1797124600638977, "grad_norm": 1.0968086205411158, "learning_rate": 7.602579536650292e-06, "loss": 0.1119, "step": 2954 }, { "epoch": 1.1805111821086263, "grad_norm": 1.168735576631901, "learning_rate": 7.59861028420946e-06, "loss": 0.1136, "step": 2956 }, { "epoch": 1.1813099041533546, "grad_norm": 1.2384061026226614, "learning_rate": 7.594638786814526e-06, "loss": 0.1176, "step": 2958 }, { "epoch": 1.182108626198083, "grad_norm": 1.241137963433562, "learning_rate": 7.59066504789649e-06, "loss": 0.12, "step": 2960 }, { "epoch": 1.1829073482428114, "grad_norm": 1.1850341035866552, "learning_rate": 7.586689070888284e-06, "loss": 0.119, "step": 2962 }, { "epoch": 1.18370607028754, "grad_norm": 1.1220698860625842, "learning_rate": 7.58271085922478e-06, "loss": 0.1121, "step": 2964 }, { "epoch": 1.1845047923322685, "grad_norm": 1.037375641928384, "learning_rate": 7.578730416342777e-06, "loss": 0.1102, "step": 2966 }, { "epoch": 1.1853035143769968, "grad_norm": 1.2171636076950145, "learning_rate": 7.574747745681e-06, "loss": 0.1163, "step": 2968 }, { "epoch": 1.1861022364217253, "grad_norm": 1.1555647145995425, "learning_rate": 7.5707628506801015e-06, "loss": 0.1033, "step": 2970 }, { "epoch": 1.1869009584664536, "grad_norm": 1.3403187676618649, "learning_rate": 7.566775734782656e-06, "loss": 0.1066, "step": 2972 }, { "epoch": 1.1876996805111821, "grad_norm": 1.1035130819507677, "learning_rate": 7.562786401433156e-06, "loss": 0.1073, "step": 2974 }, { "epoch": 1.1884984025559104, "grad_norm": 1.0839124042096482, "learning_rate": 7.558794854078006e-06, "loss": 0.1101, "step": 2976 }, { "epoch": 1.189297124600639, "grad_norm": 1.019947468283778, "learning_rate": 7.5548010961655295e-06, "loss": 0.1033, "step": 2978 }, { "epoch": 1.1900958466453675, "grad_norm": 1.0753891042248978, "learning_rate": 7.5508051311459555e-06, "loss": 0.109, "step": 2980 }, { "epoch": 1.1908945686900958, "grad_norm": 1.0714175737276472, "learning_rate": 7.546806962471419e-06, "loss": 0.1017, "step": 2982 }, { "epoch": 1.1916932907348243, "grad_norm": 1.0997902745143622, "learning_rate": 7.542806593595963e-06, "loss": 0.106, "step": 2984 }, { "epoch": 1.1924920127795526, "grad_norm": 1.0225175483161848, "learning_rate": 7.538804027975528e-06, "loss": 0.1092, "step": 2986 }, { "epoch": 1.1932907348242812, "grad_norm": 1.107891680578376, "learning_rate": 7.534799269067952e-06, "loss": 0.1091, "step": 2988 }, { "epoch": 1.1940894568690097, "grad_norm": 1.0551747572162382, "learning_rate": 7.530792320332971e-06, "loss": 0.1088, "step": 2990 }, { "epoch": 1.194888178913738, "grad_norm": 1.0629259318761108, "learning_rate": 7.526783185232208e-06, "loss": 0.0937, "step": 2992 }, { "epoch": 1.1956869009584665, "grad_norm": 1.1933108946432194, "learning_rate": 7.522771867229179e-06, "loss": 0.108, "step": 2994 }, { "epoch": 1.1964856230031948, "grad_norm": 1.157609849649065, "learning_rate": 7.518758369789286e-06, "loss": 0.1192, "step": 2996 }, { "epoch": 1.1972843450479234, "grad_norm": 1.167506126596324, "learning_rate": 7.514742696379809e-06, "loss": 0.1096, "step": 2998 }, { "epoch": 1.1980830670926517, "grad_norm": 1.0333315229582538, "learning_rate": 7.51072485046991e-06, "loss": 0.1085, "step": 3000 }, { "epoch": 1.1980830670926517, "eval_loss": 0.16306817531585693, "eval_runtime": 417.6568, "eval_samples_per_second": 42.635, "eval_steps_per_second": 5.33, "step": 3000 }, { "epoch": 1.1988817891373802, "grad_norm": 1.192881477161516, "learning_rate": 7.5067048355306334e-06, "loss": 0.1082, "step": 3002 }, { "epoch": 1.1996805111821087, "grad_norm": 1.2452028368788162, "learning_rate": 7.50268265503489e-06, "loss": 0.1156, "step": 3004 }, { "epoch": 1.200479233226837, "grad_norm": 1.1510650440674393, "learning_rate": 7.498658312457464e-06, "loss": 0.1133, "step": 3006 }, { "epoch": 1.2012779552715656, "grad_norm": 1.0520484152103042, "learning_rate": 7.494631811275008e-06, "loss": 0.1118, "step": 3008 }, { "epoch": 1.2020766773162939, "grad_norm": 1.0970981862348477, "learning_rate": 7.49060315496604e-06, "loss": 0.1072, "step": 3010 }, { "epoch": 1.2028753993610224, "grad_norm": 1.1102889095542705, "learning_rate": 7.486572347010937e-06, "loss": 0.1056, "step": 3012 }, { "epoch": 1.2036741214057507, "grad_norm": 1.178700112165599, "learning_rate": 7.482539390891941e-06, "loss": 0.1153, "step": 3014 }, { "epoch": 1.2044728434504792, "grad_norm": 1.0483204125787329, "learning_rate": 7.478504290093138e-06, "loss": 0.0957, "step": 3016 }, { "epoch": 1.2052715654952078, "grad_norm": 1.1855590333372437, "learning_rate": 7.474467048100484e-06, "loss": 0.1165, "step": 3018 }, { "epoch": 1.206070287539936, "grad_norm": 1.132537958694911, "learning_rate": 7.470427668401766e-06, "loss": 0.1087, "step": 3020 }, { "epoch": 1.2068690095846646, "grad_norm": 1.2170415750175325, "learning_rate": 7.466386154486634e-06, "loss": 0.1147, "step": 3022 }, { "epoch": 1.207667731629393, "grad_norm": 1.1082959534882166, "learning_rate": 7.462342509846571e-06, "loss": 0.1048, "step": 3024 }, { "epoch": 1.2084664536741214, "grad_norm": 1.1141000146197995, "learning_rate": 7.458296737974905e-06, "loss": 0.1132, "step": 3026 }, { "epoch": 1.20926517571885, "grad_norm": 1.0298899347940416, "learning_rate": 7.4542488423668005e-06, "loss": 0.1064, "step": 3028 }, { "epoch": 1.2100638977635783, "grad_norm": 1.083163833346684, "learning_rate": 7.450198826519259e-06, "loss": 0.1123, "step": 3030 }, { "epoch": 1.2108626198083068, "grad_norm": 1.0762705695988668, "learning_rate": 7.446146693931111e-06, "loss": 0.1008, "step": 3032 }, { "epoch": 1.211661341853035, "grad_norm": 1.1867201378486008, "learning_rate": 7.442092448103019e-06, "loss": 0.1256, "step": 3034 }, { "epoch": 1.2124600638977636, "grad_norm": 1.1714410675076814, "learning_rate": 7.438036092537465e-06, "loss": 0.1039, "step": 3036 }, { "epoch": 1.213258785942492, "grad_norm": 1.0370675696738088, "learning_rate": 7.43397763073876e-06, "loss": 0.1039, "step": 3038 }, { "epoch": 1.2140575079872205, "grad_norm": 1.097720581265046, "learning_rate": 7.42991706621303e-06, "loss": 0.1143, "step": 3040 }, { "epoch": 1.2148562300319488, "grad_norm": 0.9975002092372904, "learning_rate": 7.4258544024682245e-06, "loss": 0.1039, "step": 3042 }, { "epoch": 1.2156549520766773, "grad_norm": 1.032389232375386, "learning_rate": 7.421789643014096e-06, "loss": 0.1029, "step": 3044 }, { "epoch": 1.2164536741214058, "grad_norm": 1.159406643141239, "learning_rate": 7.417722791362216e-06, "loss": 0.121, "step": 3046 }, { "epoch": 1.2172523961661341, "grad_norm": 1.075616311885778, "learning_rate": 7.413653851025959e-06, "loss": 0.1027, "step": 3048 }, { "epoch": 1.2180511182108626, "grad_norm": 1.0677123331182898, "learning_rate": 7.4095828255205085e-06, "loss": 0.0933, "step": 3050 }, { "epoch": 1.218849840255591, "grad_norm": 1.1351445749769558, "learning_rate": 7.405509718362842e-06, "loss": 0.1028, "step": 3052 }, { "epoch": 1.2196485623003195, "grad_norm": 1.218634180761916, "learning_rate": 7.401434533071745e-06, "loss": 0.1146, "step": 3054 }, { "epoch": 1.220447284345048, "grad_norm": 1.1682846848481725, "learning_rate": 7.397357273167789e-06, "loss": 0.1014, "step": 3056 }, { "epoch": 1.2212460063897763, "grad_norm": 1.1641572787088017, "learning_rate": 7.393277942173345e-06, "loss": 0.1065, "step": 3058 }, { "epoch": 1.2220447284345048, "grad_norm": 1.1514146815699455, "learning_rate": 7.389196543612567e-06, "loss": 0.1014, "step": 3060 }, { "epoch": 1.2228434504792332, "grad_norm": 1.330134394264511, "learning_rate": 7.3851130810113995e-06, "loss": 0.118, "step": 3062 }, { "epoch": 1.2236421725239617, "grad_norm": 1.1828426669934808, "learning_rate": 7.381027557897568e-06, "loss": 0.1054, "step": 3064 }, { "epoch": 1.2244408945686902, "grad_norm": 1.1440466876810054, "learning_rate": 7.376939977800581e-06, "loss": 0.1157, "step": 3066 }, { "epoch": 1.2252396166134185, "grad_norm": 1.1022139516494007, "learning_rate": 7.372850344251722e-06, "loss": 0.0968, "step": 3068 }, { "epoch": 1.226038338658147, "grad_norm": 1.069906203066803, "learning_rate": 7.368758660784048e-06, "loss": 0.1154, "step": 3070 }, { "epoch": 1.2268370607028753, "grad_norm": 1.066501858996855, "learning_rate": 7.364664930932385e-06, "loss": 0.1114, "step": 3072 }, { "epoch": 1.2276357827476039, "grad_norm": 1.090807997716351, "learning_rate": 7.360569158233332e-06, "loss": 0.1058, "step": 3074 }, { "epoch": 1.2284345047923322, "grad_norm": 1.1653540017717812, "learning_rate": 7.356471346225249e-06, "loss": 0.1141, "step": 3076 }, { "epoch": 1.2292332268370607, "grad_norm": 1.2183346386496956, "learning_rate": 7.352371498448261e-06, "loss": 0.1086, "step": 3078 }, { "epoch": 1.230031948881789, "grad_norm": 1.1495635595373217, "learning_rate": 7.348269618444248e-06, "loss": 0.1128, "step": 3080 }, { "epoch": 1.2308306709265175, "grad_norm": 1.0657147313743496, "learning_rate": 7.344165709756847e-06, "loss": 0.1087, "step": 3082 }, { "epoch": 1.231629392971246, "grad_norm": 1.1350112419460496, "learning_rate": 7.340059775931449e-06, "loss": 0.1121, "step": 3084 }, { "epoch": 1.2324281150159744, "grad_norm": 1.1125316251538973, "learning_rate": 7.33595182051519e-06, "loss": 0.1112, "step": 3086 }, { "epoch": 1.233226837060703, "grad_norm": 1.1893400382987338, "learning_rate": 7.331841847056962e-06, "loss": 0.1047, "step": 3088 }, { "epoch": 1.2340255591054312, "grad_norm": 1.2177476387914874, "learning_rate": 7.3277298591073895e-06, "loss": 0.1136, "step": 3090 }, { "epoch": 1.2348242811501597, "grad_norm": 1.0631740360903161, "learning_rate": 7.323615860218844e-06, "loss": 0.1144, "step": 3092 }, { "epoch": 1.2356230031948883, "grad_norm": 1.092461962405361, "learning_rate": 7.319499853945431e-06, "loss": 0.0995, "step": 3094 }, { "epoch": 1.2364217252396166, "grad_norm": 1.0890967378843055, "learning_rate": 7.315381843842995e-06, "loss": 0.1033, "step": 3096 }, { "epoch": 1.237220447284345, "grad_norm": 1.2131460595478127, "learning_rate": 7.3112618334691035e-06, "loss": 0.1159, "step": 3098 }, { "epoch": 1.2380191693290734, "grad_norm": 1.2188104551185952, "learning_rate": 7.307139826383058e-06, "loss": 0.1053, "step": 3100 }, { "epoch": 1.238817891373802, "grad_norm": 1.0889591234624394, "learning_rate": 7.303015826145886e-06, "loss": 0.1012, "step": 3102 }, { "epoch": 1.2396166134185305, "grad_norm": 1.1040750999295241, "learning_rate": 7.298889836320334e-06, "loss": 0.1013, "step": 3104 }, { "epoch": 1.2404153354632588, "grad_norm": 1.100669836825261, "learning_rate": 7.294761860470866e-06, "loss": 0.1083, "step": 3106 }, { "epoch": 1.2412140575079873, "grad_norm": 1.1773345051128925, "learning_rate": 7.290631902163665e-06, "loss": 0.11, "step": 3108 }, { "epoch": 1.2420127795527156, "grad_norm": 1.2401290593387904, "learning_rate": 7.286499964966625e-06, "loss": 0.1203, "step": 3110 }, { "epoch": 1.2428115015974441, "grad_norm": 1.1482156063703826, "learning_rate": 7.282366052449351e-06, "loss": 0.1122, "step": 3112 }, { "epoch": 1.2436102236421724, "grad_norm": 1.4107571214662287, "learning_rate": 7.278230168183152e-06, "loss": 0.1183, "step": 3114 }, { "epoch": 1.244408945686901, "grad_norm": 1.0325520625453315, "learning_rate": 7.274092315741042e-06, "loss": 0.1019, "step": 3116 }, { "epoch": 1.2452076677316293, "grad_norm": 1.1107905673766356, "learning_rate": 7.269952498697734e-06, "loss": 0.1051, "step": 3118 }, { "epoch": 1.2460063897763578, "grad_norm": 1.1281556579434084, "learning_rate": 7.265810720629643e-06, "loss": 0.1187, "step": 3120 }, { "epoch": 1.2468051118210863, "grad_norm": 1.097863022792409, "learning_rate": 7.261666985114871e-06, "loss": 0.1172, "step": 3122 }, { "epoch": 1.2476038338658146, "grad_norm": 1.1043922887663058, "learning_rate": 7.257521295733214e-06, "loss": 0.0985, "step": 3124 }, { "epoch": 1.2484025559105432, "grad_norm": 1.1286717401317095, "learning_rate": 7.253373656066159e-06, "loss": 0.1152, "step": 3126 }, { "epoch": 1.2492012779552715, "grad_norm": 1.1389798339479362, "learning_rate": 7.249224069696876e-06, "loss": 0.1098, "step": 3128 }, { "epoch": 1.25, "grad_norm": 1.1580209218201918, "learning_rate": 7.245072540210213e-06, "loss": 0.1199, "step": 3130 }, { "epoch": 1.2507987220447285, "grad_norm": 1.019767242278647, "learning_rate": 7.2409190711927015e-06, "loss": 0.0945, "step": 3132 }, { "epoch": 1.2515974440894568, "grad_norm": 1.0864399680532384, "learning_rate": 7.236763666232546e-06, "loss": 0.1001, "step": 3134 }, { "epoch": 1.2523961661341854, "grad_norm": 1.261597038834277, "learning_rate": 7.232606328919627e-06, "loss": 0.1046, "step": 3136 }, { "epoch": 1.2531948881789137, "grad_norm": 1.1146593566415728, "learning_rate": 7.228447062845487e-06, "loss": 0.1201, "step": 3138 }, { "epoch": 1.2539936102236422, "grad_norm": 1.2339789575226763, "learning_rate": 7.224285871603344e-06, "loss": 0.1071, "step": 3140 }, { "epoch": 1.2547923322683707, "grad_norm": 1.1413529281364962, "learning_rate": 7.2201227587880704e-06, "loss": 0.1157, "step": 3142 }, { "epoch": 1.255591054313099, "grad_norm": 1.1321943139020432, "learning_rate": 7.215957727996208e-06, "loss": 0.1136, "step": 3144 }, { "epoch": 1.2563897763578276, "grad_norm": 1.1035881377949015, "learning_rate": 7.211790782825945e-06, "loss": 0.112, "step": 3146 }, { "epoch": 1.2571884984025559, "grad_norm": 1.1454443734559139, "learning_rate": 7.207621926877133e-06, "loss": 0.107, "step": 3148 }, { "epoch": 1.2579872204472844, "grad_norm": 1.0366769144913708, "learning_rate": 7.203451163751268e-06, "loss": 0.1025, "step": 3150 }, { "epoch": 1.2587859424920127, "grad_norm": 1.1952040379853797, "learning_rate": 7.199278497051498e-06, "loss": 0.1259, "step": 3152 }, { "epoch": 1.2595846645367412, "grad_norm": 1.1719876875826358, "learning_rate": 7.195103930382609e-06, "loss": 0.1102, "step": 3154 }, { "epoch": 1.2603833865814695, "grad_norm": 1.2526525796976853, "learning_rate": 7.190927467351037e-06, "loss": 0.1219, "step": 3156 }, { "epoch": 1.261182108626198, "grad_norm": 1.047570651725447, "learning_rate": 7.186749111564852e-06, "loss": 0.099, "step": 3158 }, { "epoch": 1.2619808306709266, "grad_norm": 1.0251728815129595, "learning_rate": 7.182568866633757e-06, "loss": 0.1129, "step": 3160 }, { "epoch": 1.262779552715655, "grad_norm": 1.0528626625673922, "learning_rate": 7.178386736169087e-06, "loss": 0.0972, "step": 3162 }, { "epoch": 1.2635782747603834, "grad_norm": 1.1434985715089727, "learning_rate": 7.174202723783815e-06, "loss": 0.1108, "step": 3164 }, { "epoch": 1.2643769968051117, "grad_norm": 1.053836775418805, "learning_rate": 7.170016833092526e-06, "loss": 0.111, "step": 3166 }, { "epoch": 1.2651757188498403, "grad_norm": 1.1472854273249073, "learning_rate": 7.16582906771144e-06, "loss": 0.1101, "step": 3168 }, { "epoch": 1.2659744408945688, "grad_norm": 1.019394721627375, "learning_rate": 7.161639431258387e-06, "loss": 0.1082, "step": 3170 }, { "epoch": 1.266773162939297, "grad_norm": 1.1497720973804415, "learning_rate": 7.157447927352821e-06, "loss": 0.1038, "step": 3172 }, { "epoch": 1.2675718849840256, "grad_norm": 0.931060363409149, "learning_rate": 7.153254559615802e-06, "loss": 0.0934, "step": 3174 }, { "epoch": 1.268370607028754, "grad_norm": 1.1003773790175626, "learning_rate": 7.149059331670009e-06, "loss": 0.1116, "step": 3176 }, { "epoch": 1.2691693290734825, "grad_norm": 0.9906365881640852, "learning_rate": 7.144862247139716e-06, "loss": 0.1048, "step": 3178 }, { "epoch": 1.269968051118211, "grad_norm": 1.1404557162820228, "learning_rate": 7.140663309650817e-06, "loss": 0.1132, "step": 3180 }, { "epoch": 1.2707667731629393, "grad_norm": 1.2239606071851836, "learning_rate": 7.1364625228307915e-06, "loss": 0.1124, "step": 3182 }, { "epoch": 1.2715654952076676, "grad_norm": 1.1186497681482894, "learning_rate": 7.132259890308726e-06, "loss": 0.1072, "step": 3184 }, { "epoch": 1.2723642172523961, "grad_norm": 1.1465574716494185, "learning_rate": 7.128055415715295e-06, "loss": 0.1034, "step": 3186 }, { "epoch": 1.2731629392971247, "grad_norm": 1.1631071186812576, "learning_rate": 7.123849102682772e-06, "loss": 0.1104, "step": 3188 }, { "epoch": 1.273961661341853, "grad_norm": 0.9901243870652381, "learning_rate": 7.119640954845011e-06, "loss": 0.0994, "step": 3190 }, { "epoch": 1.2747603833865815, "grad_norm": 1.085814944090195, "learning_rate": 7.115430975837457e-06, "loss": 0.1105, "step": 3192 }, { "epoch": 1.2755591054313098, "grad_norm": 1.2215853050594954, "learning_rate": 7.111219169297134e-06, "loss": 0.1054, "step": 3194 }, { "epoch": 1.2763578274760383, "grad_norm": 1.062258920330732, "learning_rate": 7.107005538862647e-06, "loss": 0.1087, "step": 3196 }, { "epoch": 1.2771565495207668, "grad_norm": 1.143873543200049, "learning_rate": 7.102790088174172e-06, "loss": 0.1098, "step": 3198 }, { "epoch": 1.2779552715654952, "grad_norm": 0.8890153264482474, "learning_rate": 7.098572820873461e-06, "loss": 0.096, "step": 3200 }, { "epoch": 1.2787539936102237, "grad_norm": 1.0763979750774153, "learning_rate": 7.0943537406038385e-06, "loss": 0.104, "step": 3202 }, { "epoch": 1.279552715654952, "grad_norm": 1.005382468063005, "learning_rate": 7.09013285101019e-06, "loss": 0.0956, "step": 3204 }, { "epoch": 1.2803514376996805, "grad_norm": 1.0144396802490765, "learning_rate": 7.085910155738964e-06, "loss": 0.0964, "step": 3206 }, { "epoch": 1.281150159744409, "grad_norm": 1.1536744431519097, "learning_rate": 7.081685658438173e-06, "loss": 0.127, "step": 3208 }, { "epoch": 1.2819488817891374, "grad_norm": 1.1286638022231128, "learning_rate": 7.0774593627573815e-06, "loss": 0.1083, "step": 3210 }, { "epoch": 1.2827476038338659, "grad_norm": 1.0529407867194136, "learning_rate": 7.073231272347714e-06, "loss": 0.1067, "step": 3212 }, { "epoch": 1.2835463258785942, "grad_norm": 1.0573840208402359, "learning_rate": 7.069001390861838e-06, "loss": 0.0897, "step": 3214 }, { "epoch": 1.2843450479233227, "grad_norm": 1.1598664588481753, "learning_rate": 7.064769721953975e-06, "loss": 0.1103, "step": 3216 }, { "epoch": 1.2851437699680512, "grad_norm": 1.2632526669395345, "learning_rate": 7.060536269279887e-06, "loss": 0.119, "step": 3218 }, { "epoch": 1.2859424920127795, "grad_norm": 1.0466455925316702, "learning_rate": 7.056301036496875e-06, "loss": 0.0984, "step": 3220 }, { "epoch": 1.2867412140575079, "grad_norm": 1.0228282332371574, "learning_rate": 7.052064027263785e-06, "loss": 0.0896, "step": 3222 }, { "epoch": 1.2875399361022364, "grad_norm": 1.1687594953534266, "learning_rate": 7.047825245240989e-06, "loss": 0.1064, "step": 3224 }, { "epoch": 1.288338658146965, "grad_norm": 1.1265735374171337, "learning_rate": 7.0435846940903974e-06, "loss": 0.1104, "step": 3226 }, { "epoch": 1.2891373801916932, "grad_norm": 1.070111795962378, "learning_rate": 7.039342377475444e-06, "loss": 0.1072, "step": 3228 }, { "epoch": 1.2899361022364217, "grad_norm": 1.2165890989586463, "learning_rate": 7.035098299061094e-06, "loss": 0.1192, "step": 3230 }, { "epoch": 1.29073482428115, "grad_norm": 1.1082530581997079, "learning_rate": 7.030852462513827e-06, "loss": 0.1065, "step": 3232 }, { "epoch": 1.2915335463258786, "grad_norm": 1.124650039405341, "learning_rate": 7.026604871501647e-06, "loss": 0.1045, "step": 3234 }, { "epoch": 1.292332268370607, "grad_norm": 1.2186986975548504, "learning_rate": 7.02235552969407e-06, "loss": 0.1142, "step": 3236 }, { "epoch": 1.2931309904153354, "grad_norm": 0.991464353367416, "learning_rate": 7.018104440762128e-06, "loss": 0.0919, "step": 3238 }, { "epoch": 1.293929712460064, "grad_norm": 1.020705207223098, "learning_rate": 7.013851608378359e-06, "loss": 0.0978, "step": 3240 }, { "epoch": 1.2947284345047922, "grad_norm": 1.0373213189747952, "learning_rate": 7.009597036216813e-06, "loss": 0.1088, "step": 3242 }, { "epoch": 1.2955271565495208, "grad_norm": 0.9464480336921676, "learning_rate": 7.005340727953035e-06, "loss": 0.0953, "step": 3244 }, { "epoch": 1.2963258785942493, "grad_norm": 1.0939229933892893, "learning_rate": 7.001082687264075e-06, "loss": 0.1079, "step": 3246 }, { "epoch": 1.2971246006389776, "grad_norm": 1.2560358384718262, "learning_rate": 6.9968229178284775e-06, "loss": 0.1165, "step": 3248 }, { "epoch": 1.2979233226837061, "grad_norm": 1.2448345478431648, "learning_rate": 6.992561423326284e-06, "loss": 0.1002, "step": 3250 }, { "epoch": 1.2987220447284344, "grad_norm": 1.1712943475606261, "learning_rate": 6.988298207439022e-06, "loss": 0.0989, "step": 3252 }, { "epoch": 1.299520766773163, "grad_norm": 1.1773991585755725, "learning_rate": 6.9840332738497065e-06, "loss": 0.1133, "step": 3254 }, { "epoch": 1.3003194888178915, "grad_norm": 1.1116582831985906, "learning_rate": 6.979766626242839e-06, "loss": 0.1142, "step": 3256 }, { "epoch": 1.3011182108626198, "grad_norm": 1.1586326246366871, "learning_rate": 6.975498268304401e-06, "loss": 0.1125, "step": 3258 }, { "epoch": 1.3019169329073481, "grad_norm": 1.0830604429290178, "learning_rate": 6.971228203721849e-06, "loss": 0.1086, "step": 3260 }, { "epoch": 1.3027156549520766, "grad_norm": 1.1206997334831217, "learning_rate": 6.96695643618412e-06, "loss": 0.1124, "step": 3262 }, { "epoch": 1.3035143769968052, "grad_norm": 0.967514162516545, "learning_rate": 6.9626829693816135e-06, "loss": 0.1125, "step": 3264 }, { "epoch": 1.3043130990415335, "grad_norm": 0.97337749177029, "learning_rate": 6.958407807006205e-06, "loss": 0.1037, "step": 3266 }, { "epoch": 1.305111821086262, "grad_norm": 1.1764350961941878, "learning_rate": 6.954130952751228e-06, "loss": 0.1131, "step": 3268 }, { "epoch": 1.3059105431309903, "grad_norm": 0.9892258025188561, "learning_rate": 6.949852410311484e-06, "loss": 0.1034, "step": 3270 }, { "epoch": 1.3067092651757188, "grad_norm": 1.0360171336900805, "learning_rate": 6.945572183383229e-06, "loss": 0.1067, "step": 3272 }, { "epoch": 1.3075079872204474, "grad_norm": 1.2410869037730936, "learning_rate": 6.941290275664175e-06, "loss": 0.1053, "step": 3274 }, { "epoch": 1.3083067092651757, "grad_norm": 1.0698597118994493, "learning_rate": 6.9370066908534875e-06, "loss": 0.1002, "step": 3276 }, { "epoch": 1.3091054313099042, "grad_norm": 1.0720761674526624, "learning_rate": 6.932721432651779e-06, "loss": 0.1093, "step": 3278 }, { "epoch": 1.3099041533546325, "grad_norm": 1.1243594373271324, "learning_rate": 6.928434504761106e-06, "loss": 0.1219, "step": 3280 }, { "epoch": 1.310702875399361, "grad_norm": 1.1859750342924074, "learning_rate": 6.924145910884972e-06, "loss": 0.109, "step": 3282 }, { "epoch": 1.3115015974440896, "grad_norm": 1.0930001670477891, "learning_rate": 6.919855654728317e-06, "loss": 0.108, "step": 3284 }, { "epoch": 1.3123003194888179, "grad_norm": 0.9985564810729535, "learning_rate": 6.9155637399975196e-06, "loss": 0.0973, "step": 3286 }, { "epoch": 1.3130990415335464, "grad_norm": 1.056704078140189, "learning_rate": 6.911270170400385e-06, "loss": 0.0985, "step": 3288 }, { "epoch": 1.3138977635782747, "grad_norm": 1.2319666270989271, "learning_rate": 6.9069749496461555e-06, "loss": 0.112, "step": 3290 }, { "epoch": 1.3146964856230032, "grad_norm": 1.1376694239915568, "learning_rate": 6.902678081445495e-06, "loss": 0.1048, "step": 3292 }, { "epoch": 1.3154952076677318, "grad_norm": 1.169749984507156, "learning_rate": 6.898379569510491e-06, "loss": 0.1021, "step": 3294 }, { "epoch": 1.31629392971246, "grad_norm": 1.223585035436731, "learning_rate": 6.894079417554657e-06, "loss": 0.1194, "step": 3296 }, { "epoch": 1.3170926517571884, "grad_norm": 1.19066097682282, "learning_rate": 6.889777629292914e-06, "loss": 0.1184, "step": 3298 }, { "epoch": 1.317891373801917, "grad_norm": 1.0965120097047536, "learning_rate": 6.885474208441602e-06, "loss": 0.0979, "step": 3300 }, { "epoch": 1.3186900958466454, "grad_norm": 1.0211040534374387, "learning_rate": 6.881169158718474e-06, "loss": 0.1081, "step": 3302 }, { "epoch": 1.3194888178913737, "grad_norm": 1.017263749768144, "learning_rate": 6.8768624838426815e-06, "loss": 0.0988, "step": 3304 }, { "epoch": 1.3202875399361023, "grad_norm": 1.0524568085425485, "learning_rate": 6.872554187534788e-06, "loss": 0.1032, "step": 3306 }, { "epoch": 1.3210862619808306, "grad_norm": 1.0173821412579143, "learning_rate": 6.868244273516755e-06, "loss": 0.1045, "step": 3308 }, { "epoch": 1.321884984025559, "grad_norm": 1.092162092574483, "learning_rate": 6.863932745511942e-06, "loss": 0.1127, "step": 3310 }, { "epoch": 1.3226837060702876, "grad_norm": 1.1738686449026956, "learning_rate": 6.859619607245102e-06, "loss": 0.1048, "step": 3312 }, { "epoch": 1.323482428115016, "grad_norm": 1.1698036650478054, "learning_rate": 6.855304862442379e-06, "loss": 0.109, "step": 3314 }, { "epoch": 1.3242811501597445, "grad_norm": 1.0611253385279913, "learning_rate": 6.850988514831304e-06, "loss": 0.1036, "step": 3316 }, { "epoch": 1.3250798722044728, "grad_norm": 1.180905968110651, "learning_rate": 6.846670568140797e-06, "loss": 0.1082, "step": 3318 }, { "epoch": 1.3258785942492013, "grad_norm": 1.054233295900519, "learning_rate": 6.842351026101155e-06, "loss": 0.1059, "step": 3320 }, { "epoch": 1.3266773162939298, "grad_norm": 1.080658997418189, "learning_rate": 6.838029892444056e-06, "loss": 0.1072, "step": 3322 }, { "epoch": 1.3274760383386581, "grad_norm": 1.0388885899845177, "learning_rate": 6.833707170902551e-06, "loss": 0.1042, "step": 3324 }, { "epoch": 1.3282747603833867, "grad_norm": 1.0681215628181655, "learning_rate": 6.829382865211063e-06, "loss": 0.1012, "step": 3326 }, { "epoch": 1.329073482428115, "grad_norm": 0.9880264033407431, "learning_rate": 6.825056979105382e-06, "loss": 0.1094, "step": 3328 }, { "epoch": 1.3298722044728435, "grad_norm": 1.0627754491783672, "learning_rate": 6.820729516322671e-06, "loss": 0.1045, "step": 3330 }, { "epoch": 1.330670926517572, "grad_norm": 0.9621593864476723, "learning_rate": 6.816400480601445e-06, "loss": 0.0988, "step": 3332 }, { "epoch": 1.3314696485623003, "grad_norm": 1.0320888666137535, "learning_rate": 6.812069875681585e-06, "loss": 0.0948, "step": 3334 }, { "epoch": 1.3322683706070286, "grad_norm": 1.0860752731846834, "learning_rate": 6.807737705304324e-06, "loss": 0.0979, "step": 3336 }, { "epoch": 1.3330670926517572, "grad_norm": 1.1173806530253718, "learning_rate": 6.803403973212247e-06, "loss": 0.1084, "step": 3338 }, { "epoch": 1.3338658146964857, "grad_norm": 1.211178041971597, "learning_rate": 6.799068683149291e-06, "loss": 0.1026, "step": 3340 }, { "epoch": 1.334664536741214, "grad_norm": 1.324084252679483, "learning_rate": 6.79473183886074e-06, "loss": 0.106, "step": 3342 }, { "epoch": 1.3354632587859425, "grad_norm": 1.190661411969892, "learning_rate": 6.790393444093214e-06, "loss": 0.1103, "step": 3344 }, { "epoch": 1.3362619808306708, "grad_norm": 1.0769935421724353, "learning_rate": 6.786053502594679e-06, "loss": 0.0949, "step": 3346 }, { "epoch": 1.3370607028753994, "grad_norm": 1.0884920315893003, "learning_rate": 6.781712018114435e-06, "loss": 0.1041, "step": 3348 }, { "epoch": 1.3378594249201279, "grad_norm": 1.451297567535448, "learning_rate": 6.777368994403113e-06, "loss": 0.1101, "step": 3350 }, { "epoch": 1.3386581469648562, "grad_norm": 1.0225728615227607, "learning_rate": 6.773024435212678e-06, "loss": 0.0971, "step": 3352 }, { "epoch": 1.3394568690095847, "grad_norm": 1.0713734854451504, "learning_rate": 6.7686783442964195e-06, "loss": 0.1011, "step": 3354 }, { "epoch": 1.340255591054313, "grad_norm": 1.0477177218409457, "learning_rate": 6.7643307254089485e-06, "loss": 0.107, "step": 3356 }, { "epoch": 1.3410543130990416, "grad_norm": 1.0932469970665097, "learning_rate": 6.759981582306197e-06, "loss": 0.1081, "step": 3358 }, { "epoch": 1.34185303514377, "grad_norm": 1.1498681000315574, "learning_rate": 6.7556309187454185e-06, "loss": 0.1036, "step": 3360 }, { "epoch": 1.3426517571884984, "grad_norm": 1.1306486794158652, "learning_rate": 6.751278738485169e-06, "loss": 0.117, "step": 3362 }, { "epoch": 1.343450479233227, "grad_norm": 1.0798075050613067, "learning_rate": 6.746925045285327e-06, "loss": 0.1075, "step": 3364 }, { "epoch": 1.3442492012779552, "grad_norm": 1.1185887548421796, "learning_rate": 6.742569842907071e-06, "loss": 0.1177, "step": 3366 }, { "epoch": 1.3450479233226837, "grad_norm": 1.185572779424059, "learning_rate": 6.738213135112884e-06, "loss": 0.0999, "step": 3368 }, { "epoch": 1.3458466453674123, "grad_norm": 1.113143717088373, "learning_rate": 6.733854925666552e-06, "loss": 0.111, "step": 3370 }, { "epoch": 1.3466453674121406, "grad_norm": 1.0888946989797392, "learning_rate": 6.729495218333157e-06, "loss": 0.0965, "step": 3372 }, { "epoch": 1.3474440894568689, "grad_norm": 1.0800532165447028, "learning_rate": 6.725134016879071e-06, "loss": 0.1294, "step": 3374 }, { "epoch": 1.3482428115015974, "grad_norm": 1.0293169314360555, "learning_rate": 6.720771325071965e-06, "loss": 0.1005, "step": 3376 }, { "epoch": 1.349041533546326, "grad_norm": 1.1266946956354467, "learning_rate": 6.716407146680793e-06, "loss": 0.1034, "step": 3378 }, { "epoch": 1.3498402555910542, "grad_norm": 1.0175557765865235, "learning_rate": 6.71204148547579e-06, "loss": 0.1032, "step": 3380 }, { "epoch": 1.3506389776357828, "grad_norm": 1.129124738760774, "learning_rate": 6.7076743452284776e-06, "loss": 0.1155, "step": 3382 }, { "epoch": 1.351437699680511, "grad_norm": 1.1223728907114148, "learning_rate": 6.703305729711653e-06, "loss": 0.1065, "step": 3384 }, { "epoch": 1.3522364217252396, "grad_norm": 1.0545042346249254, "learning_rate": 6.698935642699386e-06, "loss": 0.0987, "step": 3386 }, { "epoch": 1.3530351437699681, "grad_norm": 1.0675803055716808, "learning_rate": 6.694564087967023e-06, "loss": 0.1139, "step": 3388 }, { "epoch": 1.3538338658146964, "grad_norm": 1.173914284307619, "learning_rate": 6.6901910692911706e-06, "loss": 0.1187, "step": 3390 }, { "epoch": 1.354632587859425, "grad_norm": 1.0709511536838934, "learning_rate": 6.685816590449708e-06, "loss": 0.1068, "step": 3392 }, { "epoch": 1.3554313099041533, "grad_norm": 0.9466690009096309, "learning_rate": 6.68144065522177e-06, "loss": 0.0983, "step": 3394 }, { "epoch": 1.3562300319488818, "grad_norm": 1.114009958699587, "learning_rate": 6.677063267387754e-06, "loss": 0.1099, "step": 3396 }, { "epoch": 1.3570287539936103, "grad_norm": 1.1077866780843397, "learning_rate": 6.672684430729305e-06, "loss": 0.1036, "step": 3398 }, { "epoch": 1.3578274760383386, "grad_norm": 1.0412230391242072, "learning_rate": 6.668304149029331e-06, "loss": 0.1001, "step": 3400 }, { "epoch": 1.3586261980830672, "grad_norm": 1.1010636435498742, "learning_rate": 6.663922426071978e-06, "loss": 0.1045, "step": 3402 }, { "epoch": 1.3594249201277955, "grad_norm": 1.1984509482714603, "learning_rate": 6.659539265642643e-06, "loss": 0.106, "step": 3404 }, { "epoch": 1.360223642172524, "grad_norm": 0.9863537625149396, "learning_rate": 6.655154671527962e-06, "loss": 0.1033, "step": 3406 }, { "epoch": 1.3610223642172525, "grad_norm": 1.0898694676194725, "learning_rate": 6.650768647515813e-06, "loss": 0.1068, "step": 3408 }, { "epoch": 1.3618210862619808, "grad_norm": 1.0711686739516757, "learning_rate": 6.646381197395302e-06, "loss": 0.1094, "step": 3410 }, { "epoch": 1.3626198083067091, "grad_norm": 0.9804227369687718, "learning_rate": 6.641992324956776e-06, "loss": 0.1105, "step": 3412 }, { "epoch": 1.3634185303514377, "grad_norm": 1.1535558172179992, "learning_rate": 6.637602033991807e-06, "loss": 0.1053, "step": 3414 }, { "epoch": 1.3642172523961662, "grad_norm": 1.0241900868101663, "learning_rate": 6.63321032829319e-06, "loss": 0.1094, "step": 3416 }, { "epoch": 1.3650159744408945, "grad_norm": 1.1116602107856417, "learning_rate": 6.628817211654945e-06, "loss": 0.1079, "step": 3418 }, { "epoch": 1.365814696485623, "grad_norm": 1.099401650328653, "learning_rate": 6.624422687872312e-06, "loss": 0.1147, "step": 3420 }, { "epoch": 1.3666134185303513, "grad_norm": 1.1672020742722258, "learning_rate": 6.6200267607417415e-06, "loss": 0.1112, "step": 3422 }, { "epoch": 1.3674121405750799, "grad_norm": 1.0307138522820616, "learning_rate": 6.615629434060903e-06, "loss": 0.116, "step": 3424 }, { "epoch": 1.3682108626198084, "grad_norm": 1.2489824104367433, "learning_rate": 6.611230711628669e-06, "loss": 0.0957, "step": 3426 }, { "epoch": 1.3690095846645367, "grad_norm": 0.9719925332909197, "learning_rate": 6.6068305972451245e-06, "loss": 0.106, "step": 3428 }, { "epoch": 1.3698083067092652, "grad_norm": 1.058537905484153, "learning_rate": 6.602429094711549e-06, "loss": 0.1027, "step": 3430 }, { "epoch": 1.3706070287539935, "grad_norm": 1.0235718542609205, "learning_rate": 6.598026207830428e-06, "loss": 0.0964, "step": 3432 }, { "epoch": 1.371405750798722, "grad_norm": 1.0786959847501962, "learning_rate": 6.593621940405439e-06, "loss": 0.1014, "step": 3434 }, { "epoch": 1.3722044728434506, "grad_norm": 1.137212886630325, "learning_rate": 6.589216296241455e-06, "loss": 0.1168, "step": 3436 }, { "epoch": 1.373003194888179, "grad_norm": 1.0030513413696003, "learning_rate": 6.584809279144535e-06, "loss": 0.0999, "step": 3438 }, { "epoch": 1.3738019169329074, "grad_norm": 1.1517827771669715, "learning_rate": 6.5804008929219284e-06, "loss": 0.1177, "step": 3440 }, { "epoch": 1.3746006389776357, "grad_norm": 1.1512991234134653, "learning_rate": 6.575991141382063e-06, "loss": 0.0966, "step": 3442 }, { "epoch": 1.3753993610223643, "grad_norm": 1.1000379035510455, "learning_rate": 6.571580028334547e-06, "loss": 0.1053, "step": 3444 }, { "epoch": 1.3761980830670926, "grad_norm": 1.0218098747432192, "learning_rate": 6.56716755759017e-06, "loss": 0.0912, "step": 3446 }, { "epoch": 1.376996805111821, "grad_norm": 1.1551745154641282, "learning_rate": 6.562753732960887e-06, "loss": 0.1119, "step": 3448 }, { "epoch": 1.3777955271565494, "grad_norm": 1.2194490422535142, "learning_rate": 6.5583385582598255e-06, "loss": 0.099, "step": 3450 }, { "epoch": 1.378594249201278, "grad_norm": 1.4172435672608823, "learning_rate": 6.553922037301283e-06, "loss": 0.0978, "step": 3452 }, { "epoch": 1.3793929712460065, "grad_norm": 1.0695393339059773, "learning_rate": 6.549504173900715e-06, "loss": 0.0917, "step": 3454 }, { "epoch": 1.3801916932907348, "grad_norm": 1.196402708368458, "learning_rate": 6.545084971874738e-06, "loss": 0.1113, "step": 3456 }, { "epoch": 1.3809904153354633, "grad_norm": 1.0433584799967095, "learning_rate": 6.540664435041127e-06, "loss": 0.104, "step": 3458 }, { "epoch": 1.3817891373801916, "grad_norm": 1.149354927046393, "learning_rate": 6.536242567218808e-06, "loss": 0.1031, "step": 3460 }, { "epoch": 1.3825878594249201, "grad_norm": 1.1148556785558263, "learning_rate": 6.531819372227856e-06, "loss": 0.1032, "step": 3462 }, { "epoch": 1.3833865814696487, "grad_norm": 1.071711948179689, "learning_rate": 6.527394853889499e-06, "loss": 0.1085, "step": 3464 }, { "epoch": 1.384185303514377, "grad_norm": 1.082175681158764, "learning_rate": 6.522969016026099e-06, "loss": 0.11, "step": 3466 }, { "epoch": 1.3849840255591055, "grad_norm": 1.1369109201663514, "learning_rate": 6.518541862461163e-06, "loss": 0.1169, "step": 3468 }, { "epoch": 1.3857827476038338, "grad_norm": 1.2011247923954287, "learning_rate": 6.514113397019335e-06, "loss": 0.1081, "step": 3470 }, { "epoch": 1.3865814696485623, "grad_norm": 1.1792397645094768, "learning_rate": 6.5096836235263904e-06, "loss": 0.1202, "step": 3472 }, { "epoch": 1.3873801916932909, "grad_norm": 1.0727123114625765, "learning_rate": 6.505252545809238e-06, "loss": 0.0962, "step": 3474 }, { "epoch": 1.3881789137380192, "grad_norm": 1.0276014422922428, "learning_rate": 6.500820167695906e-06, "loss": 0.0996, "step": 3476 }, { "epoch": 1.3889776357827475, "grad_norm": 0.9724065635327491, "learning_rate": 6.496386493015554e-06, "loss": 0.0987, "step": 3478 }, { "epoch": 1.389776357827476, "grad_norm": 1.1292315012262628, "learning_rate": 6.491951525598461e-06, "loss": 0.0999, "step": 3480 }, { "epoch": 1.3905750798722045, "grad_norm": 1.1716250389490488, "learning_rate": 6.487515269276015e-06, "loss": 0.1015, "step": 3482 }, { "epoch": 1.3913738019169328, "grad_norm": 1.036997684705096, "learning_rate": 6.483077727880726e-06, "loss": 0.0863, "step": 3484 }, { "epoch": 1.3921725239616614, "grad_norm": 1.199757124869139, "learning_rate": 6.478638905246213e-06, "loss": 0.1116, "step": 3486 }, { "epoch": 1.3929712460063897, "grad_norm": 1.142726930599777, "learning_rate": 6.4741988052071965e-06, "loss": 0.1098, "step": 3488 }, { "epoch": 1.3937699680511182, "grad_norm": 0.9651746600463053, "learning_rate": 6.469757431599503e-06, "loss": 0.1038, "step": 3490 }, { "epoch": 1.3945686900958467, "grad_norm": 1.1474675700025958, "learning_rate": 6.465314788260067e-06, "loss": 0.1154, "step": 3492 }, { "epoch": 1.395367412140575, "grad_norm": 1.0346444553697798, "learning_rate": 6.460870879026906e-06, "loss": 0.0961, "step": 3494 }, { "epoch": 1.3961661341853036, "grad_norm": 1.032379620946616, "learning_rate": 6.45642570773914e-06, "loss": 0.0997, "step": 3496 }, { "epoch": 1.3969648562300319, "grad_norm": 1.016064340983906, "learning_rate": 6.451979278236979e-06, "loss": 0.0931, "step": 3498 }, { "epoch": 1.3977635782747604, "grad_norm": 1.178554041440438, "learning_rate": 6.447531594361719e-06, "loss": 0.1153, "step": 3500 }, { "epoch": 1.3977635782747604, "eval_loss": 0.1600114107131958, "eval_runtime": 418.1465, "eval_samples_per_second": 42.586, "eval_steps_per_second": 5.323, "step": 3500 }, { "epoch": 1.398562300319489, "grad_norm": 1.1584355046290076, "learning_rate": 6.443082659955737e-06, "loss": 0.1037, "step": 3502 }, { "epoch": 1.3993610223642172, "grad_norm": 1.0340861375288486, "learning_rate": 6.438632478862495e-06, "loss": 0.1011, "step": 3504 }, { "epoch": 1.4001597444089458, "grad_norm": 1.1875264702825228, "learning_rate": 6.434181054926528e-06, "loss": 0.1125, "step": 3506 }, { "epoch": 1.400958466453674, "grad_norm": 1.0588069817372803, "learning_rate": 6.429728391993446e-06, "loss": 0.107, "step": 3508 }, { "epoch": 1.4017571884984026, "grad_norm": 1.1098486981661322, "learning_rate": 6.425274493909932e-06, "loss": 0.1132, "step": 3510 }, { "epoch": 1.4025559105431311, "grad_norm": 0.9823134309813667, "learning_rate": 6.4208193645237314e-06, "loss": 0.0941, "step": 3512 }, { "epoch": 1.4033546325878594, "grad_norm": 1.0327788368934439, "learning_rate": 6.416363007683656e-06, "loss": 0.1105, "step": 3514 }, { "epoch": 1.4041533546325877, "grad_norm": 1.1821001072193735, "learning_rate": 6.411905427239577e-06, "loss": 0.1084, "step": 3516 }, { "epoch": 1.4049520766773163, "grad_norm": 1.1379784770115198, "learning_rate": 6.407446627042426e-06, "loss": 0.1135, "step": 3518 }, { "epoch": 1.4057507987220448, "grad_norm": 1.1147819119237963, "learning_rate": 6.402986610944183e-06, "loss": 0.1076, "step": 3520 }, { "epoch": 1.406549520766773, "grad_norm": 1.1200339092988951, "learning_rate": 6.398525382797884e-06, "loss": 0.1063, "step": 3522 }, { "epoch": 1.4073482428115016, "grad_norm": 1.1312798096131238, "learning_rate": 6.394062946457604e-06, "loss": 0.1002, "step": 3524 }, { "epoch": 1.40814696485623, "grad_norm": 1.199479655032073, "learning_rate": 6.389599305778471e-06, "loss": 0.108, "step": 3526 }, { "epoch": 1.4089456869009584, "grad_norm": 1.218231109858983, "learning_rate": 6.385134464616649e-06, "loss": 0.1069, "step": 3528 }, { "epoch": 1.409744408945687, "grad_norm": 1.1995058184118474, "learning_rate": 6.38066842682934e-06, "loss": 0.1103, "step": 3530 }, { "epoch": 1.4105431309904153, "grad_norm": 1.1046534115214985, "learning_rate": 6.376201196274778e-06, "loss": 0.1142, "step": 3532 }, { "epoch": 1.4113418530351438, "grad_norm": 1.0585983588946046, "learning_rate": 6.37173277681223e-06, "loss": 0.1114, "step": 3534 }, { "epoch": 1.4121405750798721, "grad_norm": 1.1029224104867834, "learning_rate": 6.367263172301985e-06, "loss": 0.1113, "step": 3536 }, { "epoch": 1.4129392971246006, "grad_norm": 1.1060776788940239, "learning_rate": 6.3627923866053656e-06, "loss": 0.0969, "step": 3538 }, { "epoch": 1.4137380191693292, "grad_norm": 1.1032132052448507, "learning_rate": 6.358320423584704e-06, "loss": 0.0979, "step": 3540 }, { "epoch": 1.4145367412140575, "grad_norm": 1.029503416268003, "learning_rate": 6.353847287103356e-06, "loss": 0.0953, "step": 3542 }, { "epoch": 1.415335463258786, "grad_norm": 1.1352827365353346, "learning_rate": 6.3493729810256895e-06, "loss": 0.1118, "step": 3544 }, { "epoch": 1.4161341853035143, "grad_norm": 1.119812994569396, "learning_rate": 6.344897509217085e-06, "loss": 0.1114, "step": 3546 }, { "epoch": 1.4169329073482428, "grad_norm": 1.076940990174831, "learning_rate": 6.340420875543922e-06, "loss": 0.107, "step": 3548 }, { "epoch": 1.4177316293929714, "grad_norm": 1.1517799889594393, "learning_rate": 6.335943083873596e-06, "loss": 0.0933, "step": 3550 }, { "epoch": 1.4185303514376997, "grad_norm": 1.1617154802397773, "learning_rate": 6.331464138074493e-06, "loss": 0.105, "step": 3552 }, { "epoch": 1.419329073482428, "grad_norm": 0.9494287593142456, "learning_rate": 6.326984042016e-06, "loss": 0.0921, "step": 3554 }, { "epoch": 1.4201277955271565, "grad_norm": 1.1141280255887196, "learning_rate": 6.322502799568498e-06, "loss": 0.1061, "step": 3556 }, { "epoch": 1.420926517571885, "grad_norm": 1.0414217088512951, "learning_rate": 6.3180204146033586e-06, "loss": 0.1029, "step": 3558 }, { "epoch": 1.4217252396166133, "grad_norm": 1.0724759251125648, "learning_rate": 6.313536890992935e-06, "loss": 0.1053, "step": 3560 }, { "epoch": 1.4225239616613419, "grad_norm": 1.0754047728283922, "learning_rate": 6.309052232610574e-06, "loss": 0.1096, "step": 3562 }, { "epoch": 1.4233226837060702, "grad_norm": 1.1017909660001841, "learning_rate": 6.3045664433305945e-06, "loss": 0.1092, "step": 3564 }, { "epoch": 1.4241214057507987, "grad_norm": 1.4387268439796592, "learning_rate": 6.300079527028297e-06, "loss": 0.1133, "step": 3566 }, { "epoch": 1.4249201277955272, "grad_norm": 1.1303501334206185, "learning_rate": 6.29559148757995e-06, "loss": 0.1065, "step": 3568 }, { "epoch": 1.4257188498402555, "grad_norm": 0.9640525041341275, "learning_rate": 6.291102328862801e-06, "loss": 0.0988, "step": 3570 }, { "epoch": 1.426517571884984, "grad_norm": 1.0585869277703588, "learning_rate": 6.286612054755056e-06, "loss": 0.1022, "step": 3572 }, { "epoch": 1.4273162939297124, "grad_norm": 1.0056038546914692, "learning_rate": 6.282120669135892e-06, "loss": 0.099, "step": 3574 }, { "epoch": 1.428115015974441, "grad_norm": 1.1206641244335471, "learning_rate": 6.277628175885437e-06, "loss": 0.1167, "step": 3576 }, { "epoch": 1.4289137380191694, "grad_norm": 1.1096458600950112, "learning_rate": 6.273134578884785e-06, "loss": 0.1058, "step": 3578 }, { "epoch": 1.4297124600638977, "grad_norm": 0.9857461120602693, "learning_rate": 6.2686398820159785e-06, "loss": 0.0947, "step": 3580 }, { "epoch": 1.4305111821086263, "grad_norm": 1.0203378985341014, "learning_rate": 6.2641440891620146e-06, "loss": 0.0914, "step": 3582 }, { "epoch": 1.4313099041533546, "grad_norm": 1.0724408105973864, "learning_rate": 6.2596472042068275e-06, "loss": 0.0934, "step": 3584 }, { "epoch": 1.432108626198083, "grad_norm": 1.2414438842066344, "learning_rate": 6.2551492310353094e-06, "loss": 0.1149, "step": 3586 }, { "epoch": 1.4329073482428116, "grad_norm": 1.101684816522059, "learning_rate": 6.250650173533279e-06, "loss": 0.1142, "step": 3588 }, { "epoch": 1.43370607028754, "grad_norm": 1.1753348478737677, "learning_rate": 6.2461500355875e-06, "loss": 0.116, "step": 3590 }, { "epoch": 1.4345047923322682, "grad_norm": 1.1874416062991624, "learning_rate": 6.241648821085666e-06, "loss": 0.1073, "step": 3592 }, { "epoch": 1.4353035143769968, "grad_norm": 1.0601160321918857, "learning_rate": 6.237146533916402e-06, "loss": 0.1013, "step": 3594 }, { "epoch": 1.4361022364217253, "grad_norm": 1.108271170806747, "learning_rate": 6.232643177969259e-06, "loss": 0.0952, "step": 3596 }, { "epoch": 1.4369009584664536, "grad_norm": 1.030322538054295, "learning_rate": 6.2281387571347126e-06, "loss": 0.1006, "step": 3598 }, { "epoch": 1.4376996805111821, "grad_norm": 1.1028346810795997, "learning_rate": 6.223633275304157e-06, "loss": 0.111, "step": 3600 }, { "epoch": 1.4384984025559104, "grad_norm": 1.0355367948281153, "learning_rate": 6.2191267363699026e-06, "loss": 0.1017, "step": 3602 }, { "epoch": 1.439297124600639, "grad_norm": 1.2163948163518867, "learning_rate": 6.214619144225176e-06, "loss": 0.1077, "step": 3604 }, { "epoch": 1.4400958466453675, "grad_norm": 1.1518703062495919, "learning_rate": 6.210110502764107e-06, "loss": 0.1155, "step": 3606 }, { "epoch": 1.4408945686900958, "grad_norm": 1.1725592196005963, "learning_rate": 6.205600815881741e-06, "loss": 0.0979, "step": 3608 }, { "epoch": 1.4416932907348243, "grad_norm": 1.0475879501811074, "learning_rate": 6.2010900874740225e-06, "loss": 0.106, "step": 3610 }, { "epoch": 1.4424920127795526, "grad_norm": 1.07813779648937, "learning_rate": 6.1965783214377895e-06, "loss": 0.1022, "step": 3612 }, { "epoch": 1.4432907348242812, "grad_norm": 1.0886593706957781, "learning_rate": 6.192065521670787e-06, "loss": 0.1051, "step": 3614 }, { "epoch": 1.4440894568690097, "grad_norm": 1.1378082559578835, "learning_rate": 6.187551692071648e-06, "loss": 0.1084, "step": 3616 }, { "epoch": 1.444888178913738, "grad_norm": 1.2011389042931888, "learning_rate": 6.183036836539893e-06, "loss": 0.1092, "step": 3618 }, { "epoch": 1.4456869009584665, "grad_norm": 1.1251988986799606, "learning_rate": 6.178520958975933e-06, "loss": 0.1068, "step": 3620 }, { "epoch": 1.4464856230031948, "grad_norm": 1.0150796615451205, "learning_rate": 6.17400406328106e-06, "loss": 0.1075, "step": 3622 }, { "epoch": 1.4472843450479234, "grad_norm": 1.0885442213663745, "learning_rate": 6.1694861533574445e-06, "loss": 0.1127, "step": 3624 }, { "epoch": 1.4480830670926519, "grad_norm": 1.064305780213124, "learning_rate": 6.164967233108137e-06, "loss": 0.0966, "step": 3626 }, { "epoch": 1.4488817891373802, "grad_norm": 1.0844139564754607, "learning_rate": 6.160447306437055e-06, "loss": 0.1076, "step": 3628 }, { "epoch": 1.4496805111821085, "grad_norm": 1.034026504245696, "learning_rate": 6.1559263772489905e-06, "loss": 0.096, "step": 3630 }, { "epoch": 1.450479233226837, "grad_norm": 1.001104101700096, "learning_rate": 6.1514044494496e-06, "loss": 0.0994, "step": 3632 }, { "epoch": 1.4512779552715656, "grad_norm": 1.1773822780221779, "learning_rate": 6.146881526945401e-06, "loss": 0.1076, "step": 3634 }, { "epoch": 1.4520766773162939, "grad_norm": 1.0588404307223935, "learning_rate": 6.142357613643773e-06, "loss": 0.0929, "step": 3636 }, { "epoch": 1.4528753993610224, "grad_norm": 1.2341260840634936, "learning_rate": 6.13783271345295e-06, "loss": 0.1156, "step": 3638 }, { "epoch": 1.4536741214057507, "grad_norm": 1.2029429887737177, "learning_rate": 6.133306830282021e-06, "loss": 0.1131, "step": 3640 }, { "epoch": 1.4544728434504792, "grad_norm": 0.9829158742595866, "learning_rate": 6.128779968040917e-06, "loss": 0.0977, "step": 3642 }, { "epoch": 1.4552715654952078, "grad_norm": 1.0321776789284078, "learning_rate": 6.1242521306404236e-06, "loss": 0.0928, "step": 3644 }, { "epoch": 1.456070287539936, "grad_norm": 1.0207766757489227, "learning_rate": 6.119723321992164e-06, "loss": 0.0956, "step": 3646 }, { "epoch": 1.4568690095846646, "grad_norm": 1.2565358842354744, "learning_rate": 6.115193546008602e-06, "loss": 0.1075, "step": 3648 }, { "epoch": 1.457667731629393, "grad_norm": 1.0795577230748994, "learning_rate": 6.110662806603036e-06, "loss": 0.1073, "step": 3650 }, { "epoch": 1.4584664536741214, "grad_norm": 1.1116179183482724, "learning_rate": 6.106131107689599e-06, "loss": 0.1091, "step": 3652 }, { "epoch": 1.45926517571885, "grad_norm": 1.1185062550976665, "learning_rate": 6.101598453183248e-06, "loss": 0.0981, "step": 3654 }, { "epoch": 1.4600638977635783, "grad_norm": 1.1929060063575452, "learning_rate": 6.097064846999774e-06, "loss": 0.1114, "step": 3656 }, { "epoch": 1.4608626198083068, "grad_norm": 1.1672964109989814, "learning_rate": 6.09253029305578e-06, "loss": 0.1048, "step": 3658 }, { "epoch": 1.461661341853035, "grad_norm": 1.3459483840304294, "learning_rate": 6.087994795268696e-06, "loss": 0.1175, "step": 3660 }, { "epoch": 1.4624600638977636, "grad_norm": 1.1453927282681513, "learning_rate": 6.0834583575567606e-06, "loss": 0.1024, "step": 3662 }, { "epoch": 1.4632587859424921, "grad_norm": 1.0361766854922676, "learning_rate": 6.078920983839032e-06, "loss": 0.0971, "step": 3664 }, { "epoch": 1.4640575079872205, "grad_norm": 1.0777649114365144, "learning_rate": 6.07438267803537e-06, "loss": 0.1028, "step": 3666 }, { "epoch": 1.4648562300319488, "grad_norm": 1.1306654064542798, "learning_rate": 6.069843444066444e-06, "loss": 0.1095, "step": 3668 }, { "epoch": 1.4656549520766773, "grad_norm": 1.0195281420397648, "learning_rate": 6.065303285853724e-06, "loss": 0.1075, "step": 3670 }, { "epoch": 1.4664536741214058, "grad_norm": 1.023251007369703, "learning_rate": 6.060762207319479e-06, "loss": 0.1002, "step": 3672 }, { "epoch": 1.4672523961661341, "grad_norm": 1.0920777216930633, "learning_rate": 6.056220212386769e-06, "loss": 0.1051, "step": 3674 }, { "epoch": 1.4680511182108626, "grad_norm": 1.1039024614877324, "learning_rate": 6.0516773049794545e-06, "loss": 0.098, "step": 3676 }, { "epoch": 1.468849840255591, "grad_norm": 0.9870962605725515, "learning_rate": 6.0471334890221735e-06, "loss": 0.1052, "step": 3678 }, { "epoch": 1.4696485623003195, "grad_norm": 1.0307618080780037, "learning_rate": 6.042588768440358e-06, "loss": 0.0995, "step": 3680 }, { "epoch": 1.470447284345048, "grad_norm": 1.0513694638233513, "learning_rate": 6.038043147160215e-06, "loss": 0.0954, "step": 3682 }, { "epoch": 1.4712460063897763, "grad_norm": 1.0657839501710005, "learning_rate": 6.033496629108736e-06, "loss": 0.0995, "step": 3684 }, { "epoch": 1.4720447284345048, "grad_norm": 1.1159795703718869, "learning_rate": 6.02894921821368e-06, "loss": 0.0966, "step": 3686 }, { "epoch": 1.4728434504792332, "grad_norm": 0.9598839228144777, "learning_rate": 6.024400918403581e-06, "loss": 0.0968, "step": 3688 }, { "epoch": 1.4736421725239617, "grad_norm": 1.0384010675372382, "learning_rate": 6.019851733607744e-06, "loss": 0.0955, "step": 3690 }, { "epoch": 1.4744408945686902, "grad_norm": 1.1078140257609426, "learning_rate": 6.015301667756234e-06, "loss": 0.0942, "step": 3692 }, { "epoch": 1.4752396166134185, "grad_norm": 1.2701615597150369, "learning_rate": 6.0107507247798765e-06, "loss": 0.0995, "step": 3694 }, { "epoch": 1.476038338658147, "grad_norm": 1.2102770076457041, "learning_rate": 6.006198908610261e-06, "loss": 0.1202, "step": 3696 }, { "epoch": 1.4768370607028753, "grad_norm": 1.145740181813383, "learning_rate": 6.0016462231797225e-06, "loss": 0.1117, "step": 3698 }, { "epoch": 1.4776357827476039, "grad_norm": 1.0833428968981065, "learning_rate": 5.997092672421356e-06, "loss": 0.1037, "step": 3700 }, { "epoch": 1.4784345047923324, "grad_norm": 1.0801478244071052, "learning_rate": 5.9925382602689974e-06, "loss": 0.1019, "step": 3702 }, { "epoch": 1.4792332268370607, "grad_norm": 1.2134338336543735, "learning_rate": 5.987982990657229e-06, "loss": 0.1139, "step": 3704 }, { "epoch": 1.480031948881789, "grad_norm": 1.0064298338790032, "learning_rate": 5.9834268675213745e-06, "loss": 0.1045, "step": 3706 }, { "epoch": 1.4808306709265175, "grad_norm": 1.1699403671691384, "learning_rate": 5.978869894797494e-06, "loss": 0.1054, "step": 3708 }, { "epoch": 1.481629392971246, "grad_norm": 0.9388532857646246, "learning_rate": 5.974312076422381e-06, "loss": 0.098, "step": 3710 }, { "epoch": 1.4824281150159744, "grad_norm": 1.0345840078184048, "learning_rate": 5.9697534163335645e-06, "loss": 0.1077, "step": 3712 }, { "epoch": 1.483226837060703, "grad_norm": 1.0824548650493346, "learning_rate": 5.965193918469292e-06, "loss": 0.099, "step": 3714 }, { "epoch": 1.4840255591054312, "grad_norm": 1.1507346874731688, "learning_rate": 5.9606335867685424e-06, "loss": 0.1013, "step": 3716 }, { "epoch": 1.4848242811501597, "grad_norm": 0.9628660977174563, "learning_rate": 5.9560724251710116e-06, "loss": 0.0943, "step": 3718 }, { "epoch": 1.4856230031948883, "grad_norm": 1.0718435481145705, "learning_rate": 5.95151043761711e-06, "loss": 0.1014, "step": 3720 }, { "epoch": 1.4864217252396166, "grad_norm": 0.929707333376811, "learning_rate": 5.9469476280479685e-06, "loss": 0.0907, "step": 3722 }, { "epoch": 1.487220447284345, "grad_norm": 1.0656203252614662, "learning_rate": 5.9423840004054235e-06, "loss": 0.1024, "step": 3724 }, { "epoch": 1.4880191693290734, "grad_norm": 1.1218591644267362, "learning_rate": 5.9378195586320155e-06, "loss": 0.1046, "step": 3726 }, { "epoch": 1.488817891373802, "grad_norm": 1.171155440361567, "learning_rate": 5.933254306670995e-06, "loss": 0.1007, "step": 3728 }, { "epoch": 1.4896166134185305, "grad_norm": 1.0258926148385281, "learning_rate": 5.9286882484663054e-06, "loss": 0.0955, "step": 3730 }, { "epoch": 1.4904153354632588, "grad_norm": 1.133084775825884, "learning_rate": 5.924121387962594e-06, "loss": 0.1002, "step": 3732 }, { "epoch": 1.4912140575079873, "grad_norm": 1.1855051630642663, "learning_rate": 5.919553729105194e-06, "loss": 0.1012, "step": 3734 }, { "epoch": 1.4920127795527156, "grad_norm": 1.1146676203875627, "learning_rate": 5.914985275840135e-06, "loss": 0.1081, "step": 3736 }, { "epoch": 1.4928115015974441, "grad_norm": 1.1041278385511843, "learning_rate": 5.910416032114128e-06, "loss": 0.1026, "step": 3738 }, { "epoch": 1.4936102236421724, "grad_norm": 0.9898027730365901, "learning_rate": 5.905846001874566e-06, "loss": 0.0888, "step": 3740 }, { "epoch": 1.494408945686901, "grad_norm": 1.1585515500268038, "learning_rate": 5.90127518906953e-06, "loss": 0.1052, "step": 3742 }, { "epoch": 1.4952076677316293, "grad_norm": 1.0347303041203528, "learning_rate": 5.896703597647765e-06, "loss": 0.1058, "step": 3744 }, { "epoch": 1.4960063897763578, "grad_norm": 0.9935016480185027, "learning_rate": 5.892131231558696e-06, "loss": 0.1115, "step": 3746 }, { "epoch": 1.4968051118210863, "grad_norm": 1.0501951176001236, "learning_rate": 5.88755809475242e-06, "loss": 0.1025, "step": 3748 }, { "epoch": 1.4976038338658146, "grad_norm": 1.0008810336543572, "learning_rate": 5.882984191179691e-06, "loss": 0.0984, "step": 3750 }, { "epoch": 1.4984025559105432, "grad_norm": 1.2054742114396801, "learning_rate": 5.878409524791931e-06, "loss": 0.1101, "step": 3752 }, { "epoch": 1.4992012779552715, "grad_norm": 1.094413921625223, "learning_rate": 5.8738340995412216e-06, "loss": 0.1055, "step": 3754 }, { "epoch": 1.5, "grad_norm": 1.0505101513083803, "learning_rate": 5.869257919380298e-06, "loss": 0.1026, "step": 3756 }, { "epoch": 1.5007987220447285, "grad_norm": 0.970894387236391, "learning_rate": 5.864680988262546e-06, "loss": 0.0935, "step": 3758 }, { "epoch": 1.5015974440894568, "grad_norm": 1.1554311867796712, "learning_rate": 5.8601033101420055e-06, "loss": 0.1188, "step": 3760 }, { "epoch": 1.5023961661341851, "grad_norm": 1.0830901347700692, "learning_rate": 5.855524888973358e-06, "loss": 0.1106, "step": 3762 }, { "epoch": 1.5031948881789137, "grad_norm": 1.057272294501186, "learning_rate": 5.850945728711925e-06, "loss": 0.1028, "step": 3764 }, { "epoch": 1.5039936102236422, "grad_norm": 1.0336589663794098, "learning_rate": 5.846365833313672e-06, "loss": 0.1019, "step": 3766 }, { "epoch": 1.5047923322683707, "grad_norm": 1.11484447745884, "learning_rate": 5.841785206735192e-06, "loss": 0.1009, "step": 3768 }, { "epoch": 1.505591054313099, "grad_norm": 1.1990777552155663, "learning_rate": 5.837203852933721e-06, "loss": 0.1078, "step": 3770 }, { "epoch": 1.5063897763578273, "grad_norm": 1.2341825295207347, "learning_rate": 5.83262177586711e-06, "loss": 0.1103, "step": 3772 }, { "epoch": 1.5071884984025559, "grad_norm": 1.372034044865105, "learning_rate": 5.828038979493844e-06, "loss": 0.1056, "step": 3774 }, { "epoch": 1.5079872204472844, "grad_norm": 1.1741674239048192, "learning_rate": 5.823455467773027e-06, "loss": 0.1079, "step": 3776 }, { "epoch": 1.508785942492013, "grad_norm": 1.0252917230334802, "learning_rate": 5.81887124466438e-06, "loss": 0.0914, "step": 3778 }, { "epoch": 1.5095846645367412, "grad_norm": 1.1697427527897497, "learning_rate": 5.814286314128239e-06, "loss": 0.0991, "step": 3780 }, { "epoch": 1.5103833865814695, "grad_norm": 1.042769204041001, "learning_rate": 5.809700680125552e-06, "loss": 0.1059, "step": 3782 }, { "epoch": 1.511182108626198, "grad_norm": 1.0296503251696472, "learning_rate": 5.805114346617874e-06, "loss": 0.0946, "step": 3784 }, { "epoch": 1.5119808306709266, "grad_norm": 1.009763042669415, "learning_rate": 5.800527317567365e-06, "loss": 0.0968, "step": 3786 }, { "epoch": 1.5127795527156551, "grad_norm": 1.0866525391827386, "learning_rate": 5.795939596936783e-06, "loss": 0.1014, "step": 3788 }, { "epoch": 1.5135782747603834, "grad_norm": 1.1514100776695269, "learning_rate": 5.791351188689489e-06, "loss": 0.1016, "step": 3790 }, { "epoch": 1.5143769968051117, "grad_norm": 1.2403482979785827, "learning_rate": 5.786762096789431e-06, "loss": 0.1046, "step": 3792 }, { "epoch": 1.5151757188498403, "grad_norm": 1.1732506896242403, "learning_rate": 5.782172325201155e-06, "loss": 0.1127, "step": 3794 }, { "epoch": 1.5159744408945688, "grad_norm": 1.0881019594599013, "learning_rate": 5.777581877889788e-06, "loss": 0.0894, "step": 3796 }, { "epoch": 1.516773162939297, "grad_norm": 1.1933368615094555, "learning_rate": 5.772990758821046e-06, "loss": 0.1078, "step": 3798 }, { "epoch": 1.5175718849840254, "grad_norm": 0.9420287917337775, "learning_rate": 5.768398971961221e-06, "loss": 0.0926, "step": 3800 }, { "epoch": 1.518370607028754, "grad_norm": 1.0673845641768755, "learning_rate": 5.763806521277184e-06, "loss": 0.0958, "step": 3802 }, { "epoch": 1.5191693290734825, "grad_norm": 1.0756212442333353, "learning_rate": 5.759213410736377e-06, "loss": 0.0984, "step": 3804 }, { "epoch": 1.519968051118211, "grad_norm": 1.0303021520595748, "learning_rate": 5.7546196443068195e-06, "loss": 0.099, "step": 3806 }, { "epoch": 1.5207667731629393, "grad_norm": 1.1602092966550535, "learning_rate": 5.750025225957086e-06, "loss": 0.1052, "step": 3808 }, { "epoch": 1.5215654952076676, "grad_norm": 1.1125067715645671, "learning_rate": 5.745430159656324e-06, "loss": 0.0988, "step": 3810 }, { "epoch": 1.5223642172523961, "grad_norm": 1.0076518432605175, "learning_rate": 5.740834449374237e-06, "loss": 0.1006, "step": 3812 }, { "epoch": 1.5231629392971247, "grad_norm": 1.1636274304360332, "learning_rate": 5.7362380990810836e-06, "loss": 0.0982, "step": 3814 }, { "epoch": 1.5239616613418532, "grad_norm": 1.589543826243573, "learning_rate": 5.731641112747679e-06, "loss": 0.1033, "step": 3816 }, { "epoch": 1.5247603833865815, "grad_norm": 1.130992557679762, "learning_rate": 5.7270434943453844e-06, "loss": 0.098, "step": 3818 }, { "epoch": 1.5255591054313098, "grad_norm": 0.9432691902452544, "learning_rate": 5.722445247846107e-06, "loss": 0.0888, "step": 3820 }, { "epoch": 1.5263578274760383, "grad_norm": 1.106347020699136, "learning_rate": 5.717846377222302e-06, "loss": 0.0936, "step": 3822 }, { "epoch": 1.5271565495207668, "grad_norm": 1.1052796082215754, "learning_rate": 5.713246886446954e-06, "loss": 0.1007, "step": 3824 }, { "epoch": 1.5279552715654952, "grad_norm": 1.1941612947080527, "learning_rate": 5.708646779493592e-06, "loss": 0.1107, "step": 3826 }, { "epoch": 1.5287539936102237, "grad_norm": 1.0917404470546062, "learning_rate": 5.704046060336276e-06, "loss": 0.0985, "step": 3828 }, { "epoch": 1.529552715654952, "grad_norm": 1.026940535614031, "learning_rate": 5.699444732949592e-06, "loss": 0.1008, "step": 3830 }, { "epoch": 1.5303514376996805, "grad_norm": 1.1825164845911036, "learning_rate": 5.694842801308651e-06, "loss": 0.1039, "step": 3832 }, { "epoch": 1.531150159744409, "grad_norm": 1.2523857121893918, "learning_rate": 5.69024026938909e-06, "loss": 0.1073, "step": 3834 }, { "epoch": 1.5319488817891374, "grad_norm": 0.9618590795563883, "learning_rate": 5.6856371411670605e-06, "loss": 0.0895, "step": 3836 }, { "epoch": 1.5327476038338657, "grad_norm": 1.087295231368687, "learning_rate": 5.681033420619233e-06, "loss": 0.1079, "step": 3838 }, { "epoch": 1.5335463258785942, "grad_norm": 1.0751240176712655, "learning_rate": 5.676429111722786e-06, "loss": 0.119, "step": 3840 }, { "epoch": 1.5343450479233227, "grad_norm": 0.9761262443774417, "learning_rate": 5.67182421845541e-06, "loss": 0.0985, "step": 3842 }, { "epoch": 1.5351437699680512, "grad_norm": 1.064750803984951, "learning_rate": 5.6672187447952944e-06, "loss": 0.1164, "step": 3844 }, { "epoch": 1.5359424920127795, "grad_norm": 1.0336803651600035, "learning_rate": 5.662612694721139e-06, "loss": 0.0933, "step": 3846 }, { "epoch": 1.5367412140575079, "grad_norm": 1.216380707446393, "learning_rate": 5.6580060722121325e-06, "loss": 0.0985, "step": 3848 }, { "epoch": 1.5375399361022364, "grad_norm": 1.0486840682788419, "learning_rate": 5.6533988812479626e-06, "loss": 0.0884, "step": 3850 }, { "epoch": 1.538338658146965, "grad_norm": 1.0017531938240458, "learning_rate": 5.648791125808809e-06, "loss": 0.0971, "step": 3852 }, { "epoch": 1.5391373801916934, "grad_norm": 0.9711321520409679, "learning_rate": 5.644182809875338e-06, "loss": 0.0967, "step": 3854 }, { "epoch": 1.5399361022364217, "grad_norm": 1.0444214456541656, "learning_rate": 5.639573937428699e-06, "loss": 0.1005, "step": 3856 }, { "epoch": 1.54073482428115, "grad_norm": 1.1384563164764703, "learning_rate": 5.634964512450522e-06, "loss": 0.0937, "step": 3858 }, { "epoch": 1.5415335463258786, "grad_norm": 1.1704875948472755, "learning_rate": 5.630354538922916e-06, "loss": 0.1069, "step": 3860 }, { "epoch": 1.542332268370607, "grad_norm": 1.153542545533936, "learning_rate": 5.6257440208284645e-06, "loss": 0.0944, "step": 3862 }, { "epoch": 1.5431309904153354, "grad_norm": 1.2017126948215129, "learning_rate": 5.621132962150216e-06, "loss": 0.1056, "step": 3864 }, { "epoch": 1.543929712460064, "grad_norm": 1.1096553295923908, "learning_rate": 5.616521366871697e-06, "loss": 0.1084, "step": 3866 }, { "epoch": 1.5447284345047922, "grad_norm": 1.2932559709159934, "learning_rate": 5.611909238976885e-06, "loss": 0.1077, "step": 3868 }, { "epoch": 1.5455271565495208, "grad_norm": 1.1009979044862985, "learning_rate": 5.607296582450224e-06, "loss": 0.0946, "step": 3870 }, { "epoch": 1.5463258785942493, "grad_norm": 1.1753388398369289, "learning_rate": 5.6026834012766155e-06, "loss": 0.1056, "step": 3872 }, { "epoch": 1.5471246006389776, "grad_norm": 1.1300535782520567, "learning_rate": 5.598069699441414e-06, "loss": 0.1072, "step": 3874 }, { "epoch": 1.547923322683706, "grad_norm": 1.1361407815794689, "learning_rate": 5.5934554809304184e-06, "loss": 0.1057, "step": 3876 }, { "epoch": 1.5487220447284344, "grad_norm": 1.0774923728003276, "learning_rate": 5.5888407497298824e-06, "loss": 0.0978, "step": 3878 }, { "epoch": 1.549520766773163, "grad_norm": 1.1078465378909663, "learning_rate": 5.584225509826497e-06, "loss": 0.0847, "step": 3880 }, { "epoch": 1.5503194888178915, "grad_norm": 1.117154173356503, "learning_rate": 5.579609765207393e-06, "loss": 0.1053, "step": 3882 }, { "epoch": 1.5511182108626198, "grad_norm": 1.1442174998289312, "learning_rate": 5.574993519860139e-06, "loss": 0.1024, "step": 3884 }, { "epoch": 1.5519169329073481, "grad_norm": 1.1475487693664175, "learning_rate": 5.5703767777727354e-06, "loss": 0.1073, "step": 3886 }, { "epoch": 1.5527156549520766, "grad_norm": 1.1044103730150632, "learning_rate": 5.565759542933612e-06, "loss": 0.1073, "step": 3888 }, { "epoch": 1.5535143769968052, "grad_norm": 1.1544497802356082, "learning_rate": 5.561141819331624e-06, "loss": 0.1055, "step": 3890 }, { "epoch": 1.5543130990415337, "grad_norm": 1.13696637094894, "learning_rate": 5.556523610956049e-06, "loss": 0.1032, "step": 3892 }, { "epoch": 1.555111821086262, "grad_norm": 1.0623393891516841, "learning_rate": 5.55190492179658e-06, "loss": 0.1139, "step": 3894 }, { "epoch": 1.5559105431309903, "grad_norm": 1.0054614979615821, "learning_rate": 5.547285755843334e-06, "loss": 0.0888, "step": 3896 }, { "epoch": 1.5567092651757188, "grad_norm": 0.9923971125933808, "learning_rate": 5.542666117086832e-06, "loss": 0.0884, "step": 3898 }, { "epoch": 1.5575079872204474, "grad_norm": 1.1566532715523212, "learning_rate": 5.538046009518007e-06, "loss": 0.1053, "step": 3900 }, { "epoch": 1.5583067092651757, "grad_norm": 1.0988640530351912, "learning_rate": 5.5334254371281934e-06, "loss": 0.0987, "step": 3902 }, { "epoch": 1.5591054313099042, "grad_norm": 1.1389960010587739, "learning_rate": 5.5288044039091335e-06, "loss": 0.1075, "step": 3904 }, { "epoch": 1.5599041533546325, "grad_norm": 1.2172334905611906, "learning_rate": 5.524182913852961e-06, "loss": 0.1017, "step": 3906 }, { "epoch": 1.560702875399361, "grad_norm": 1.0961769840109552, "learning_rate": 5.519560970952208e-06, "loss": 0.1074, "step": 3908 }, { "epoch": 1.5615015974440896, "grad_norm": 0.9984956732002762, "learning_rate": 5.514938579199798e-06, "loss": 0.1151, "step": 3910 }, { "epoch": 1.5623003194888179, "grad_norm": 1.0275799079683399, "learning_rate": 5.510315742589042e-06, "loss": 0.1024, "step": 3912 }, { "epoch": 1.5630990415335462, "grad_norm": 1.11512381478294, "learning_rate": 5.505692465113633e-06, "loss": 0.1132, "step": 3914 }, { "epoch": 1.5638977635782747, "grad_norm": 1.0178232850189146, "learning_rate": 5.5010687507676466e-06, "loss": 0.1075, "step": 3916 }, { "epoch": 1.5646964856230032, "grad_norm": 1.0142014809665243, "learning_rate": 5.496444603545535e-06, "loss": 0.0962, "step": 3918 }, { "epoch": 1.5654952076677318, "grad_norm": 1.046870424333617, "learning_rate": 5.491820027442126e-06, "loss": 0.1043, "step": 3920 }, { "epoch": 1.56629392971246, "grad_norm": 1.1510480151229603, "learning_rate": 5.487195026452619e-06, "loss": 0.102, "step": 3922 }, { "epoch": 1.5670926517571884, "grad_norm": 1.27959847253193, "learning_rate": 5.482569604572577e-06, "loss": 0.1021, "step": 3924 }, { "epoch": 1.567891373801917, "grad_norm": 0.9918976439707066, "learning_rate": 5.477943765797926e-06, "loss": 0.1, "step": 3926 }, { "epoch": 1.5686900958466454, "grad_norm": 1.080408951439636, "learning_rate": 5.473317514124958e-06, "loss": 0.1054, "step": 3928 }, { "epoch": 1.569488817891374, "grad_norm": 1.081550022627091, "learning_rate": 5.4686908535503135e-06, "loss": 0.1019, "step": 3930 }, { "epoch": 1.5702875399361023, "grad_norm": 1.0811985668125383, "learning_rate": 5.464063788070996e-06, "loss": 0.0936, "step": 3932 }, { "epoch": 1.5710862619808306, "grad_norm": 0.9821534674944753, "learning_rate": 5.459436321684348e-06, "loss": 0.0933, "step": 3934 }, { "epoch": 1.571884984025559, "grad_norm": 1.0914512560209013, "learning_rate": 5.454808458388069e-06, "loss": 0.1148, "step": 3936 }, { "epoch": 1.5726837060702876, "grad_norm": 1.164283845889731, "learning_rate": 5.4501802021801935e-06, "loss": 0.1018, "step": 3938 }, { "epoch": 1.573482428115016, "grad_norm": 0.970687102295011, "learning_rate": 5.445551557059098e-06, "loss": 0.0936, "step": 3940 }, { "epoch": 1.5742811501597445, "grad_norm": 1.1042280103745048, "learning_rate": 5.440922527023494e-06, "loss": 0.093, "step": 3942 }, { "epoch": 1.5750798722044728, "grad_norm": 1.2131440637336284, "learning_rate": 5.436293116072431e-06, "loss": 0.1136, "step": 3944 }, { "epoch": 1.5758785942492013, "grad_norm": 1.055324845043691, "learning_rate": 5.431663328205279e-06, "loss": 0.1043, "step": 3946 }, { "epoch": 1.5766773162939298, "grad_norm": 0.9811618715390004, "learning_rate": 5.42703316742174e-06, "loss": 0.093, "step": 3948 }, { "epoch": 1.5774760383386581, "grad_norm": 1.2173508502121098, "learning_rate": 5.4224026377218365e-06, "loss": 0.1045, "step": 3950 }, { "epoch": 1.5782747603833864, "grad_norm": 1.1661185719259703, "learning_rate": 5.417771743105908e-06, "loss": 0.1098, "step": 3952 }, { "epoch": 1.579073482428115, "grad_norm": 1.0058471682242138, "learning_rate": 5.413140487574608e-06, "loss": 0.1106, "step": 3954 }, { "epoch": 1.5798722044728435, "grad_norm": 1.0394406333548927, "learning_rate": 5.408508875128911e-06, "loss": 0.0929, "step": 3956 }, { "epoch": 1.580670926517572, "grad_norm": 1.1297610914562695, "learning_rate": 5.403876909770087e-06, "loss": 0.101, "step": 3958 }, { "epoch": 1.5814696485623003, "grad_norm": 1.1543975422216644, "learning_rate": 5.399244595499721e-06, "loss": 0.1121, "step": 3960 }, { "epoch": 1.5822683706070286, "grad_norm": 1.1306410767900936, "learning_rate": 5.394611936319692e-06, "loss": 0.1016, "step": 3962 }, { "epoch": 1.5830670926517572, "grad_norm": 0.9448150345192816, "learning_rate": 5.389978936232185e-06, "loss": 0.0893, "step": 3964 }, { "epoch": 1.5838658146964857, "grad_norm": 0.9463251751033753, "learning_rate": 5.385345599239669e-06, "loss": 0.0946, "step": 3966 }, { "epoch": 1.5846645367412142, "grad_norm": 1.0596847272661667, "learning_rate": 5.380711929344915e-06, "loss": 0.1151, "step": 3968 }, { "epoch": 1.5854632587859425, "grad_norm": 1.0752597682928209, "learning_rate": 5.376077930550973e-06, "loss": 0.107, "step": 3970 }, { "epoch": 1.5862619808306708, "grad_norm": 1.116596755502733, "learning_rate": 5.371443606861186e-06, "loss": 0.1019, "step": 3972 }, { "epoch": 1.5870607028753994, "grad_norm": 1.0051885963957903, "learning_rate": 5.366808962279166e-06, "loss": 0.0962, "step": 3974 }, { "epoch": 1.5878594249201279, "grad_norm": 1.027781803302719, "learning_rate": 5.362174000808813e-06, "loss": 0.0971, "step": 3976 }, { "epoch": 1.5886581469648562, "grad_norm": 0.8827091346960793, "learning_rate": 5.3575387264542934e-06, "loss": 0.0855, "step": 3978 }, { "epoch": 1.5894568690095847, "grad_norm": 1.072987187940347, "learning_rate": 5.352903143220051e-06, "loss": 0.0967, "step": 3980 }, { "epoch": 1.590255591054313, "grad_norm": 1.0526867412563083, "learning_rate": 5.348267255110787e-06, "loss": 0.092, "step": 3982 }, { "epoch": 1.5910543130990416, "grad_norm": 1.1489504347454111, "learning_rate": 5.343631066131476e-06, "loss": 0.1105, "step": 3984 }, { "epoch": 1.59185303514377, "grad_norm": 1.1540840295187986, "learning_rate": 5.338994580287345e-06, "loss": 0.1011, "step": 3986 }, { "epoch": 1.5926517571884984, "grad_norm": 1.1394245571406438, "learning_rate": 5.334357801583882e-06, "loss": 0.0996, "step": 3988 }, { "epoch": 1.5934504792332267, "grad_norm": 1.0287583891620165, "learning_rate": 5.329720734026824e-06, "loss": 0.0957, "step": 3990 }, { "epoch": 1.5942492012779552, "grad_norm": 1.0779004754661263, "learning_rate": 5.325083381622165e-06, "loss": 0.0979, "step": 3992 }, { "epoch": 1.5950479233226837, "grad_norm": 1.200252910830259, "learning_rate": 5.320445748376133e-06, "loss": 0.0968, "step": 3994 }, { "epoch": 1.5958466453674123, "grad_norm": 1.2695805654962276, "learning_rate": 5.3158078382952095e-06, "loss": 0.1137, "step": 3996 }, { "epoch": 1.5966453674121406, "grad_norm": 1.0386371478246852, "learning_rate": 5.311169655386112e-06, "loss": 0.0981, "step": 3998 }, { "epoch": 1.5974440894568689, "grad_norm": 1.1857255254365702, "learning_rate": 5.30653120365579e-06, "loss": 0.1205, "step": 4000 }, { "epoch": 1.5974440894568689, "eval_loss": 0.15445688366889954, "eval_runtime": 417.6616, "eval_samples_per_second": 42.635, "eval_steps_per_second": 5.33, "step": 4000 }, { "epoch": 1.5982428115015974, "grad_norm": 1.1097474658690198, "learning_rate": 5.301892487111431e-06, "loss": 0.1025, "step": 4002 }, { "epoch": 1.599041533546326, "grad_norm": 1.1919685218314882, "learning_rate": 5.2972535097604474e-06, "loss": 0.112, "step": 4004 }, { "epoch": 1.5998402555910545, "grad_norm": 0.9947826284264967, "learning_rate": 5.292614275610476e-06, "loss": 0.1021, "step": 4006 }, { "epoch": 1.6006389776357828, "grad_norm": 0.9427973792484328, "learning_rate": 5.28797478866938e-06, "loss": 0.091, "step": 4008 }, { "epoch": 1.601437699680511, "grad_norm": 1.1411139408840176, "learning_rate": 5.283335052945238e-06, "loss": 0.0975, "step": 4010 }, { "epoch": 1.6022364217252396, "grad_norm": 1.1012457970159555, "learning_rate": 5.278695072446342e-06, "loss": 0.0951, "step": 4012 }, { "epoch": 1.6030351437699681, "grad_norm": 1.1626019144814381, "learning_rate": 5.2740548511812e-06, "loss": 0.0998, "step": 4014 }, { "epoch": 1.6038338658146964, "grad_norm": 1.1201103649096615, "learning_rate": 5.269414393158523e-06, "loss": 0.0987, "step": 4016 }, { "epoch": 1.604632587859425, "grad_norm": 1.1630697149501579, "learning_rate": 5.264773702387232e-06, "loss": 0.1022, "step": 4018 }, { "epoch": 1.6054313099041533, "grad_norm": 1.2522557555471332, "learning_rate": 5.2601327828764415e-06, "loss": 0.1042, "step": 4020 }, { "epoch": 1.6062300319488818, "grad_norm": 1.1002664728813976, "learning_rate": 5.255491638635472e-06, "loss": 0.1131, "step": 4022 }, { "epoch": 1.6070287539936103, "grad_norm": 1.1476460289094692, "learning_rate": 5.250850273673831e-06, "loss": 0.1041, "step": 4024 }, { "epoch": 1.6078274760383386, "grad_norm": 1.2997227467476715, "learning_rate": 5.246208692001224e-06, "loss": 0.109, "step": 4026 }, { "epoch": 1.608626198083067, "grad_norm": 1.094935343872026, "learning_rate": 5.241566897627536e-06, "loss": 0.1007, "step": 4028 }, { "epoch": 1.6094249201277955, "grad_norm": 1.083296893505705, "learning_rate": 5.236924894562841e-06, "loss": 0.102, "step": 4030 }, { "epoch": 1.610223642172524, "grad_norm": 1.0723795202774216, "learning_rate": 5.232282686817392e-06, "loss": 0.0996, "step": 4032 }, { "epoch": 1.6110223642172525, "grad_norm": 1.078393017722415, "learning_rate": 5.227640278401616e-06, "loss": 0.1047, "step": 4034 }, { "epoch": 1.6118210862619808, "grad_norm": 1.1241219264829283, "learning_rate": 5.222997673326118e-06, "loss": 0.1125, "step": 4036 }, { "epoch": 1.6126198083067091, "grad_norm": 1.1494086764740628, "learning_rate": 5.218354875601672e-06, "loss": 0.1077, "step": 4038 }, { "epoch": 1.6134185303514377, "grad_norm": 1.0313683486241818, "learning_rate": 5.213711889239214e-06, "loss": 0.1056, "step": 4040 }, { "epoch": 1.6142172523961662, "grad_norm": 1.0785913080866496, "learning_rate": 5.209068718249849e-06, "loss": 0.1026, "step": 4042 }, { "epoch": 1.6150159744408947, "grad_norm": 1.1093921293966087, "learning_rate": 5.2044253666448364e-06, "loss": 0.1019, "step": 4044 }, { "epoch": 1.615814696485623, "grad_norm": 1.048516005384363, "learning_rate": 5.1997818384355945e-06, "loss": 0.1198, "step": 4046 }, { "epoch": 1.6166134185303513, "grad_norm": 1.0667431272462546, "learning_rate": 5.195138137633695e-06, "loss": 0.1, "step": 4048 }, { "epoch": 1.6174121405750799, "grad_norm": 1.162801685632335, "learning_rate": 5.190494268250856e-06, "loss": 0.108, "step": 4050 }, { "epoch": 1.6182108626198084, "grad_norm": 1.0168007347140544, "learning_rate": 5.185850234298943e-06, "loss": 0.0927, "step": 4052 }, { "epoch": 1.6190095846645367, "grad_norm": 1.0037984889528644, "learning_rate": 5.1812060397899624e-06, "loss": 0.0876, "step": 4054 }, { "epoch": 1.619808306709265, "grad_norm": 1.0382499080256122, "learning_rate": 5.17656168873606e-06, "loss": 0.107, "step": 4056 }, { "epoch": 1.6206070287539935, "grad_norm": 1.1185835251899559, "learning_rate": 5.171917185149518e-06, "loss": 0.1044, "step": 4058 }, { "epoch": 1.621405750798722, "grad_norm": 1.0657788685206113, "learning_rate": 5.167272533042748e-06, "loss": 0.097, "step": 4060 }, { "epoch": 1.6222044728434506, "grad_norm": 1.0542383484498254, "learning_rate": 5.162627736428293e-06, "loss": 0.0946, "step": 4062 }, { "epoch": 1.623003194888179, "grad_norm": 1.1196684728128978, "learning_rate": 5.157982799318817e-06, "loss": 0.1071, "step": 4064 }, { "epoch": 1.6238019169329072, "grad_norm": 1.0973928801710329, "learning_rate": 5.153337725727109e-06, "loss": 0.0965, "step": 4066 }, { "epoch": 1.6246006389776357, "grad_norm": 1.085899130068226, "learning_rate": 5.148692519666072e-06, "loss": 0.087, "step": 4068 }, { "epoch": 1.6253993610223643, "grad_norm": 1.0337553281158798, "learning_rate": 5.1440471851487286e-06, "loss": 0.1031, "step": 4070 }, { "epoch": 1.6261980830670928, "grad_norm": 1.0455524042989242, "learning_rate": 5.139401726188208e-06, "loss": 0.1012, "step": 4072 }, { "epoch": 1.626996805111821, "grad_norm": 1.1232283088505088, "learning_rate": 5.1347561467977495e-06, "loss": 0.1048, "step": 4074 }, { "epoch": 1.6277955271565494, "grad_norm": 1.1377273856980297, "learning_rate": 5.130110450990694e-06, "loss": 0.1015, "step": 4076 }, { "epoch": 1.628594249201278, "grad_norm": 1.0376776381456139, "learning_rate": 5.1254646427804855e-06, "loss": 0.0892, "step": 4078 }, { "epoch": 1.6293929712460065, "grad_norm": 1.0639780774252217, "learning_rate": 5.120818726180662e-06, "loss": 0.0909, "step": 4080 }, { "epoch": 1.630191693290735, "grad_norm": 1.0527532913896227, "learning_rate": 5.116172705204859e-06, "loss": 0.092, "step": 4082 }, { "epoch": 1.6309904153354633, "grad_norm": 1.129982374301854, "learning_rate": 5.111526583866801e-06, "loss": 0.1016, "step": 4084 }, { "epoch": 1.6317891373801916, "grad_norm": 1.0125124329212987, "learning_rate": 5.106880366180297e-06, "loss": 0.0933, "step": 4086 }, { "epoch": 1.6325878594249201, "grad_norm": 1.0905837841114607, "learning_rate": 5.1022340561592396e-06, "loss": 0.1028, "step": 4088 }, { "epoch": 1.6333865814696487, "grad_norm": 1.058617488709683, "learning_rate": 5.097587657817605e-06, "loss": 0.0988, "step": 4090 }, { "epoch": 1.634185303514377, "grad_norm": 1.1990334757384018, "learning_rate": 5.09294117516944e-06, "loss": 0.1045, "step": 4092 }, { "epoch": 1.6349840255591053, "grad_norm": 1.1978345768548917, "learning_rate": 5.08829461222887e-06, "loss": 0.1134, "step": 4094 }, { "epoch": 1.6357827476038338, "grad_norm": 1.082413217695199, "learning_rate": 5.083647973010085e-06, "loss": 0.1002, "step": 4096 }, { "epoch": 1.6365814696485623, "grad_norm": 1.1386432805954145, "learning_rate": 5.079001261527345e-06, "loss": 0.1029, "step": 4098 }, { "epoch": 1.6373801916932909, "grad_norm": 1.1557604620641984, "learning_rate": 5.074354481794969e-06, "loss": 0.0951, "step": 4100 }, { "epoch": 1.6381789137380192, "grad_norm": 1.2243404366242359, "learning_rate": 5.069707637827336e-06, "loss": 0.1121, "step": 4102 }, { "epoch": 1.6389776357827475, "grad_norm": 1.157977041885688, "learning_rate": 5.065060733638878e-06, "loss": 0.1104, "step": 4104 }, { "epoch": 1.639776357827476, "grad_norm": 1.094920449521866, "learning_rate": 5.0604137732440875e-06, "loss": 0.0898, "step": 4106 }, { "epoch": 1.6405750798722045, "grad_norm": 0.9795046872308669, "learning_rate": 5.055766760657497e-06, "loss": 0.0933, "step": 4108 }, { "epoch": 1.641373801916933, "grad_norm": 1.2246816953116304, "learning_rate": 5.051119699893686e-06, "loss": 0.1047, "step": 4110 }, { "epoch": 1.6421725239616614, "grad_norm": 0.9474742886147536, "learning_rate": 5.046472594967279e-06, "loss": 0.0825, "step": 4112 }, { "epoch": 1.6429712460063897, "grad_norm": 1.050959142356909, "learning_rate": 5.041825449892933e-06, "loss": 0.102, "step": 4114 }, { "epoch": 1.6437699680511182, "grad_norm": 1.0798115551123924, "learning_rate": 5.037178268685345e-06, "loss": 0.0943, "step": 4116 }, { "epoch": 1.6445686900958467, "grad_norm": 0.9443315270439328, "learning_rate": 5.032531055359241e-06, "loss": 0.094, "step": 4118 }, { "epoch": 1.645367412140575, "grad_norm": 1.0292007756942556, "learning_rate": 5.027883813929374e-06, "loss": 0.0981, "step": 4120 }, { "epoch": 1.6461661341853036, "grad_norm": 1.036373592711972, "learning_rate": 5.0232365484105235e-06, "loss": 0.0955, "step": 4122 }, { "epoch": 1.6469648562300319, "grad_norm": 1.0940288914430132, "learning_rate": 5.018589262817488e-06, "loss": 0.106, "step": 4124 }, { "epoch": 1.6477635782747604, "grad_norm": 1.1951722585755589, "learning_rate": 5.013941961165082e-06, "loss": 0.1127, "step": 4126 }, { "epoch": 1.648562300319489, "grad_norm": 1.0463562631972685, "learning_rate": 5.009294647468137e-06, "loss": 0.0902, "step": 4128 }, { "epoch": 1.6493610223642172, "grad_norm": 1.1257302379959406, "learning_rate": 5.004647325741495e-06, "loss": 0.0911, "step": 4130 }, { "epoch": 1.6501597444089455, "grad_norm": 0.9285397218586545, "learning_rate": 5e-06, "loss": 0.0821, "step": 4132 }, { "epoch": 1.650958466453674, "grad_norm": 1.13221857976679, "learning_rate": 4.9953526742585065e-06, "loss": 0.093, "step": 4134 }, { "epoch": 1.6517571884984026, "grad_norm": 1.119100390728916, "learning_rate": 4.990705352531864e-06, "loss": 0.096, "step": 4136 }, { "epoch": 1.6525559105431311, "grad_norm": 1.0890152144340473, "learning_rate": 4.9860580388349196e-06, "loss": 0.0968, "step": 4138 }, { "epoch": 1.6533546325878594, "grad_norm": 1.0415531324033633, "learning_rate": 4.981410737182515e-06, "loss": 0.1018, "step": 4140 }, { "epoch": 1.6541533546325877, "grad_norm": 1.1279757348197161, "learning_rate": 4.976763451589478e-06, "loss": 0.0989, "step": 4142 }, { "epoch": 1.6549520766773163, "grad_norm": 1.1174158609629532, "learning_rate": 4.972116186070626e-06, "loss": 0.0981, "step": 4144 }, { "epoch": 1.6557507987220448, "grad_norm": 0.978788514984628, "learning_rate": 4.96746894464076e-06, "loss": 0.086, "step": 4146 }, { "epoch": 1.6565495207667733, "grad_norm": 1.0391532776923562, "learning_rate": 4.962821731314656e-06, "loss": 0.0893, "step": 4148 }, { "epoch": 1.6573482428115016, "grad_norm": 1.1408365847624822, "learning_rate": 4.958174550107069e-06, "loss": 0.0926, "step": 4150 }, { "epoch": 1.65814696485623, "grad_norm": 1.0976480883499857, "learning_rate": 4.953527405032723e-06, "loss": 0.0969, "step": 4152 }, { "epoch": 1.6589456869009584, "grad_norm": 1.099197170300517, "learning_rate": 4.948880300106315e-06, "loss": 0.0889, "step": 4154 }, { "epoch": 1.659744408945687, "grad_norm": 0.9962207801035776, "learning_rate": 4.944233239342505e-06, "loss": 0.0828, "step": 4156 }, { "epoch": 1.6605431309904153, "grad_norm": 1.164367259050149, "learning_rate": 4.939586226755913e-06, "loss": 0.0993, "step": 4158 }, { "epoch": 1.6613418530351438, "grad_norm": 1.0915643883560848, "learning_rate": 4.934939266361123e-06, "loss": 0.0958, "step": 4160 }, { "epoch": 1.6621405750798721, "grad_norm": 1.0384971354394401, "learning_rate": 4.930292362172667e-06, "loss": 0.0987, "step": 4162 }, { "epoch": 1.6629392971246006, "grad_norm": 0.9714525373847863, "learning_rate": 4.9256455182050345e-06, "loss": 0.0985, "step": 4164 }, { "epoch": 1.6637380191693292, "grad_norm": 1.0774781447464505, "learning_rate": 4.920998738472657e-06, "loss": 0.0956, "step": 4166 }, { "epoch": 1.6645367412140575, "grad_norm": 1.1101370347081296, "learning_rate": 4.916352026989914e-06, "loss": 0.1085, "step": 4168 }, { "epoch": 1.6653354632587858, "grad_norm": 1.1576145184604405, "learning_rate": 4.911705387771131e-06, "loss": 0.0998, "step": 4170 }, { "epoch": 1.6661341853035143, "grad_norm": 1.0183758786516857, "learning_rate": 4.90705882483056e-06, "loss": 0.0856, "step": 4172 }, { "epoch": 1.6669329073482428, "grad_norm": 1.110373708674366, "learning_rate": 4.902412342182396e-06, "loss": 0.0922, "step": 4174 }, { "epoch": 1.6677316293929714, "grad_norm": 1.0395390430208176, "learning_rate": 4.897765943840761e-06, "loss": 0.0986, "step": 4176 }, { "epoch": 1.6685303514376997, "grad_norm": 1.0700900909956106, "learning_rate": 4.8931196338197045e-06, "loss": 0.1094, "step": 4178 }, { "epoch": 1.669329073482428, "grad_norm": 1.0744452627922665, "learning_rate": 4.888473416133201e-06, "loss": 0.0945, "step": 4180 }, { "epoch": 1.6701277955271565, "grad_norm": 1.1251668177944376, "learning_rate": 4.883827294795142e-06, "loss": 0.0973, "step": 4182 }, { "epoch": 1.670926517571885, "grad_norm": 0.9568523532004309, "learning_rate": 4.87918127381934e-06, "loss": 0.0924, "step": 4184 }, { "epoch": 1.6717252396166136, "grad_norm": 1.0605161422920366, "learning_rate": 4.874535357219517e-06, "loss": 0.0969, "step": 4186 }, { "epoch": 1.6725239616613419, "grad_norm": 1.0949585639802137, "learning_rate": 4.869889549009309e-06, "loss": 0.1023, "step": 4188 }, { "epoch": 1.6733226837060702, "grad_norm": 1.0200170497583934, "learning_rate": 4.8652438532022505e-06, "loss": 0.0968, "step": 4190 }, { "epoch": 1.6741214057507987, "grad_norm": 1.1240991571874712, "learning_rate": 4.860598273811793e-06, "loss": 0.1056, "step": 4192 }, { "epoch": 1.6749201277955272, "grad_norm": 1.1355929086530765, "learning_rate": 4.855952814851272e-06, "loss": 0.096, "step": 4194 }, { "epoch": 1.6757188498402555, "grad_norm": 0.9333875889896245, "learning_rate": 4.851307480333929e-06, "loss": 0.0939, "step": 4196 }, { "epoch": 1.676517571884984, "grad_norm": 1.093435156403032, "learning_rate": 4.846662274272893e-06, "loss": 0.0945, "step": 4198 }, { "epoch": 1.6773162939297124, "grad_norm": 1.0883095662785185, "learning_rate": 4.842017200681185e-06, "loss": 0.0916, "step": 4200 }, { "epoch": 1.678115015974441, "grad_norm": 1.0711389939013298, "learning_rate": 4.8373722635717095e-06, "loss": 0.1045, "step": 4202 }, { "epoch": 1.6789137380191694, "grad_norm": 0.9565453405933899, "learning_rate": 4.832727466957254e-06, "loss": 0.084, "step": 4204 }, { "epoch": 1.6797124600638977, "grad_norm": 0.9827521143045396, "learning_rate": 4.828082814850484e-06, "loss": 0.0895, "step": 4206 }, { "epoch": 1.680511182108626, "grad_norm": 1.0715525135749726, "learning_rate": 4.823438311263943e-06, "loss": 0.0879, "step": 4208 }, { "epoch": 1.6813099041533546, "grad_norm": 1.1041262630985615, "learning_rate": 4.81879396021004e-06, "loss": 0.1067, "step": 4210 }, { "epoch": 1.682108626198083, "grad_norm": 1.0586655240128, "learning_rate": 4.814149765701059e-06, "loss": 0.1022, "step": 4212 }, { "epoch": 1.6829073482428116, "grad_norm": 1.186055020520042, "learning_rate": 4.809505731749144e-06, "loss": 0.1139, "step": 4214 }, { "epoch": 1.68370607028754, "grad_norm": 1.1122466690968793, "learning_rate": 4.804861862366306e-06, "loss": 0.1051, "step": 4216 }, { "epoch": 1.6845047923322682, "grad_norm": 1.0754569180945435, "learning_rate": 4.8002181615644055e-06, "loss": 0.0961, "step": 4218 }, { "epoch": 1.6853035143769968, "grad_norm": 1.0878287200397418, "learning_rate": 4.795574633355165e-06, "loss": 0.0885, "step": 4220 }, { "epoch": 1.6861022364217253, "grad_norm": 1.1364082104476718, "learning_rate": 4.790931281750152e-06, "loss": 0.1, "step": 4222 }, { "epoch": 1.6869009584664538, "grad_norm": 1.0119111534228, "learning_rate": 4.786288110760787e-06, "loss": 0.09, "step": 4224 }, { "epoch": 1.6876996805111821, "grad_norm": 1.072752741199254, "learning_rate": 4.78164512439833e-06, "loss": 0.1017, "step": 4226 }, { "epoch": 1.6884984025559104, "grad_norm": 1.27263253817292, "learning_rate": 4.777002326673884e-06, "loss": 0.1046, "step": 4228 }, { "epoch": 1.689297124600639, "grad_norm": 1.0255757488823853, "learning_rate": 4.772359721598386e-06, "loss": 0.1052, "step": 4230 }, { "epoch": 1.6900958466453675, "grad_norm": 1.1943745213053019, "learning_rate": 4.767717313182611e-06, "loss": 0.0992, "step": 4232 }, { "epoch": 1.6908945686900958, "grad_norm": 1.1636475166052027, "learning_rate": 4.763075105437161e-06, "loss": 0.0991, "step": 4234 }, { "epoch": 1.6916932907348243, "grad_norm": 1.0844729963688502, "learning_rate": 4.758433102372466e-06, "loss": 0.1026, "step": 4236 }, { "epoch": 1.6924920127795526, "grad_norm": 1.1114480349576357, "learning_rate": 4.753791307998776e-06, "loss": 0.093, "step": 4238 }, { "epoch": 1.6932907348242812, "grad_norm": 0.9636290087080969, "learning_rate": 4.74914972632617e-06, "loss": 0.0865, "step": 4240 }, { "epoch": 1.6940894568690097, "grad_norm": 1.0002038172553764, "learning_rate": 4.744508361364529e-06, "loss": 0.0907, "step": 4242 }, { "epoch": 1.694888178913738, "grad_norm": 1.0275565003608014, "learning_rate": 4.73986721712356e-06, "loss": 0.0944, "step": 4244 }, { "epoch": 1.6956869009584663, "grad_norm": 1.0458584389637045, "learning_rate": 4.73522629761277e-06, "loss": 0.0982, "step": 4246 }, { "epoch": 1.6964856230031948, "grad_norm": 1.0999887308140721, "learning_rate": 4.730585606841479e-06, "loss": 0.0994, "step": 4248 }, { "epoch": 1.6972843450479234, "grad_norm": 1.0805343621335457, "learning_rate": 4.725945148818801e-06, "loss": 0.0922, "step": 4250 }, { "epoch": 1.6980830670926519, "grad_norm": 1.1009086534316133, "learning_rate": 4.721304927553659e-06, "loss": 0.0978, "step": 4252 }, { "epoch": 1.6988817891373802, "grad_norm": 1.0686039798032159, "learning_rate": 4.716664947054764e-06, "loss": 0.0926, "step": 4254 }, { "epoch": 1.6996805111821085, "grad_norm": 1.2264200698539793, "learning_rate": 4.7120252113306216e-06, "loss": 0.0997, "step": 4256 }, { "epoch": 1.700479233226837, "grad_norm": 1.0512823258442452, "learning_rate": 4.707385724389526e-06, "loss": 0.1054, "step": 4258 }, { "epoch": 1.7012779552715656, "grad_norm": 1.2065999780404362, "learning_rate": 4.702746490239554e-06, "loss": 0.1031, "step": 4260 }, { "epoch": 1.702076677316294, "grad_norm": 1.190627424392971, "learning_rate": 4.69810751288857e-06, "loss": 0.0996, "step": 4262 }, { "epoch": 1.7028753993610224, "grad_norm": 1.136415340105126, "learning_rate": 4.693468796344211e-06, "loss": 0.1046, "step": 4264 }, { "epoch": 1.7036741214057507, "grad_norm": 1.056631284731312, "learning_rate": 4.6888303446138895e-06, "loss": 0.0916, "step": 4266 }, { "epoch": 1.7044728434504792, "grad_norm": 1.0261838244170962, "learning_rate": 4.684192161704792e-06, "loss": 0.0876, "step": 4268 }, { "epoch": 1.7052715654952078, "grad_norm": 0.9913120866257363, "learning_rate": 4.679554251623869e-06, "loss": 0.0975, "step": 4270 }, { "epoch": 1.706070287539936, "grad_norm": 0.9909781303643188, "learning_rate": 4.6749166183778375e-06, "loss": 0.0899, "step": 4272 }, { "epoch": 1.7068690095846646, "grad_norm": 1.0133453073875816, "learning_rate": 4.670279265973177e-06, "loss": 0.0944, "step": 4274 }, { "epoch": 1.707667731629393, "grad_norm": 1.084241919717283, "learning_rate": 4.665642198416119e-06, "loss": 0.1014, "step": 4276 }, { "epoch": 1.7084664536741214, "grad_norm": 1.1144409380935076, "learning_rate": 4.661005419712657e-06, "loss": 0.1041, "step": 4278 }, { "epoch": 1.70926517571885, "grad_norm": 1.0934629080398204, "learning_rate": 4.656368933868525e-06, "loss": 0.0912, "step": 4280 }, { "epoch": 1.7100638977635783, "grad_norm": 1.1463371146812387, "learning_rate": 4.651732744889215e-06, "loss": 0.1037, "step": 4282 }, { "epoch": 1.7108626198083066, "grad_norm": 1.1729562022224107, "learning_rate": 4.647096856779951e-06, "loss": 0.0965, "step": 4284 }, { "epoch": 1.711661341853035, "grad_norm": 1.1118256334824752, "learning_rate": 4.642461273545707e-06, "loss": 0.1034, "step": 4286 }, { "epoch": 1.7124600638977636, "grad_norm": 1.132712399437204, "learning_rate": 4.637825999191189e-06, "loss": 0.0982, "step": 4288 }, { "epoch": 1.7132587859424921, "grad_norm": 1.0848347886256517, "learning_rate": 4.633191037720835e-06, "loss": 0.1018, "step": 4290 }, { "epoch": 1.7140575079872205, "grad_norm": 1.0658858217343385, "learning_rate": 4.628556393138816e-06, "loss": 0.097, "step": 4292 }, { "epoch": 1.7148562300319488, "grad_norm": 1.0923087477574933, "learning_rate": 4.623922069449028e-06, "loss": 0.1016, "step": 4294 }, { "epoch": 1.7156549520766773, "grad_norm": 1.161421995581851, "learning_rate": 4.619288070655086e-06, "loss": 0.0925, "step": 4296 }, { "epoch": 1.7164536741214058, "grad_norm": 0.9470108447713622, "learning_rate": 4.614654400760333e-06, "loss": 0.0989, "step": 4298 }, { "epoch": 1.7172523961661343, "grad_norm": 1.0516808788040757, "learning_rate": 4.610021063767818e-06, "loss": 0.1055, "step": 4300 }, { "epoch": 1.7180511182108626, "grad_norm": 1.1453407398766986, "learning_rate": 4.60538806368031e-06, "loss": 0.0986, "step": 4302 }, { "epoch": 1.718849840255591, "grad_norm": 0.9890278332037488, "learning_rate": 4.600755404500281e-06, "loss": 0.0956, "step": 4304 }, { "epoch": 1.7196485623003195, "grad_norm": 1.0910031893484988, "learning_rate": 4.596123090229913e-06, "loss": 0.0959, "step": 4306 }, { "epoch": 1.720447284345048, "grad_norm": 0.954845783952346, "learning_rate": 4.59149112487109e-06, "loss": 0.0866, "step": 4308 }, { "epoch": 1.7212460063897763, "grad_norm": 1.0693628763645768, "learning_rate": 4.5868595124253915e-06, "loss": 0.0984, "step": 4310 }, { "epoch": 1.7220447284345048, "grad_norm": 1.0048364707538442, "learning_rate": 4.582228256894093e-06, "loss": 0.0939, "step": 4312 }, { "epoch": 1.7228434504792332, "grad_norm": 1.1603565075058264, "learning_rate": 4.577597362278165e-06, "loss": 0.1026, "step": 4314 }, { "epoch": 1.7236421725239617, "grad_norm": 1.1312773137686531, "learning_rate": 4.572966832578261e-06, "loss": 0.1076, "step": 4316 }, { "epoch": 1.7244408945686902, "grad_norm": 1.0962659717799779, "learning_rate": 4.568336671794722e-06, "loss": 0.0956, "step": 4318 }, { "epoch": 1.7252396166134185, "grad_norm": 1.0220568178816403, "learning_rate": 4.56370688392757e-06, "loss": 0.0904, "step": 4320 }, { "epoch": 1.7260383386581468, "grad_norm": 0.9834388109812503, "learning_rate": 4.5590774729765076e-06, "loss": 0.0939, "step": 4322 }, { "epoch": 1.7268370607028753, "grad_norm": 1.2111951372748109, "learning_rate": 4.554448442940905e-06, "loss": 0.0968, "step": 4324 }, { "epoch": 1.7276357827476039, "grad_norm": 1.1926580241309344, "learning_rate": 4.549819797819809e-06, "loss": 0.1017, "step": 4326 }, { "epoch": 1.7284345047923324, "grad_norm": 1.0572382689425925, "learning_rate": 4.545191541611933e-06, "loss": 0.0871, "step": 4328 }, { "epoch": 1.7292332268370607, "grad_norm": 1.0126727217605194, "learning_rate": 4.540563678315652e-06, "loss": 0.0895, "step": 4330 }, { "epoch": 1.730031948881789, "grad_norm": 1.0365109166295632, "learning_rate": 4.535936211929005e-06, "loss": 0.0971, "step": 4332 }, { "epoch": 1.7308306709265175, "grad_norm": 1.2600986522602922, "learning_rate": 4.5313091464496865e-06, "loss": 0.1067, "step": 4334 }, { "epoch": 1.731629392971246, "grad_norm": 1.1349358517047685, "learning_rate": 4.526682485875044e-06, "loss": 0.0954, "step": 4336 }, { "epoch": 1.7324281150159746, "grad_norm": 1.0404286721064964, "learning_rate": 4.5220562342020755e-06, "loss": 0.1014, "step": 4338 }, { "epoch": 1.733226837060703, "grad_norm": 1.2958712155282752, "learning_rate": 4.517430395427424e-06, "loss": 0.1121, "step": 4340 }, { "epoch": 1.7340255591054312, "grad_norm": 1.1652339074566052, "learning_rate": 4.512804973547383e-06, "loss": 0.0985, "step": 4342 }, { "epoch": 1.7348242811501597, "grad_norm": 1.204403964641989, "learning_rate": 4.508179972557875e-06, "loss": 0.1071, "step": 4344 }, { "epoch": 1.7356230031948883, "grad_norm": 1.0146957490804427, "learning_rate": 4.503555396454468e-06, "loss": 0.089, "step": 4346 }, { "epoch": 1.7364217252396166, "grad_norm": 1.038473290463769, "learning_rate": 4.498931249232357e-06, "loss": 0.1022, "step": 4348 }, { "epoch": 1.7372204472843449, "grad_norm": 1.0396114915923156, "learning_rate": 4.49430753488637e-06, "loss": 0.0958, "step": 4350 }, { "epoch": 1.7380191693290734, "grad_norm": 0.9976185951916329, "learning_rate": 4.489684257410959e-06, "loss": 0.0872, "step": 4352 }, { "epoch": 1.738817891373802, "grad_norm": 0.8940691455776141, "learning_rate": 4.485061420800202e-06, "loss": 0.0736, "step": 4354 }, { "epoch": 1.7396166134185305, "grad_norm": 1.0562465504348986, "learning_rate": 4.480439029047792e-06, "loss": 0.0928, "step": 4356 }, { "epoch": 1.7404153354632588, "grad_norm": 1.3220699498489616, "learning_rate": 4.47581708614704e-06, "loss": 0.1032, "step": 4358 }, { "epoch": 1.741214057507987, "grad_norm": 1.044321957199521, "learning_rate": 4.471195596090867e-06, "loss": 0.0893, "step": 4360 }, { "epoch": 1.7420127795527156, "grad_norm": 1.1907505200191142, "learning_rate": 4.466574562871807e-06, "loss": 0.0955, "step": 4362 }, { "epoch": 1.7428115015974441, "grad_norm": 1.2092564945972575, "learning_rate": 4.461953990481995e-06, "loss": 0.1037, "step": 4364 }, { "epoch": 1.7436102236421727, "grad_norm": 1.0977545829393272, "learning_rate": 4.45733388291317e-06, "loss": 0.0854, "step": 4366 }, { "epoch": 1.744408945686901, "grad_norm": 1.0941242092439605, "learning_rate": 4.452714244156667e-06, "loss": 0.1029, "step": 4368 }, { "epoch": 1.7452076677316293, "grad_norm": 1.1568260987190029, "learning_rate": 4.448095078203421e-06, "loss": 0.0954, "step": 4370 }, { "epoch": 1.7460063897763578, "grad_norm": 1.021261678557611, "learning_rate": 4.443476389043955e-06, "loss": 0.1055, "step": 4372 }, { "epoch": 1.7468051118210863, "grad_norm": 1.1554032258458062, "learning_rate": 4.438858180668379e-06, "loss": 0.0907, "step": 4374 }, { "epoch": 1.7476038338658149, "grad_norm": 0.9759468899388195, "learning_rate": 4.434240457066388e-06, "loss": 0.0893, "step": 4376 }, { "epoch": 1.7484025559105432, "grad_norm": 1.1342860014921987, "learning_rate": 4.429623222227265e-06, "loss": 0.0965, "step": 4378 }, { "epoch": 1.7492012779552715, "grad_norm": 1.0507089455768055, "learning_rate": 4.425006480139861e-06, "loss": 0.0925, "step": 4380 }, { "epoch": 1.75, "grad_norm": 1.044726162269031, "learning_rate": 4.420390234792608e-06, "loss": 0.1003, "step": 4382 }, { "epoch": 1.7507987220447285, "grad_norm": 1.1039342031586883, "learning_rate": 4.415774490173504e-06, "loss": 0.0893, "step": 4384 }, { "epoch": 1.7515974440894568, "grad_norm": 1.075578365679244, "learning_rate": 4.411159250270119e-06, "loss": 0.0948, "step": 4386 }, { "epoch": 1.7523961661341851, "grad_norm": 1.0755507149516828, "learning_rate": 4.406544519069582e-06, "loss": 0.0943, "step": 4388 }, { "epoch": 1.7531948881789137, "grad_norm": 1.2422536273434488, "learning_rate": 4.401930300558588e-06, "loss": 0.1042, "step": 4390 }, { "epoch": 1.7539936102236422, "grad_norm": 1.125409271455376, "learning_rate": 4.397316598723385e-06, "loss": 0.1008, "step": 4392 }, { "epoch": 1.7547923322683707, "grad_norm": 0.9912325306555418, "learning_rate": 4.392703417549777e-06, "loss": 0.0893, "step": 4394 }, { "epoch": 1.755591054313099, "grad_norm": 1.1298243639433942, "learning_rate": 4.388090761023118e-06, "loss": 0.0968, "step": 4396 }, { "epoch": 1.7563897763578273, "grad_norm": 1.03503405875718, "learning_rate": 4.3834786331283055e-06, "loss": 0.0925, "step": 4398 }, { "epoch": 1.7571884984025559, "grad_norm": 1.1423391405624483, "learning_rate": 4.3788670378497836e-06, "loss": 0.0989, "step": 4400 }, { "epoch": 1.7579872204472844, "grad_norm": 1.0654118903753726, "learning_rate": 4.374255979171538e-06, "loss": 0.0993, "step": 4402 }, { "epoch": 1.758785942492013, "grad_norm": 1.1141658088401454, "learning_rate": 4.369645461077085e-06, "loss": 0.1005, "step": 4404 }, { "epoch": 1.7595846645367412, "grad_norm": 1.0554546089720755, "learning_rate": 4.365035487549481e-06, "loss": 0.0894, "step": 4406 }, { "epoch": 1.7603833865814695, "grad_norm": 1.0768929412416388, "learning_rate": 4.360426062571303e-06, "loss": 0.0984, "step": 4408 }, { "epoch": 1.761182108626198, "grad_norm": 1.0579464025983196, "learning_rate": 4.3558171901246635e-06, "loss": 0.0989, "step": 4410 }, { "epoch": 1.7619808306709266, "grad_norm": 1.0630018800627314, "learning_rate": 4.351208874191192e-06, "loss": 0.1076, "step": 4412 }, { "epoch": 1.7627795527156551, "grad_norm": 1.0546218890926364, "learning_rate": 4.346601118752039e-06, "loss": 0.0952, "step": 4414 }, { "epoch": 1.7635782747603834, "grad_norm": 0.9956147311105759, "learning_rate": 4.341993927787871e-06, "loss": 0.0947, "step": 4416 }, { "epoch": 1.7643769968051117, "grad_norm": 1.0747196506285701, "learning_rate": 4.337387305278864e-06, "loss": 0.0928, "step": 4418 }, { "epoch": 1.7651757188498403, "grad_norm": 1.121082140094148, "learning_rate": 4.332781255204708e-06, "loss": 0.0919, "step": 4420 }, { "epoch": 1.7659744408945688, "grad_norm": 1.1615527137887067, "learning_rate": 4.328175781544593e-06, "loss": 0.108, "step": 4422 }, { "epoch": 1.766773162939297, "grad_norm": 0.9564313983946037, "learning_rate": 4.323570888277215e-06, "loss": 0.0932, "step": 4424 }, { "epoch": 1.7675718849840254, "grad_norm": 1.0463601569604801, "learning_rate": 4.318966579380768e-06, "loss": 0.0895, "step": 4426 }, { "epoch": 1.768370607028754, "grad_norm": 1.163800342765404, "learning_rate": 4.31436285883294e-06, "loss": 0.1033, "step": 4428 }, { "epoch": 1.7691693290734825, "grad_norm": 1.039700123732381, "learning_rate": 4.3097597306109115e-06, "loss": 0.0958, "step": 4430 }, { "epoch": 1.769968051118211, "grad_norm": 1.1932778000690207, "learning_rate": 4.305157198691351e-06, "loss": 0.0972, "step": 4432 }, { "epoch": 1.7707667731629393, "grad_norm": 1.0406189324731365, "learning_rate": 4.30055526705041e-06, "loss": 0.0978, "step": 4434 }, { "epoch": 1.7715654952076676, "grad_norm": 1.1896973942703604, "learning_rate": 4.2959539396637265e-06, "loss": 0.1038, "step": 4436 }, { "epoch": 1.7723642172523961, "grad_norm": 1.1009904452623798, "learning_rate": 4.291353220506409e-06, "loss": 0.0933, "step": 4438 }, { "epoch": 1.7731629392971247, "grad_norm": 1.0740791345413632, "learning_rate": 4.286753113553049e-06, "loss": 0.0901, "step": 4440 }, { "epoch": 1.7739616613418532, "grad_norm": 1.0663374997120347, "learning_rate": 4.2821536227777016e-06, "loss": 0.0899, "step": 4442 }, { "epoch": 1.7747603833865815, "grad_norm": 0.9560869355538776, "learning_rate": 4.277554752153895e-06, "loss": 0.0862, "step": 4444 }, { "epoch": 1.7755591054313098, "grad_norm": 1.1931319093376327, "learning_rate": 4.272956505654616e-06, "loss": 0.1013, "step": 4446 }, { "epoch": 1.7763578274760383, "grad_norm": 1.1029308559048692, "learning_rate": 4.268358887252322e-06, "loss": 0.0971, "step": 4448 }, { "epoch": 1.7771565495207668, "grad_norm": 1.0214787393703604, "learning_rate": 4.263761900918916e-06, "loss": 0.0896, "step": 4450 }, { "epoch": 1.7779552715654952, "grad_norm": 1.1538006513738512, "learning_rate": 4.259165550625765e-06, "loss": 0.1008, "step": 4452 }, { "epoch": 1.7787539936102237, "grad_norm": 1.1069143300956876, "learning_rate": 4.254569840343677e-06, "loss": 0.0904, "step": 4454 }, { "epoch": 1.779552715654952, "grad_norm": 1.2455527249765532, "learning_rate": 4.249974774042915e-06, "loss": 0.109, "step": 4456 }, { "epoch": 1.7803514376996805, "grad_norm": 1.0281866104050703, "learning_rate": 4.245380355693183e-06, "loss": 0.0902, "step": 4458 }, { "epoch": 1.781150159744409, "grad_norm": 1.0141761099764164, "learning_rate": 4.240786589263623e-06, "loss": 0.0941, "step": 4460 }, { "epoch": 1.7819488817891374, "grad_norm": 1.0607373591565843, "learning_rate": 4.236193478722818e-06, "loss": 0.0991, "step": 4462 }, { "epoch": 1.7827476038338657, "grad_norm": 0.9907340835532562, "learning_rate": 4.231601028038781e-06, "loss": 0.0797, "step": 4464 }, { "epoch": 1.7835463258785942, "grad_norm": 1.112591753336717, "learning_rate": 4.2270092411789555e-06, "loss": 0.0888, "step": 4466 }, { "epoch": 1.7843450479233227, "grad_norm": 1.2313881450954993, "learning_rate": 4.222418122110212e-06, "loss": 0.1055, "step": 4468 }, { "epoch": 1.7851437699680512, "grad_norm": 1.0862948713868201, "learning_rate": 4.217827674798845e-06, "loss": 0.098, "step": 4470 }, { "epoch": 1.7859424920127795, "grad_norm": 1.060155034044239, "learning_rate": 4.2132379032105695e-06, "loss": 0.0996, "step": 4472 }, { "epoch": 1.7867412140575079, "grad_norm": 1.1021893535744396, "learning_rate": 4.208648811310513e-06, "loss": 0.1053, "step": 4474 }, { "epoch": 1.7875399361022364, "grad_norm": 1.188590840385766, "learning_rate": 4.204060403063218e-06, "loss": 0.1067, "step": 4476 }, { "epoch": 1.788338658146965, "grad_norm": 1.1601988700851704, "learning_rate": 4.199472682432637e-06, "loss": 0.1025, "step": 4478 }, { "epoch": 1.7891373801916934, "grad_norm": 1.2008156142165938, "learning_rate": 4.194885653382128e-06, "loss": 0.1031, "step": 4480 }, { "epoch": 1.7899361022364217, "grad_norm": 1.1590922201903628, "learning_rate": 4.190299319874449e-06, "loss": 0.0935, "step": 4482 }, { "epoch": 1.79073482428115, "grad_norm": 1.1080943430270318, "learning_rate": 4.185713685871763e-06, "loss": 0.1121, "step": 4484 }, { "epoch": 1.7915335463258786, "grad_norm": 1.0680082395002684, "learning_rate": 4.1811287553356214e-06, "loss": 0.0997, "step": 4486 }, { "epoch": 1.792332268370607, "grad_norm": 1.169219691370761, "learning_rate": 4.176544532226974e-06, "loss": 0.106, "step": 4488 }, { "epoch": 1.7931309904153354, "grad_norm": 1.0745747283017117, "learning_rate": 4.171961020506158e-06, "loss": 0.0981, "step": 4490 }, { "epoch": 1.793929712460064, "grad_norm": 1.3455597906264747, "learning_rate": 4.167378224132891e-06, "loss": 0.0982, "step": 4492 }, { "epoch": 1.7947284345047922, "grad_norm": 1.1972510456665748, "learning_rate": 4.162796147066279e-06, "loss": 0.0978, "step": 4494 }, { "epoch": 1.7955271565495208, "grad_norm": 1.1197171631008844, "learning_rate": 4.158214793264808e-06, "loss": 0.0837, "step": 4496 }, { "epoch": 1.7963258785942493, "grad_norm": 0.9880917577387588, "learning_rate": 4.15363416668633e-06, "loss": 0.0884, "step": 4498 }, { "epoch": 1.7971246006389776, "grad_norm": 1.080269566035693, "learning_rate": 4.149054271288076e-06, "loss": 0.102, "step": 4500 }, { "epoch": 1.7971246006389776, "eval_loss": 0.14962869882583618, "eval_runtime": 418.7656, "eval_samples_per_second": 42.523, "eval_steps_per_second": 5.316, "step": 4500 }, { "epoch": 1.797923322683706, "grad_norm": 1.067144176619292, "learning_rate": 4.144475111026643e-06, "loss": 0.104, "step": 4502 }, { "epoch": 1.7987220447284344, "grad_norm": 1.060342189299496, "learning_rate": 4.139896689857995e-06, "loss": 0.0939, "step": 4504 }, { "epoch": 1.799520766773163, "grad_norm": 1.059142250496446, "learning_rate": 4.1353190117374545e-06, "loss": 0.0987, "step": 4506 }, { "epoch": 1.8003194888178915, "grad_norm": 1.0930753497438312, "learning_rate": 4.130742080619704e-06, "loss": 0.0907, "step": 4508 }, { "epoch": 1.8011182108626198, "grad_norm": 1.0270924676421964, "learning_rate": 4.126165900458781e-06, "loss": 0.09, "step": 4510 }, { "epoch": 1.8019169329073481, "grad_norm": 1.1110244428747031, "learning_rate": 4.121590475208071e-06, "loss": 0.1013, "step": 4512 }, { "epoch": 1.8027156549520766, "grad_norm": 1.0236243613153262, "learning_rate": 4.11701580882031e-06, "loss": 0.0976, "step": 4514 }, { "epoch": 1.8035143769968052, "grad_norm": 1.06524813010489, "learning_rate": 4.1124419052475815e-06, "loss": 0.1, "step": 4516 }, { "epoch": 1.8043130990415337, "grad_norm": 1.1330875679173593, "learning_rate": 4.107868768441304e-06, "loss": 0.1166, "step": 4518 }, { "epoch": 1.805111821086262, "grad_norm": 1.4531489997318303, "learning_rate": 4.1032964023522366e-06, "loss": 0.0858, "step": 4520 }, { "epoch": 1.8059105431309903, "grad_norm": 1.1107724952996303, "learning_rate": 4.098724810930472e-06, "loss": 0.1035, "step": 4522 }, { "epoch": 1.8067092651757188, "grad_norm": 1.0926573298123923, "learning_rate": 4.0941539981254345e-06, "loss": 0.1026, "step": 4524 }, { "epoch": 1.8075079872204474, "grad_norm": 0.9721623108703124, "learning_rate": 4.089583967885874e-06, "loss": 0.1025, "step": 4526 }, { "epoch": 1.8083067092651757, "grad_norm": 1.0370754238710584, "learning_rate": 4.085014724159866e-06, "loss": 0.0942, "step": 4528 }, { "epoch": 1.8091054313099042, "grad_norm": 1.0940664318814297, "learning_rate": 4.0804462708948076e-06, "loss": 0.0988, "step": 4530 }, { "epoch": 1.8099041533546325, "grad_norm": 1.0244857172089716, "learning_rate": 4.075878612037408e-06, "loss": 0.089, "step": 4532 }, { "epoch": 1.810702875399361, "grad_norm": 1.0706540589086802, "learning_rate": 4.071311751533696e-06, "loss": 0.0996, "step": 4534 }, { "epoch": 1.8115015974440896, "grad_norm": 1.1352354117693362, "learning_rate": 4.066745693329008e-06, "loss": 0.0924, "step": 4536 }, { "epoch": 1.8123003194888179, "grad_norm": 1.0659040735499372, "learning_rate": 4.062180441367985e-06, "loss": 0.0939, "step": 4538 }, { "epoch": 1.8130990415335462, "grad_norm": 1.1129307554195655, "learning_rate": 4.057615999594578e-06, "loss": 0.1015, "step": 4540 }, { "epoch": 1.8138977635782747, "grad_norm": 1.0090716344766517, "learning_rate": 4.053052371952032e-06, "loss": 0.1119, "step": 4542 }, { "epoch": 1.8146964856230032, "grad_norm": 1.1454677517831415, "learning_rate": 4.0484895623828906e-06, "loss": 0.0984, "step": 4544 }, { "epoch": 1.8154952076677318, "grad_norm": 1.1153321341980642, "learning_rate": 4.04392757482899e-06, "loss": 0.0977, "step": 4546 }, { "epoch": 1.81629392971246, "grad_norm": 1.0654841000292614, "learning_rate": 4.039366413231458e-06, "loss": 0.0746, "step": 4548 }, { "epoch": 1.8170926517571884, "grad_norm": 1.0311164679367486, "learning_rate": 4.034806081530709e-06, "loss": 0.0932, "step": 4550 }, { "epoch": 1.817891373801917, "grad_norm": 1.044082427416449, "learning_rate": 4.030246583666437e-06, "loss": 0.0967, "step": 4552 }, { "epoch": 1.8186900958466454, "grad_norm": 1.1455338692396, "learning_rate": 4.0256879235776195e-06, "loss": 0.097, "step": 4554 }, { "epoch": 1.819488817891374, "grad_norm": 1.1193697932818554, "learning_rate": 4.0211301052025075e-06, "loss": 0.1049, "step": 4556 }, { "epoch": 1.8202875399361023, "grad_norm": 0.9888093223647104, "learning_rate": 4.016573132478628e-06, "loss": 0.0877, "step": 4558 }, { "epoch": 1.8210862619808306, "grad_norm": 0.9046844324915236, "learning_rate": 4.012017009342773e-06, "loss": 0.0953, "step": 4560 }, { "epoch": 1.821884984025559, "grad_norm": 0.8970891044543646, "learning_rate": 4.007461739731003e-06, "loss": 0.0782, "step": 4562 }, { "epoch": 1.8226837060702876, "grad_norm": 1.0504566189159226, "learning_rate": 4.002907327578644e-06, "loss": 0.096, "step": 4564 }, { "epoch": 1.823482428115016, "grad_norm": 1.0396603169543621, "learning_rate": 3.9983537768202775e-06, "loss": 0.0966, "step": 4566 }, { "epoch": 1.8242811501597445, "grad_norm": 1.0690123100147384, "learning_rate": 3.99380109138974e-06, "loss": 0.0781, "step": 4568 }, { "epoch": 1.8250798722044728, "grad_norm": 1.0129487553480776, "learning_rate": 3.989249275220124e-06, "loss": 0.0884, "step": 4570 }, { "epoch": 1.8258785942492013, "grad_norm": 1.2215632281871855, "learning_rate": 3.984698332243767e-06, "loss": 0.1092, "step": 4572 }, { "epoch": 1.8266773162939298, "grad_norm": 1.0991702997365314, "learning_rate": 3.980148266392257e-06, "loss": 0.0998, "step": 4574 }, { "epoch": 1.8274760383386581, "grad_norm": 1.0590928610197108, "learning_rate": 3.97559908159642e-06, "loss": 0.0884, "step": 4576 }, { "epoch": 1.8282747603833864, "grad_norm": 1.1608407892475257, "learning_rate": 3.971050781786323e-06, "loss": 0.0974, "step": 4578 }, { "epoch": 1.829073482428115, "grad_norm": 1.0916397944105056, "learning_rate": 3.966503370891266e-06, "loss": 0.0906, "step": 4580 }, { "epoch": 1.8298722044728435, "grad_norm": 1.0395162456922145, "learning_rate": 3.961956852839787e-06, "loss": 0.0923, "step": 4582 }, { "epoch": 1.830670926517572, "grad_norm": 1.068914817127223, "learning_rate": 3.9574112315596425e-06, "loss": 0.1025, "step": 4584 }, { "epoch": 1.8314696485623003, "grad_norm": 1.0785020160593977, "learning_rate": 3.952866510977827e-06, "loss": 0.0863, "step": 4586 }, { "epoch": 1.8322683706070286, "grad_norm": 1.1371133744493924, "learning_rate": 3.948322695020546e-06, "loss": 0.0911, "step": 4588 }, { "epoch": 1.8330670926517572, "grad_norm": 0.9829285073485866, "learning_rate": 3.943779787613231e-06, "loss": 0.0857, "step": 4590 }, { "epoch": 1.8338658146964857, "grad_norm": 1.5286458187672483, "learning_rate": 3.9392377926805226e-06, "loss": 0.0919, "step": 4592 }, { "epoch": 1.8346645367412142, "grad_norm": 1.0894667065959676, "learning_rate": 3.934696714146277e-06, "loss": 0.0965, "step": 4594 }, { "epoch": 1.8354632587859425, "grad_norm": 1.147015209954055, "learning_rate": 3.930156555933557e-06, "loss": 0.0959, "step": 4596 }, { "epoch": 1.8362619808306708, "grad_norm": 1.0015372010863606, "learning_rate": 3.925617321964632e-06, "loss": 0.0912, "step": 4598 }, { "epoch": 1.8370607028753994, "grad_norm": 1.0413881872588078, "learning_rate": 3.92107901616097e-06, "loss": 0.1001, "step": 4600 }, { "epoch": 1.8378594249201279, "grad_norm": 1.1162230686005519, "learning_rate": 3.916541642443242e-06, "loss": 0.0963, "step": 4602 }, { "epoch": 1.8386581469648562, "grad_norm": 1.0840497131856408, "learning_rate": 3.912005204731307e-06, "loss": 0.1011, "step": 4604 }, { "epoch": 1.8394568690095847, "grad_norm": 1.0114188708406382, "learning_rate": 3.907469706944222e-06, "loss": 0.0894, "step": 4606 }, { "epoch": 1.840255591054313, "grad_norm": 1.0694376480162955, "learning_rate": 3.9029351530002264e-06, "loss": 0.086, "step": 4608 }, { "epoch": 1.8410543130990416, "grad_norm": 1.3957274955440564, "learning_rate": 3.898401546816752e-06, "loss": 0.0927, "step": 4610 }, { "epoch": 1.84185303514377, "grad_norm": 0.9916548350535773, "learning_rate": 3.8938688923104015e-06, "loss": 0.0877, "step": 4612 }, { "epoch": 1.8426517571884984, "grad_norm": 1.0958198194521465, "learning_rate": 3.8893371933969644e-06, "loss": 0.0921, "step": 4614 }, { "epoch": 1.8434504792332267, "grad_norm": 0.9890953898234616, "learning_rate": 3.884806453991399e-06, "loss": 0.0898, "step": 4616 }, { "epoch": 1.8442492012779552, "grad_norm": 1.099760617918534, "learning_rate": 3.880276678007838e-06, "loss": 0.0967, "step": 4618 }, { "epoch": 1.8450479233226837, "grad_norm": 1.1341946755721188, "learning_rate": 3.875747869359578e-06, "loss": 0.0911, "step": 4620 }, { "epoch": 1.8458466453674123, "grad_norm": 1.0924073653891035, "learning_rate": 3.871220031959085e-06, "loss": 0.0991, "step": 4622 }, { "epoch": 1.8466453674121406, "grad_norm": 1.1994386671625121, "learning_rate": 3.866693169717982e-06, "loss": 0.1012, "step": 4624 }, { "epoch": 1.8474440894568689, "grad_norm": 1.1087200452695056, "learning_rate": 3.8621672865470505e-06, "loss": 0.0901, "step": 4626 }, { "epoch": 1.8482428115015974, "grad_norm": 1.1622238598835266, "learning_rate": 3.8576423863562285e-06, "loss": 0.0949, "step": 4628 }, { "epoch": 1.849041533546326, "grad_norm": 1.031896254949219, "learning_rate": 3.8531184730546e-06, "loss": 0.0986, "step": 4630 }, { "epoch": 1.8498402555910545, "grad_norm": 1.1267405061140343, "learning_rate": 3.848595550550401e-06, "loss": 0.0963, "step": 4632 }, { "epoch": 1.8506389776357828, "grad_norm": 1.091392917402568, "learning_rate": 3.84407362275101e-06, "loss": 0.0952, "step": 4634 }, { "epoch": 1.851437699680511, "grad_norm": 1.07201414860687, "learning_rate": 3.839552693562946e-06, "loss": 0.1002, "step": 4636 }, { "epoch": 1.8522364217252396, "grad_norm": 1.0286376655218412, "learning_rate": 3.835032766891865e-06, "loss": 0.0948, "step": 4638 }, { "epoch": 1.8530351437699681, "grad_norm": 1.0789695524482392, "learning_rate": 3.830513846642556e-06, "loss": 0.0967, "step": 4640 }, { "epoch": 1.8538338658146964, "grad_norm": 1.1844906017376726, "learning_rate": 3.825995936718942e-06, "loss": 0.0951, "step": 4642 }, { "epoch": 1.854632587859425, "grad_norm": 0.9957944115590945, "learning_rate": 3.821479041024069e-06, "loss": 0.0902, "step": 4644 }, { "epoch": 1.8554313099041533, "grad_norm": 1.0579821850330315, "learning_rate": 3.816963163460109e-06, "loss": 0.1, "step": 4646 }, { "epoch": 1.8562300319488818, "grad_norm": 1.080547600595392, "learning_rate": 3.8124483079283546e-06, "loss": 0.1039, "step": 4648 }, { "epoch": 1.8570287539936103, "grad_norm": 1.0913158139520798, "learning_rate": 3.8079344783292145e-06, "loss": 0.1021, "step": 4650 }, { "epoch": 1.8578274760383386, "grad_norm": 1.1039317269281381, "learning_rate": 3.803421678562213e-06, "loss": 0.096, "step": 4652 }, { "epoch": 1.858626198083067, "grad_norm": 1.0837414701686472, "learning_rate": 3.79890991252598e-06, "loss": 0.0931, "step": 4654 }, { "epoch": 1.8594249201277955, "grad_norm": 0.9817886186922399, "learning_rate": 3.7943991841182586e-06, "loss": 0.0918, "step": 4656 }, { "epoch": 1.860223642172524, "grad_norm": 1.2753358547680838, "learning_rate": 3.7898894972358934e-06, "loss": 0.0879, "step": 4658 }, { "epoch": 1.8610223642172525, "grad_norm": 1.0132549906162278, "learning_rate": 3.7853808557748263e-06, "loss": 0.0982, "step": 4660 }, { "epoch": 1.8618210862619808, "grad_norm": 1.0604493491043667, "learning_rate": 3.7808732636300987e-06, "loss": 0.0909, "step": 4662 }, { "epoch": 1.8626198083067091, "grad_norm": 1.0110270014624545, "learning_rate": 3.7763667246958447e-06, "loss": 0.0913, "step": 4664 }, { "epoch": 1.8634185303514377, "grad_norm": 1.1816528766134167, "learning_rate": 3.771861242865288e-06, "loss": 0.0987, "step": 4666 }, { "epoch": 1.8642172523961662, "grad_norm": 1.1276543836075537, "learning_rate": 3.767356822030742e-06, "loss": 0.1006, "step": 4668 }, { "epoch": 1.8650159744408947, "grad_norm": 1.0029943146012643, "learning_rate": 3.7628534660835996e-06, "loss": 0.0925, "step": 4670 }, { "epoch": 1.865814696485623, "grad_norm": 1.1636739447782507, "learning_rate": 3.758351178914336e-06, "loss": 0.0946, "step": 4672 }, { "epoch": 1.8666134185303513, "grad_norm": 1.0912114279896796, "learning_rate": 3.753849964412502e-06, "loss": 0.09, "step": 4674 }, { "epoch": 1.8674121405750799, "grad_norm": 1.2020359973073858, "learning_rate": 3.749349826466724e-06, "loss": 0.1037, "step": 4676 }, { "epoch": 1.8682108626198084, "grad_norm": 0.9661948745934541, "learning_rate": 3.744850768964692e-06, "loss": 0.0862, "step": 4678 }, { "epoch": 1.8690095846645367, "grad_norm": 1.0588016583262083, "learning_rate": 3.7403527957931716e-06, "loss": 0.0929, "step": 4680 }, { "epoch": 1.869808306709265, "grad_norm": 1.095002175972765, "learning_rate": 3.7358559108379867e-06, "loss": 0.0982, "step": 4682 }, { "epoch": 1.8706070287539935, "grad_norm": 1.0767223792716407, "learning_rate": 3.731360117984022e-06, "loss": 0.0911, "step": 4684 }, { "epoch": 1.871405750798722, "grad_norm": 1.0769547783245055, "learning_rate": 3.7268654211152156e-06, "loss": 0.0931, "step": 4686 }, { "epoch": 1.8722044728434506, "grad_norm": 1.1230982099429845, "learning_rate": 3.7223718241145646e-06, "loss": 0.0946, "step": 4688 }, { "epoch": 1.873003194888179, "grad_norm": 1.0232620319916697, "learning_rate": 3.71787933086411e-06, "loss": 0.0863, "step": 4690 }, { "epoch": 1.8738019169329072, "grad_norm": 1.1155033054935346, "learning_rate": 3.713387945244945e-06, "loss": 0.1052, "step": 4692 }, { "epoch": 1.8746006389776357, "grad_norm": 1.086463915995651, "learning_rate": 3.7088976711372006e-06, "loss": 0.0894, "step": 4694 }, { "epoch": 1.8753993610223643, "grad_norm": 1.1158786507370728, "learning_rate": 3.7044085124200517e-06, "loss": 0.0897, "step": 4696 }, { "epoch": 1.8761980830670928, "grad_norm": 1.1044666274555124, "learning_rate": 3.6999204729717057e-06, "loss": 0.0935, "step": 4698 }, { "epoch": 1.876996805111821, "grad_norm": 1.0364347122571194, "learning_rate": 3.695433556669406e-06, "loss": 0.0881, "step": 4700 }, { "epoch": 1.8777955271565494, "grad_norm": 1.179238289944297, "learning_rate": 3.690947767389426e-06, "loss": 0.0903, "step": 4702 }, { "epoch": 1.878594249201278, "grad_norm": 1.2284443916952599, "learning_rate": 3.6864631090070656e-06, "loss": 0.1027, "step": 4704 }, { "epoch": 1.8793929712460065, "grad_norm": 1.1165613439165785, "learning_rate": 3.6819795853966435e-06, "loss": 0.0908, "step": 4706 }, { "epoch": 1.880191693290735, "grad_norm": 1.1221181152137276, "learning_rate": 3.6774972004315035e-06, "loss": 0.0994, "step": 4708 }, { "epoch": 1.8809904153354633, "grad_norm": 1.120777637697723, "learning_rate": 3.6730159579840007e-06, "loss": 0.1016, "step": 4710 }, { "epoch": 1.8817891373801916, "grad_norm": 1.0499328599769773, "learning_rate": 3.668535861925509e-06, "loss": 0.1007, "step": 4712 }, { "epoch": 1.8825878594249201, "grad_norm": 1.0952348015619684, "learning_rate": 3.6640569161264055e-06, "loss": 0.1008, "step": 4714 }, { "epoch": 1.8833865814696487, "grad_norm": 1.0556886162538723, "learning_rate": 3.6595791244560795e-06, "loss": 0.0909, "step": 4716 }, { "epoch": 1.884185303514377, "grad_norm": 1.0779044498229438, "learning_rate": 3.655102490782918e-06, "loss": 0.1072, "step": 4718 }, { "epoch": 1.8849840255591053, "grad_norm": 1.0237827262029766, "learning_rate": 3.650627018974312e-06, "loss": 0.0964, "step": 4720 }, { "epoch": 1.8857827476038338, "grad_norm": 0.9712789192282018, "learning_rate": 3.6461527128966457e-06, "loss": 0.0942, "step": 4722 }, { "epoch": 1.8865814696485623, "grad_norm": 1.1258507713014763, "learning_rate": 3.6416795764152967e-06, "loss": 0.104, "step": 4724 }, { "epoch": 1.8873801916932909, "grad_norm": 0.9821860855241218, "learning_rate": 3.6372076133946353e-06, "loss": 0.087, "step": 4726 }, { "epoch": 1.8881789137380192, "grad_norm": 1.0700005208247971, "learning_rate": 3.632736827698015e-06, "loss": 0.0883, "step": 4728 }, { "epoch": 1.8889776357827475, "grad_norm": 0.9731935760176809, "learning_rate": 3.6282672231877714e-06, "loss": 0.0883, "step": 4730 }, { "epoch": 1.889776357827476, "grad_norm": 1.045015911757986, "learning_rate": 3.623798803725223e-06, "loss": 0.097, "step": 4732 }, { "epoch": 1.8905750798722045, "grad_norm": 1.218717767375124, "learning_rate": 3.619331573170661e-06, "loss": 0.0921, "step": 4734 }, { "epoch": 1.891373801916933, "grad_norm": 1.058626369915082, "learning_rate": 3.6148655353833518e-06, "loss": 0.0856, "step": 4736 }, { "epoch": 1.8921725239616614, "grad_norm": 1.0773766259028001, "learning_rate": 3.6104006942215296e-06, "loss": 0.0909, "step": 4738 }, { "epoch": 1.8929712460063897, "grad_norm": 1.0619367090404177, "learning_rate": 3.605937053542398e-06, "loss": 0.0854, "step": 4740 }, { "epoch": 1.8937699680511182, "grad_norm": 1.0878885631245991, "learning_rate": 3.6014746172021197e-06, "loss": 0.096, "step": 4742 }, { "epoch": 1.8945686900958467, "grad_norm": 1.0235847877959412, "learning_rate": 3.5970133890558184e-06, "loss": 0.0826, "step": 4744 }, { "epoch": 1.895367412140575, "grad_norm": 1.1515886076002215, "learning_rate": 3.5925533729575745e-06, "loss": 0.0891, "step": 4746 }, { "epoch": 1.8961661341853036, "grad_norm": 1.0787838461269919, "learning_rate": 3.588094572760423e-06, "loss": 0.0947, "step": 4748 }, { "epoch": 1.8969648562300319, "grad_norm": 1.1230844413759828, "learning_rate": 3.583636992316345e-06, "loss": 0.0902, "step": 4750 }, { "epoch": 1.8977635782747604, "grad_norm": 1.0936152470611447, "learning_rate": 3.5791806354762702e-06, "loss": 0.0998, "step": 4752 }, { "epoch": 1.898562300319489, "grad_norm": 1.2224698612193565, "learning_rate": 3.5747255060900687e-06, "loss": 0.1032, "step": 4754 }, { "epoch": 1.8993610223642172, "grad_norm": 1.0690769327967091, "learning_rate": 3.5702716080065546e-06, "loss": 0.0856, "step": 4756 }, { "epoch": 1.9001597444089455, "grad_norm": 1.0519872936846355, "learning_rate": 3.5658189450734727e-06, "loss": 0.088, "step": 4758 }, { "epoch": 1.900958466453674, "grad_norm": 1.1511736996930644, "learning_rate": 3.5613675211375066e-06, "loss": 0.093, "step": 4760 }, { "epoch": 1.9017571884984026, "grad_norm": 1.1254396775195248, "learning_rate": 3.5569173400442634e-06, "loss": 0.086, "step": 4762 }, { "epoch": 1.9025559105431311, "grad_norm": 1.2048144094026239, "learning_rate": 3.5524684056382824e-06, "loss": 0.096, "step": 4764 }, { "epoch": 1.9033546325878594, "grad_norm": 1.0379587437323166, "learning_rate": 3.5480207217630224e-06, "loss": 0.0929, "step": 4766 }, { "epoch": 1.9041533546325877, "grad_norm": 1.0861096082948125, "learning_rate": 3.5435742922608618e-06, "loss": 0.0843, "step": 4768 }, { "epoch": 1.9049520766773163, "grad_norm": 1.094770765692993, "learning_rate": 3.539129120973095e-06, "loss": 0.0913, "step": 4770 }, { "epoch": 1.9057507987220448, "grad_norm": 1.0808303484556492, "learning_rate": 3.534685211739935e-06, "loss": 0.084, "step": 4772 }, { "epoch": 1.9065495207667733, "grad_norm": 1.0531941918931411, "learning_rate": 3.5302425684004957e-06, "loss": 0.0837, "step": 4774 }, { "epoch": 1.9073482428115016, "grad_norm": 1.0972839581319511, "learning_rate": 3.525801194792805e-06, "loss": 0.0969, "step": 4776 }, { "epoch": 1.90814696485623, "grad_norm": 1.1088742467231305, "learning_rate": 3.521361094753788e-06, "loss": 0.09, "step": 4778 }, { "epoch": 1.9089456869009584, "grad_norm": 1.062941595145544, "learning_rate": 3.516922272119274e-06, "loss": 0.0826, "step": 4780 }, { "epoch": 1.909744408945687, "grad_norm": 1.01692209679147, "learning_rate": 3.5124847307239863e-06, "loss": 0.0857, "step": 4782 }, { "epoch": 1.9105431309904153, "grad_norm": 1.2055133548353663, "learning_rate": 3.508048474401541e-06, "loss": 0.096, "step": 4784 }, { "epoch": 1.9113418530351438, "grad_norm": 1.001313631088606, "learning_rate": 3.503613506984447e-06, "loss": 0.0884, "step": 4786 }, { "epoch": 1.9121405750798721, "grad_norm": 1.1432556607777271, "learning_rate": 3.499179832304096e-06, "loss": 0.0909, "step": 4788 }, { "epoch": 1.9129392971246006, "grad_norm": 1.1768617881519154, "learning_rate": 3.4947474541907655e-06, "loss": 0.0895, "step": 4790 }, { "epoch": 1.9137380191693292, "grad_norm": 1.054267796873096, "learning_rate": 3.4903163764736104e-06, "loss": 0.0944, "step": 4792 }, { "epoch": 1.9145367412140575, "grad_norm": 1.1032448450072194, "learning_rate": 3.4858866029806658e-06, "loss": 0.0956, "step": 4794 }, { "epoch": 1.9153354632587858, "grad_norm": 0.9183906781681136, "learning_rate": 3.4814581375388384e-06, "loss": 0.0838, "step": 4796 }, { "epoch": 1.9161341853035143, "grad_norm": 1.1483868020594459, "learning_rate": 3.4770309839739026e-06, "loss": 0.0913, "step": 4798 }, { "epoch": 1.9169329073482428, "grad_norm": 1.0510791282744207, "learning_rate": 3.4726051461105016e-06, "loss": 0.0895, "step": 4800 }, { "epoch": 1.9177316293929714, "grad_norm": 1.0681148740204216, "learning_rate": 3.468180627772144e-06, "loss": 0.0901, "step": 4802 }, { "epoch": 1.9185303514376997, "grad_norm": 1.056725620741662, "learning_rate": 3.4637574327811934e-06, "loss": 0.087, "step": 4804 }, { "epoch": 1.919329073482428, "grad_norm": 1.1165517293384117, "learning_rate": 3.459335564958875e-06, "loss": 0.0949, "step": 4806 }, { "epoch": 1.9201277955271565, "grad_norm": 1.1714069822717896, "learning_rate": 3.4549150281252635e-06, "loss": 0.1003, "step": 4808 }, { "epoch": 1.920926517571885, "grad_norm": 1.0857128594280534, "learning_rate": 3.4504958260992877e-06, "loss": 0.0917, "step": 4810 }, { "epoch": 1.9217252396166136, "grad_norm": 0.9514303291715913, "learning_rate": 3.4460779626987186e-06, "loss": 0.081, "step": 4812 }, { "epoch": 1.9225239616613419, "grad_norm": 1.1178021513245053, "learning_rate": 3.441661441740176e-06, "loss": 0.0945, "step": 4814 }, { "epoch": 1.9233226837060702, "grad_norm": 1.0436166022295423, "learning_rate": 3.437246267039115e-06, "loss": 0.0939, "step": 4816 }, { "epoch": 1.9241214057507987, "grad_norm": 0.9558565581069985, "learning_rate": 3.4328324424098315e-06, "loss": 0.0866, "step": 4818 }, { "epoch": 1.9249201277955272, "grad_norm": 1.0690618672851848, "learning_rate": 3.4284199716654526e-06, "loss": 0.0971, "step": 4820 }, { "epoch": 1.9257188498402555, "grad_norm": 1.0330544019396488, "learning_rate": 3.424008858617939e-06, "loss": 0.0877, "step": 4822 }, { "epoch": 1.926517571884984, "grad_norm": 1.090972476610282, "learning_rate": 3.419599107078073e-06, "loss": 0.0933, "step": 4824 }, { "epoch": 1.9273162939297124, "grad_norm": 1.1389205515436118, "learning_rate": 3.4151907208554657e-06, "loss": 0.0914, "step": 4826 }, { "epoch": 1.928115015974441, "grad_norm": 1.0521512673067093, "learning_rate": 3.4107837037585463e-06, "loss": 0.0956, "step": 4828 }, { "epoch": 1.9289137380191694, "grad_norm": 0.9943276734942584, "learning_rate": 3.4063780595945627e-06, "loss": 0.0971, "step": 4830 }, { "epoch": 1.9297124600638977, "grad_norm": 1.1979073104114557, "learning_rate": 3.401973792169574e-06, "loss": 0.0929, "step": 4832 }, { "epoch": 1.930511182108626, "grad_norm": 0.9579355850140132, "learning_rate": 3.397570905288453e-06, "loss": 0.0895, "step": 4834 }, { "epoch": 1.9313099041533546, "grad_norm": 1.0976423710464434, "learning_rate": 3.393169402754878e-06, "loss": 0.0844, "step": 4836 }, { "epoch": 1.932108626198083, "grad_norm": 1.1417293656435412, "learning_rate": 3.388769288371333e-06, "loss": 0.0954, "step": 4838 }, { "epoch": 1.9329073482428116, "grad_norm": 0.9657207657700554, "learning_rate": 3.384370565939098e-06, "loss": 0.08, "step": 4840 }, { "epoch": 1.93370607028754, "grad_norm": 1.122410966064945, "learning_rate": 3.3799732392582598e-06, "loss": 0.1073, "step": 4842 }, { "epoch": 1.9345047923322682, "grad_norm": 0.9499390486261202, "learning_rate": 3.375577312127689e-06, "loss": 0.081, "step": 4844 }, { "epoch": 1.9353035143769968, "grad_norm": 1.089004347076352, "learning_rate": 3.3711827883450552e-06, "loss": 0.0916, "step": 4846 }, { "epoch": 1.9361022364217253, "grad_norm": 1.121477411735401, "learning_rate": 3.3667896717068105e-06, "loss": 0.0944, "step": 4848 }, { "epoch": 1.9369009584664538, "grad_norm": 1.137024861471035, "learning_rate": 3.3623979660081944e-06, "loss": 0.0898, "step": 4850 }, { "epoch": 1.9376996805111821, "grad_norm": 1.036107942325039, "learning_rate": 3.3580076750432244e-06, "loss": 0.1024, "step": 4852 }, { "epoch": 1.9384984025559104, "grad_norm": 1.0079391443696057, "learning_rate": 3.3536188026047e-06, "loss": 0.0806, "step": 4854 }, { "epoch": 1.939297124600639, "grad_norm": 1.0869459285245229, "learning_rate": 3.34923135248419e-06, "loss": 0.0918, "step": 4856 }, { "epoch": 1.9400958466453675, "grad_norm": 1.1976937501317675, "learning_rate": 3.3448453284720407e-06, "loss": 0.0992, "step": 4858 }, { "epoch": 1.9408945686900958, "grad_norm": 1.1088825356732732, "learning_rate": 3.340460734357359e-06, "loss": 0.0986, "step": 4860 }, { "epoch": 1.9416932907348243, "grad_norm": 1.03132569544229, "learning_rate": 3.336077573928023e-06, "loss": 0.0864, "step": 4862 }, { "epoch": 1.9424920127795526, "grad_norm": 1.029156495025623, "learning_rate": 3.33169585097067e-06, "loss": 0.0865, "step": 4864 }, { "epoch": 1.9432907348242812, "grad_norm": 1.1169233865438082, "learning_rate": 3.3273155692706956e-06, "loss": 0.087, "step": 4866 }, { "epoch": 1.9440894568690097, "grad_norm": 1.0237220064497436, "learning_rate": 3.3229367326122475e-06, "loss": 0.088, "step": 4868 }, { "epoch": 1.944888178913738, "grad_norm": 1.0716607652719201, "learning_rate": 3.318559344778231e-06, "loss": 0.094, "step": 4870 }, { "epoch": 1.9456869009584663, "grad_norm": 0.9965570911044683, "learning_rate": 3.314183409550293e-06, "loss": 0.0913, "step": 4872 }, { "epoch": 1.9464856230031948, "grad_norm": 1.0605235496788143, "learning_rate": 3.3098089307088307e-06, "loss": 0.088, "step": 4874 }, { "epoch": 1.9472843450479234, "grad_norm": 1.0509076016295873, "learning_rate": 3.3054359120329788e-06, "loss": 0.0967, "step": 4876 }, { "epoch": 1.9480830670926519, "grad_norm": 1.0048257649363173, "learning_rate": 3.301064357300615e-06, "loss": 0.0924, "step": 4878 }, { "epoch": 1.9488817891373802, "grad_norm": 1.1009166155154577, "learning_rate": 3.2966942702883494e-06, "loss": 0.0979, "step": 4880 }, { "epoch": 1.9496805111821085, "grad_norm": 1.123335138251925, "learning_rate": 3.2923256547715245e-06, "loss": 0.0952, "step": 4882 }, { "epoch": 1.950479233226837, "grad_norm": 1.0429249357254708, "learning_rate": 3.287958514524212e-06, "loss": 0.0868, "step": 4884 }, { "epoch": 1.9512779552715656, "grad_norm": 1.0420461671724415, "learning_rate": 3.2835928533192086e-06, "loss": 0.091, "step": 4886 }, { "epoch": 1.952076677316294, "grad_norm": 1.066480483177895, "learning_rate": 3.279228674928035e-06, "loss": 0.0911, "step": 4888 }, { "epoch": 1.9528753993610224, "grad_norm": 1.034614202531424, "learning_rate": 3.2748659831209293e-06, "loss": 0.0925, "step": 4890 }, { "epoch": 1.9536741214057507, "grad_norm": 1.010806358143463, "learning_rate": 3.270504781666845e-06, "loss": 0.0948, "step": 4892 }, { "epoch": 1.9544728434504792, "grad_norm": 0.9676259211028123, "learning_rate": 3.2661450743334495e-06, "loss": 0.0838, "step": 4894 }, { "epoch": 1.9552715654952078, "grad_norm": 1.9272872858361931, "learning_rate": 3.261786864887117e-06, "loss": 0.0941, "step": 4896 }, { "epoch": 1.956070287539936, "grad_norm": 1.1661864224993708, "learning_rate": 3.2574301570929313e-06, "loss": 0.0946, "step": 4898 }, { "epoch": 1.9568690095846646, "grad_norm": 1.0658970850145277, "learning_rate": 3.2530749547146745e-06, "loss": 0.0846, "step": 4900 }, { "epoch": 1.957667731629393, "grad_norm": 1.1441635938072476, "learning_rate": 3.2487212615148316e-06, "loss": 0.0957, "step": 4902 }, { "epoch": 1.9584664536741214, "grad_norm": 1.0929232430584095, "learning_rate": 3.244369081254585e-06, "loss": 0.0881, "step": 4904 }, { "epoch": 1.95926517571885, "grad_norm": 0.9707906667424272, "learning_rate": 3.240018417693803e-06, "loss": 0.0854, "step": 4906 }, { "epoch": 1.9600638977635783, "grad_norm": 1.166565929217826, "learning_rate": 3.235669274591051e-06, "loss": 0.0952, "step": 4908 }, { "epoch": 1.9608626198083066, "grad_norm": 0.9958821823082478, "learning_rate": 3.231321655703581e-06, "loss": 0.0896, "step": 4910 }, { "epoch": 1.961661341853035, "grad_norm": 1.16345333204102, "learning_rate": 3.226975564787322e-06, "loss": 0.0967, "step": 4912 }, { "epoch": 1.9624600638977636, "grad_norm": 1.0194684910483922, "learning_rate": 3.222631005596888e-06, "loss": 0.0792, "step": 4914 }, { "epoch": 1.9632587859424921, "grad_norm": 1.1421002598232535, "learning_rate": 3.218287981885567e-06, "loss": 0.0998, "step": 4916 }, { "epoch": 1.9640575079872205, "grad_norm": 1.0778986286246897, "learning_rate": 3.2139464974053225e-06, "loss": 0.0979, "step": 4918 }, { "epoch": 1.9648562300319488, "grad_norm": 1.071349946349977, "learning_rate": 3.209606555906788e-06, "loss": 0.0792, "step": 4920 }, { "epoch": 1.9656549520766773, "grad_norm": 0.9608643299126582, "learning_rate": 3.2052681611392616e-06, "loss": 0.0946, "step": 4922 }, { "epoch": 1.9664536741214058, "grad_norm": 1.0944937155116632, "learning_rate": 3.20093131685071e-06, "loss": 0.094, "step": 4924 }, { "epoch": 1.9672523961661343, "grad_norm": 1.045526176848772, "learning_rate": 3.1965960267877544e-06, "loss": 0.0986, "step": 4926 }, { "epoch": 1.9680511182108626, "grad_norm": 1.2126133137340172, "learning_rate": 3.192262294695679e-06, "loss": 0.1038, "step": 4928 }, { "epoch": 1.968849840255591, "grad_norm": 1.1192306783949446, "learning_rate": 3.187930124318417e-06, "loss": 0.0928, "step": 4930 }, { "epoch": 1.9696485623003195, "grad_norm": 1.1586123177119687, "learning_rate": 3.1835995193985548e-06, "loss": 0.0973, "step": 4932 }, { "epoch": 1.970447284345048, "grad_norm": 1.1103102164856757, "learning_rate": 3.1792704836773303e-06, "loss": 0.0953, "step": 4934 }, { "epoch": 1.9712460063897763, "grad_norm": 1.165044371843411, "learning_rate": 3.174943020894618e-06, "loss": 0.0933, "step": 4936 }, { "epoch": 1.9720447284345048, "grad_norm": 0.9945637263764828, "learning_rate": 3.170617134788939e-06, "loss": 0.0801, "step": 4938 }, { "epoch": 1.9728434504792332, "grad_norm": 1.0071008319354997, "learning_rate": 3.1662928290974514e-06, "loss": 0.0916, "step": 4940 }, { "epoch": 1.9736421725239617, "grad_norm": 1.1375941744992564, "learning_rate": 3.161970107555945e-06, "loss": 0.1005, "step": 4942 }, { "epoch": 1.9744408945686902, "grad_norm": 1.0807791951129158, "learning_rate": 3.1576489738988457e-06, "loss": 0.0919, "step": 4944 }, { "epoch": 1.9752396166134185, "grad_norm": 1.0609866630309928, "learning_rate": 3.153329431859204e-06, "loss": 0.0915, "step": 4946 }, { "epoch": 1.9760383386581468, "grad_norm": 1.1417443819482096, "learning_rate": 3.1490114851686984e-06, "loss": 0.0931, "step": 4948 }, { "epoch": 1.9768370607028753, "grad_norm": 0.9100647284833052, "learning_rate": 3.144695137557624e-06, "loss": 0.0772, "step": 4950 }, { "epoch": 1.9776357827476039, "grad_norm": 0.9703032312651547, "learning_rate": 3.140380392754901e-06, "loss": 0.098, "step": 4952 }, { "epoch": 1.9784345047923324, "grad_norm": 1.0592857088516412, "learning_rate": 3.1360672544880586e-06, "loss": 0.0962, "step": 4954 }, { "epoch": 1.9792332268370607, "grad_norm": 1.0580116375862416, "learning_rate": 3.1317557264832454e-06, "loss": 0.0841, "step": 4956 }, { "epoch": 1.980031948881789, "grad_norm": 1.0265861095673052, "learning_rate": 3.1274458124652117e-06, "loss": 0.0901, "step": 4958 }, { "epoch": 1.9808306709265175, "grad_norm": 1.0372575193954332, "learning_rate": 3.12313751615732e-06, "loss": 0.0736, "step": 4960 }, { "epoch": 1.981629392971246, "grad_norm": 0.9602359888946597, "learning_rate": 3.1188308412815276e-06, "loss": 0.087, "step": 4962 }, { "epoch": 1.9824281150159746, "grad_norm": 1.1199736388647585, "learning_rate": 3.114525791558398e-06, "loss": 0.0867, "step": 4964 }, { "epoch": 1.983226837060703, "grad_norm": 1.0792261618908379, "learning_rate": 3.1102223707070865e-06, "loss": 0.0887, "step": 4966 }, { "epoch": 1.9840255591054312, "grad_norm": 1.1181889010486792, "learning_rate": 3.1059205824453446e-06, "loss": 0.0929, "step": 4968 }, { "epoch": 1.9848242811501597, "grad_norm": 1.0142291902192897, "learning_rate": 3.101620430489509e-06, "loss": 0.0876, "step": 4970 }, { "epoch": 1.9856230031948883, "grad_norm": 1.2214112942901962, "learning_rate": 3.0973219185545077e-06, "loss": 0.1051, "step": 4972 }, { "epoch": 1.9864217252396166, "grad_norm": 1.1881886135246487, "learning_rate": 3.093025050353847e-06, "loss": 0.0861, "step": 4974 }, { "epoch": 1.9872204472843449, "grad_norm": 1.0284769527949214, "learning_rate": 3.0887298295996183e-06, "loss": 0.0848, "step": 4976 }, { "epoch": 1.9880191693290734, "grad_norm": 1.0067298405880323, "learning_rate": 3.0844362600024813e-06, "loss": 0.0859, "step": 4978 }, { "epoch": 1.988817891373802, "grad_norm": 1.0990861865733983, "learning_rate": 3.0801443452716835e-06, "loss": 0.0938, "step": 4980 }, { "epoch": 1.9896166134185305, "grad_norm": 0.914893405123526, "learning_rate": 3.0758540891150286e-06, "loss": 0.0785, "step": 4982 }, { "epoch": 1.9904153354632588, "grad_norm": 1.04304535262742, "learning_rate": 3.0715654952388957e-06, "loss": 0.0926, "step": 4984 }, { "epoch": 1.991214057507987, "grad_norm": 1.1088676583366113, "learning_rate": 3.067278567348223e-06, "loss": 0.0885, "step": 4986 }, { "epoch": 1.9920127795527156, "grad_norm": 1.0903427682959805, "learning_rate": 3.062993309146514e-06, "loss": 0.083, "step": 4988 }, { "epoch": 1.9928115015974441, "grad_norm": 1.1545810549870772, "learning_rate": 3.0587097243358254e-06, "loss": 0.0835, "step": 4990 }, { "epoch": 1.9936102236421727, "grad_norm": 1.080925053565247, "learning_rate": 3.054427816616773e-06, "loss": 0.0955, "step": 4992 }, { "epoch": 1.994408945686901, "grad_norm": 1.0715872751652953, "learning_rate": 3.0501475896885175e-06, "loss": 0.0842, "step": 4994 }, { "epoch": 1.9952076677316293, "grad_norm": 1.1378173558732894, "learning_rate": 3.045869047248774e-06, "loss": 0.0974, "step": 4996 }, { "epoch": 1.9960063897763578, "grad_norm": 1.110967676470196, "learning_rate": 3.041592192993798e-06, "loss": 0.0928, "step": 4998 }, { "epoch": 1.9968051118210863, "grad_norm": 0.9911386613472772, "learning_rate": 3.0373170306183885e-06, "loss": 0.0737, "step": 5000 }, { "epoch": 1.9968051118210863, "eval_loss": 0.1455243080854416, "eval_runtime": 417.8708, "eval_samples_per_second": 42.614, "eval_steps_per_second": 5.327, "step": 5000 }, { "epoch": 1.9976038338658149, "grad_norm": 1.124610364572997, "learning_rate": 3.0330435638158805e-06, "loss": 0.0935, "step": 5002 }, { "epoch": 1.9984025559105432, "grad_norm": 1.053117611083023, "learning_rate": 3.028771796278151e-06, "loss": 0.1017, "step": 5004 }, { "epoch": 1.9992012779552715, "grad_norm": 1.151547544509955, "learning_rate": 3.0245017316956e-06, "loss": 0.0902, "step": 5006 }, { "epoch": 2.0, "grad_norm": 1.1612757379061158, "learning_rate": 3.020233373757162e-06, "loss": 0.0987, "step": 5008 }, { "epoch": 2.0007987220447285, "grad_norm": 0.6298174764155958, "learning_rate": 3.0159667261502944e-06, "loss": 0.0395, "step": 5010 }, { "epoch": 2.001597444089457, "grad_norm": 0.625623431879438, "learning_rate": 3.0117017925609802e-06, "loss": 0.0417, "step": 5012 }, { "epoch": 2.002396166134185, "grad_norm": 0.6897871271662769, "learning_rate": 3.007438576673717e-06, "loss": 0.0421, "step": 5014 }, { "epoch": 2.0031948881789137, "grad_norm": 0.6744101045802077, "learning_rate": 3.0031770821715233e-06, "loss": 0.0378, "step": 5016 }, { "epoch": 2.003993610223642, "grad_norm": 0.7756444837427113, "learning_rate": 2.9989173127359267e-06, "loss": 0.04, "step": 5018 }, { "epoch": 2.0047923322683707, "grad_norm": 0.7859449397270831, "learning_rate": 2.9946592720469662e-06, "loss": 0.037, "step": 5020 }, { "epoch": 2.0055910543130993, "grad_norm": 0.6704787926555024, "learning_rate": 2.9904029637831887e-06, "loss": 0.0347, "step": 5022 }, { "epoch": 2.0063897763578273, "grad_norm": 0.751662927080756, "learning_rate": 2.9861483916216404e-06, "loss": 0.0389, "step": 5024 }, { "epoch": 2.007188498402556, "grad_norm": 0.9221469194496961, "learning_rate": 2.981895559237873e-06, "loss": 0.0447, "step": 5026 }, { "epoch": 2.0079872204472844, "grad_norm": 0.80221078116781, "learning_rate": 2.9776444703059316e-06, "loss": 0.0345, "step": 5028 }, { "epoch": 2.008785942492013, "grad_norm": 0.8059814098019393, "learning_rate": 2.9733951284983555e-06, "loss": 0.0373, "step": 5030 }, { "epoch": 2.009584664536741, "grad_norm": 0.9358045993876437, "learning_rate": 2.969147537486175e-06, "loss": 0.0408, "step": 5032 }, { "epoch": 2.0103833865814695, "grad_norm": 0.8885370742724994, "learning_rate": 2.9649017009389077e-06, "loss": 0.0483, "step": 5034 }, { "epoch": 2.011182108626198, "grad_norm": 1.057708534412327, "learning_rate": 2.9606576225245566e-06, "loss": 0.0379, "step": 5036 }, { "epoch": 2.0119808306709266, "grad_norm": 0.8731366761911997, "learning_rate": 2.9564153059096047e-06, "loss": 0.0375, "step": 5038 }, { "epoch": 2.012779552715655, "grad_norm": 0.8694849953429314, "learning_rate": 2.952174754759012e-06, "loss": 0.0359, "step": 5040 }, { "epoch": 2.013578274760383, "grad_norm": 0.8842840721862356, "learning_rate": 2.947935972736217e-06, "loss": 0.0351, "step": 5042 }, { "epoch": 2.0143769968051117, "grad_norm": 0.8362117175566706, "learning_rate": 2.9436989635031253e-06, "loss": 0.0353, "step": 5044 }, { "epoch": 2.0151757188498403, "grad_norm": 0.8610101010994339, "learning_rate": 2.9394637307201156e-06, "loss": 0.0365, "step": 5046 }, { "epoch": 2.015974440894569, "grad_norm": 0.8646570046386132, "learning_rate": 2.935230278046025e-06, "loss": 0.0376, "step": 5048 }, { "epoch": 2.0167731629392973, "grad_norm": 0.8838043984613749, "learning_rate": 2.9309986091381616e-06, "loss": 0.0331, "step": 5050 }, { "epoch": 2.0175718849840254, "grad_norm": 0.8681957347092829, "learning_rate": 2.9267687276522876e-06, "loss": 0.0355, "step": 5052 }, { "epoch": 2.018370607028754, "grad_norm": 0.9197550042905789, "learning_rate": 2.922540637242619e-06, "loss": 0.0366, "step": 5054 }, { "epoch": 2.0191693290734825, "grad_norm": 0.9123850396028426, "learning_rate": 2.9183143415618297e-06, "loss": 0.035, "step": 5056 }, { "epoch": 2.019968051118211, "grad_norm": 0.8949092568287165, "learning_rate": 2.9140898442610375e-06, "loss": 0.0383, "step": 5058 }, { "epoch": 2.0207667731629395, "grad_norm": 1.0371718273633521, "learning_rate": 2.909867148989812e-06, "loss": 0.0365, "step": 5060 }, { "epoch": 2.0215654952076676, "grad_norm": 0.7653356631917099, "learning_rate": 2.905646259396162e-06, "loss": 0.0358, "step": 5062 }, { "epoch": 2.022364217252396, "grad_norm": 0.7314418159084973, "learning_rate": 2.9014271791265403e-06, "loss": 0.031, "step": 5064 }, { "epoch": 2.0231629392971247, "grad_norm": 0.8465101892289698, "learning_rate": 2.8972099118258305e-06, "loss": 0.0313, "step": 5066 }, { "epoch": 2.023961661341853, "grad_norm": 0.8580294083980158, "learning_rate": 2.8929944611373555e-06, "loss": 0.0349, "step": 5068 }, { "epoch": 2.0247603833865813, "grad_norm": 0.820776207639085, "learning_rate": 2.888780830702867e-06, "loss": 0.0337, "step": 5070 }, { "epoch": 2.02555910543131, "grad_norm": 0.9528932671606859, "learning_rate": 2.8845690241625437e-06, "loss": 0.0396, "step": 5072 }, { "epoch": 2.0263578274760383, "grad_norm": 1.0061064855225768, "learning_rate": 2.88035904515499e-06, "loss": 0.0397, "step": 5074 }, { "epoch": 2.027156549520767, "grad_norm": 0.8452096897085726, "learning_rate": 2.8761508973172293e-06, "loss": 0.0346, "step": 5076 }, { "epoch": 2.0279552715654954, "grad_norm": 0.7634443097067318, "learning_rate": 2.871944584284705e-06, "loss": 0.0322, "step": 5078 }, { "epoch": 2.0287539936102235, "grad_norm": 1.0425744751303125, "learning_rate": 2.867740109691277e-06, "loss": 0.0355, "step": 5080 }, { "epoch": 2.029552715654952, "grad_norm": 1.0565258664896207, "learning_rate": 2.86353747716921e-06, "loss": 0.0378, "step": 5082 }, { "epoch": 2.0303514376996805, "grad_norm": 1.0498591946749731, "learning_rate": 2.859336690349185e-06, "loss": 0.0407, "step": 5084 }, { "epoch": 2.031150159744409, "grad_norm": 0.7704667558370644, "learning_rate": 2.8551377528602836e-06, "loss": 0.0325, "step": 5086 }, { "epoch": 2.0319488817891376, "grad_norm": 0.9433544175624843, "learning_rate": 2.850940668329996e-06, "loss": 0.034, "step": 5088 }, { "epoch": 2.0327476038338657, "grad_norm": 0.8469629469335119, "learning_rate": 2.8467454403842005e-06, "loss": 0.0341, "step": 5090 }, { "epoch": 2.033546325878594, "grad_norm": 0.7628386290158324, "learning_rate": 2.842552072647182e-06, "loss": 0.0306, "step": 5092 }, { "epoch": 2.0343450479233227, "grad_norm": 0.9647679056914126, "learning_rate": 2.838360568741613e-06, "loss": 0.0419, "step": 5094 }, { "epoch": 2.0351437699680512, "grad_norm": 0.8818509646802964, "learning_rate": 2.8341709322885624e-06, "loss": 0.0322, "step": 5096 }, { "epoch": 2.0359424920127798, "grad_norm": 0.9443959085201566, "learning_rate": 2.8299831669074744e-06, "loss": 0.04, "step": 5098 }, { "epoch": 2.036741214057508, "grad_norm": 1.0294482005419765, "learning_rate": 2.8257972762161865e-06, "loss": 0.0371, "step": 5100 }, { "epoch": 2.0375399361022364, "grad_norm": 1.0211095104000223, "learning_rate": 2.8216132638309124e-06, "loss": 0.0365, "step": 5102 }, { "epoch": 2.038338658146965, "grad_norm": 1.9713879821461935, "learning_rate": 2.817431133366246e-06, "loss": 0.0437, "step": 5104 }, { "epoch": 2.0391373801916934, "grad_norm": 0.9823140675823749, "learning_rate": 2.8132508884351504e-06, "loss": 0.0379, "step": 5106 }, { "epoch": 2.0399361022364215, "grad_norm": 0.8967790877119935, "learning_rate": 2.809072532648963e-06, "loss": 0.0335, "step": 5108 }, { "epoch": 2.04073482428115, "grad_norm": 0.9950222932994464, "learning_rate": 2.804896069617391e-06, "loss": 0.0414, "step": 5110 }, { "epoch": 2.0415335463258786, "grad_norm": 1.0503584997850632, "learning_rate": 2.800721502948506e-06, "loss": 0.0371, "step": 5112 }, { "epoch": 2.042332268370607, "grad_norm": 0.8863498646029845, "learning_rate": 2.7965488362487337e-06, "loss": 0.037, "step": 5114 }, { "epoch": 2.0431309904153356, "grad_norm": 0.9129952865118284, "learning_rate": 2.7923780731228665e-06, "loss": 0.0379, "step": 5116 }, { "epoch": 2.0439297124600637, "grad_norm": 0.9360258175259091, "learning_rate": 2.7882092171740544e-06, "loss": 0.0386, "step": 5118 }, { "epoch": 2.0447284345047922, "grad_norm": 0.9038391882536216, "learning_rate": 2.7840422720037943e-06, "loss": 0.0336, "step": 5120 }, { "epoch": 2.0455271565495208, "grad_norm": 0.861340429852411, "learning_rate": 2.77987724121193e-06, "loss": 0.0396, "step": 5122 }, { "epoch": 2.0463258785942493, "grad_norm": 0.9598176808433139, "learning_rate": 2.775714128396658e-06, "loss": 0.0353, "step": 5124 }, { "epoch": 2.047124600638978, "grad_norm": 0.8776533817537266, "learning_rate": 2.7715529371545138e-06, "loss": 0.0357, "step": 5126 }, { "epoch": 2.047923322683706, "grad_norm": 0.9993457119073764, "learning_rate": 2.767393671080376e-06, "loss": 0.0357, "step": 5128 }, { "epoch": 2.0487220447284344, "grad_norm": 0.9982845161626676, "learning_rate": 2.763236333767455e-06, "loss": 0.0329, "step": 5130 }, { "epoch": 2.049520766773163, "grad_norm": 0.9890197900955416, "learning_rate": 2.7590809288073e-06, "loss": 0.0334, "step": 5132 }, { "epoch": 2.0503194888178915, "grad_norm": 0.838107029970905, "learning_rate": 2.7549274597897878e-06, "loss": 0.0349, "step": 5134 }, { "epoch": 2.0511182108626196, "grad_norm": 0.8727714025662038, "learning_rate": 2.7507759303031257e-06, "loss": 0.0358, "step": 5136 }, { "epoch": 2.051916932907348, "grad_norm": 0.9757184296946524, "learning_rate": 2.7466263439338424e-06, "loss": 0.0388, "step": 5138 }, { "epoch": 2.0527156549520766, "grad_norm": 0.9770465103402337, "learning_rate": 2.7424787042667856e-06, "loss": 0.0423, "step": 5140 }, { "epoch": 2.053514376996805, "grad_norm": 0.8990138413398377, "learning_rate": 2.7383330148851293e-06, "loss": 0.0315, "step": 5142 }, { "epoch": 2.0543130990415337, "grad_norm": 0.8588787728847247, "learning_rate": 2.7341892793703594e-06, "loss": 0.0315, "step": 5144 }, { "epoch": 2.055111821086262, "grad_norm": 0.9632194247423946, "learning_rate": 2.7300475013022666e-06, "loss": 0.0335, "step": 5146 }, { "epoch": 2.0559105431309903, "grad_norm": 0.730091684485515, "learning_rate": 2.7259076842589595e-06, "loss": 0.0316, "step": 5148 }, { "epoch": 2.056709265175719, "grad_norm": 0.8722727704441476, "learning_rate": 2.721769831816849e-06, "loss": 0.0385, "step": 5150 }, { "epoch": 2.0575079872204474, "grad_norm": 0.9033151795461563, "learning_rate": 2.7176339475506515e-06, "loss": 0.0369, "step": 5152 }, { "epoch": 2.058306709265176, "grad_norm": 0.9092639705567416, "learning_rate": 2.7135000350333762e-06, "loss": 0.0371, "step": 5154 }, { "epoch": 2.059105431309904, "grad_norm": 1.0234924384548971, "learning_rate": 2.7093680978363367e-06, "loss": 0.0379, "step": 5156 }, { "epoch": 2.0599041533546325, "grad_norm": 0.9677538639597134, "learning_rate": 2.7052381395291355e-06, "loss": 0.0308, "step": 5158 }, { "epoch": 2.060702875399361, "grad_norm": 1.0727950843621823, "learning_rate": 2.7011101636796677e-06, "loss": 0.0454, "step": 5160 }, { "epoch": 2.0615015974440896, "grad_norm": 1.02920375202306, "learning_rate": 2.6969841738541165e-06, "loss": 0.0421, "step": 5162 }, { "epoch": 2.062300319488818, "grad_norm": 1.1885181849696531, "learning_rate": 2.6928601736169423e-06, "loss": 0.0444, "step": 5164 }, { "epoch": 2.063099041533546, "grad_norm": 1.0717318540862604, "learning_rate": 2.6887381665308977e-06, "loss": 0.0385, "step": 5166 }, { "epoch": 2.0638977635782747, "grad_norm": 0.7332635219694293, "learning_rate": 2.6846181561570085e-06, "loss": 0.0322, "step": 5168 }, { "epoch": 2.0646964856230032, "grad_norm": 0.854254385375307, "learning_rate": 2.68050014605457e-06, "loss": 0.037, "step": 5170 }, { "epoch": 2.0654952076677318, "grad_norm": 0.8081983560893325, "learning_rate": 2.6763841397811576e-06, "loss": 0.034, "step": 5172 }, { "epoch": 2.06629392971246, "grad_norm": 0.9741434482130531, "learning_rate": 2.6722701408926117e-06, "loss": 0.0356, "step": 5174 }, { "epoch": 2.0670926517571884, "grad_norm": 0.8836539172435507, "learning_rate": 2.668158152943039e-06, "loss": 0.0402, "step": 5176 }, { "epoch": 2.067891373801917, "grad_norm": 0.9171861393472643, "learning_rate": 2.664048179484812e-06, "loss": 0.0354, "step": 5178 }, { "epoch": 2.0686900958466454, "grad_norm": 0.9101919637114287, "learning_rate": 2.6599402240685546e-06, "loss": 0.0354, "step": 5180 }, { "epoch": 2.069488817891374, "grad_norm": 0.9747609472900693, "learning_rate": 2.6558342902431553e-06, "loss": 0.0365, "step": 5182 }, { "epoch": 2.070287539936102, "grad_norm": 0.8324045055842462, "learning_rate": 2.651730381555754e-06, "loss": 0.0386, "step": 5184 }, { "epoch": 2.0710862619808306, "grad_norm": 0.7931217955187458, "learning_rate": 2.64762850155174e-06, "loss": 0.0329, "step": 5186 }, { "epoch": 2.071884984025559, "grad_norm": 0.797923260607255, "learning_rate": 2.6435286537747512e-06, "loss": 0.0336, "step": 5188 }, { "epoch": 2.0726837060702876, "grad_norm": 0.958305531859498, "learning_rate": 2.6394308417666686e-06, "loss": 0.0376, "step": 5190 }, { "epoch": 2.073482428115016, "grad_norm": 0.9482044419754099, "learning_rate": 2.635335069067617e-06, "loss": 0.0393, "step": 5192 }, { "epoch": 2.0742811501597442, "grad_norm": 1.0868323089043634, "learning_rate": 2.6312413392159553e-06, "loss": 0.0379, "step": 5194 }, { "epoch": 2.0750798722044728, "grad_norm": 0.8207739576375747, "learning_rate": 2.6271496557482795e-06, "loss": 0.0343, "step": 5196 }, { "epoch": 2.0758785942492013, "grad_norm": 0.8539413020355823, "learning_rate": 2.6230600221994195e-06, "loss": 0.0397, "step": 5198 }, { "epoch": 2.07667731629393, "grad_norm": 0.8965919210232974, "learning_rate": 2.618972442102432e-06, "loss": 0.0381, "step": 5200 }, { "epoch": 2.0774760383386583, "grad_norm": 0.8973056412608038, "learning_rate": 2.614886918988604e-06, "loss": 0.0342, "step": 5202 }, { "epoch": 2.0782747603833864, "grad_norm": 0.9313693805488207, "learning_rate": 2.610803456387436e-06, "loss": 0.0358, "step": 5204 }, { "epoch": 2.079073482428115, "grad_norm": 0.7917376380998945, "learning_rate": 2.6067220578266574e-06, "loss": 0.0329, "step": 5206 }, { "epoch": 2.0798722044728435, "grad_norm": 0.961444379029632, "learning_rate": 2.602642726832212e-06, "loss": 0.0361, "step": 5208 }, { "epoch": 2.080670926517572, "grad_norm": 0.8429511197266044, "learning_rate": 2.5985654669282556e-06, "loss": 0.0334, "step": 5210 }, { "epoch": 2.0814696485623, "grad_norm": 0.9355752641552857, "learning_rate": 2.5944902816371573e-06, "loss": 0.0395, "step": 5212 }, { "epoch": 2.0822683706070286, "grad_norm": 0.8965794716087095, "learning_rate": 2.5904171744794927e-06, "loss": 0.031, "step": 5214 }, { "epoch": 2.083067092651757, "grad_norm": 0.906394944166487, "learning_rate": 2.5863461489740403e-06, "loss": 0.0362, "step": 5216 }, { "epoch": 2.0838658146964857, "grad_norm": 0.916184630737243, "learning_rate": 2.5822772086377863e-06, "loss": 0.035, "step": 5218 }, { "epoch": 2.084664536741214, "grad_norm": 0.9777881509465214, "learning_rate": 2.5782103569859057e-06, "loss": 0.0388, "step": 5220 }, { "epoch": 2.0854632587859423, "grad_norm": 1.0639934309744312, "learning_rate": 2.5741455975317776e-06, "loss": 0.0379, "step": 5222 }, { "epoch": 2.086261980830671, "grad_norm": 0.8855617385185786, "learning_rate": 2.57008293378697e-06, "loss": 0.0366, "step": 5224 }, { "epoch": 2.0870607028753994, "grad_norm": 0.9693916259214078, "learning_rate": 2.566022369261243e-06, "loss": 0.0384, "step": 5226 }, { "epoch": 2.087859424920128, "grad_norm": 0.9541503121738648, "learning_rate": 2.5619639074625374e-06, "loss": 0.0367, "step": 5228 }, { "epoch": 2.0886581469648564, "grad_norm": 1.0451416066763384, "learning_rate": 2.557907551896984e-06, "loss": 0.0352, "step": 5230 }, { "epoch": 2.0894568690095845, "grad_norm": 0.9406486155465448, "learning_rate": 2.553853306068888e-06, "loss": 0.0326, "step": 5232 }, { "epoch": 2.090255591054313, "grad_norm": 0.97097723247182, "learning_rate": 2.549801173480742e-06, "loss": 0.0348, "step": 5234 }, { "epoch": 2.0910543130990416, "grad_norm": 1.0076436948344818, "learning_rate": 2.5457511576332008e-06, "loss": 0.0423, "step": 5236 }, { "epoch": 2.09185303514377, "grad_norm": 1.1036965373373038, "learning_rate": 2.5417032620250962e-06, "loss": 0.0392, "step": 5238 }, { "epoch": 2.0926517571884986, "grad_norm": 0.9846203021426592, "learning_rate": 2.5376574901534303e-06, "loss": 0.0382, "step": 5240 }, { "epoch": 2.0934504792332267, "grad_norm": 1.346629151831746, "learning_rate": 2.5336138455133684e-06, "loss": 0.0379, "step": 5242 }, { "epoch": 2.094249201277955, "grad_norm": 0.822209398062977, "learning_rate": 2.5295723315982344e-06, "loss": 0.0308, "step": 5244 }, { "epoch": 2.0950479233226837, "grad_norm": 0.912394753377447, "learning_rate": 2.5255329518995185e-06, "loss": 0.0325, "step": 5246 }, { "epoch": 2.0958466453674123, "grad_norm": 0.9775498681194755, "learning_rate": 2.5214957099068613e-06, "loss": 0.0384, "step": 5248 }, { "epoch": 2.0966453674121404, "grad_norm": 0.810870250386815, "learning_rate": 2.517460609108063e-06, "loss": 0.0362, "step": 5250 }, { "epoch": 2.097444089456869, "grad_norm": 1.3581064330087438, "learning_rate": 2.5134276529890646e-06, "loss": 0.0388, "step": 5252 }, { "epoch": 2.0982428115015974, "grad_norm": 0.9668476880810652, "learning_rate": 2.509396845033962e-06, "loss": 0.036, "step": 5254 }, { "epoch": 2.099041533546326, "grad_norm": 0.9552646055277381, "learning_rate": 2.5053681887249916e-06, "loss": 0.0354, "step": 5256 }, { "epoch": 2.0998402555910545, "grad_norm": 0.962167773860268, "learning_rate": 2.501341687542538e-06, "loss": 0.0378, "step": 5258 }, { "epoch": 2.1006389776357826, "grad_norm": 0.8997107031650471, "learning_rate": 2.497317344965111e-06, "loss": 0.0378, "step": 5260 }, { "epoch": 2.101437699680511, "grad_norm": 0.9369128058048221, "learning_rate": 2.493295164469367e-06, "loss": 0.0352, "step": 5262 }, { "epoch": 2.1022364217252396, "grad_norm": 0.8376692502141213, "learning_rate": 2.4892751495300893e-06, "loss": 0.0319, "step": 5264 }, { "epoch": 2.103035143769968, "grad_norm": 0.9548206607669597, "learning_rate": 2.4852573036201937e-06, "loss": 0.0395, "step": 5266 }, { "epoch": 2.1038338658146967, "grad_norm": 0.9387516661598057, "learning_rate": 2.481241630210716e-06, "loss": 0.0349, "step": 5268 }, { "epoch": 2.1046325878594248, "grad_norm": 0.874167713863454, "learning_rate": 2.4772281327708213e-06, "loss": 0.0335, "step": 5270 }, { "epoch": 2.1054313099041533, "grad_norm": 1.0185097250661654, "learning_rate": 2.4732168147677927e-06, "loss": 0.0357, "step": 5272 }, { "epoch": 2.106230031948882, "grad_norm": 1.043882328934825, "learning_rate": 2.46920767966703e-06, "loss": 0.0347, "step": 5274 }, { "epoch": 2.1070287539936103, "grad_norm": 1.0232740029306155, "learning_rate": 2.4652007309320497e-06, "loss": 0.0412, "step": 5276 }, { "epoch": 2.107827476038339, "grad_norm": 0.884509400992361, "learning_rate": 2.461195972024472e-06, "loss": 0.0345, "step": 5278 }, { "epoch": 2.108626198083067, "grad_norm": 0.9621954527543679, "learning_rate": 2.4571934064040364e-06, "loss": 0.0392, "step": 5280 }, { "epoch": 2.1094249201277955, "grad_norm": 0.9325329293080564, "learning_rate": 2.453193037528582e-06, "loss": 0.0372, "step": 5282 }, { "epoch": 2.110223642172524, "grad_norm": 0.77944405137395, "learning_rate": 2.449194868854046e-06, "loss": 0.0326, "step": 5284 }, { "epoch": 2.1110223642172525, "grad_norm": 0.8761868440249553, "learning_rate": 2.4451989038344713e-06, "loss": 0.0341, "step": 5286 }, { "epoch": 2.1118210862619806, "grad_norm": 0.8178367213212951, "learning_rate": 2.4412051459219945e-06, "loss": 0.0306, "step": 5288 }, { "epoch": 2.112619808306709, "grad_norm": 0.8640047451958446, "learning_rate": 2.4372135985668473e-06, "loss": 0.0332, "step": 5290 }, { "epoch": 2.1134185303514377, "grad_norm": 1.0879914849221164, "learning_rate": 2.433224265217346e-06, "loss": 0.0374, "step": 5292 }, { "epoch": 2.114217252396166, "grad_norm": 0.9388437125454063, "learning_rate": 2.4292371493199e-06, "loss": 0.039, "step": 5294 }, { "epoch": 2.1150159744408947, "grad_norm": 0.9016588347052378, "learning_rate": 2.425252254319002e-06, "loss": 0.0298, "step": 5296 }, { "epoch": 2.115814696485623, "grad_norm": 0.9448880094558859, "learning_rate": 2.4212695836572255e-06, "loss": 0.0348, "step": 5298 }, { "epoch": 2.1166134185303513, "grad_norm": 1.058137286496039, "learning_rate": 2.4172891407752225e-06, "loss": 0.0407, "step": 5300 }, { "epoch": 2.11741214057508, "grad_norm": 1.0908676673345152, "learning_rate": 2.4133109291117156e-06, "loss": 0.0337, "step": 5302 }, { "epoch": 2.1182108626198084, "grad_norm": 1.0100591708294226, "learning_rate": 2.4093349521035105e-06, "loss": 0.0357, "step": 5304 }, { "epoch": 2.119009584664537, "grad_norm": 0.9113240938208252, "learning_rate": 2.405361213185475e-06, "loss": 0.0395, "step": 5306 }, { "epoch": 2.119808306709265, "grad_norm": 1.0238016688467406, "learning_rate": 2.4013897157905414e-06, "loss": 0.0414, "step": 5308 }, { "epoch": 2.1206070287539935, "grad_norm": 0.9249144986993447, "learning_rate": 2.39742046334971e-06, "loss": 0.038, "step": 5310 }, { "epoch": 2.121405750798722, "grad_norm": 0.8853057009535151, "learning_rate": 2.3934534592920416e-06, "loss": 0.0381, "step": 5312 }, { "epoch": 2.1222044728434506, "grad_norm": 0.9382408504614896, "learning_rate": 2.3894887070446526e-06, "loss": 0.0327, "step": 5314 }, { "epoch": 2.123003194888179, "grad_norm": 0.9682097923317222, "learning_rate": 2.385526210032717e-06, "loss": 0.0345, "step": 5316 }, { "epoch": 2.123801916932907, "grad_norm": 1.007959850484497, "learning_rate": 2.3815659716794544e-06, "loss": 0.0362, "step": 5318 }, { "epoch": 2.1246006389776357, "grad_norm": 0.87498608229541, "learning_rate": 2.3776079954061385e-06, "loss": 0.0314, "step": 5320 }, { "epoch": 2.1253993610223643, "grad_norm": 0.8230237293988656, "learning_rate": 2.3736522846320894e-06, "loss": 0.0329, "step": 5322 }, { "epoch": 2.126198083067093, "grad_norm": 0.893770656802708, "learning_rate": 2.369698842774667e-06, "loss": 0.0379, "step": 5324 }, { "epoch": 2.126996805111821, "grad_norm": 0.9516447751453979, "learning_rate": 2.365747673249268e-06, "loss": 0.0398, "step": 5326 }, { "epoch": 2.1277955271565494, "grad_norm": 0.8303186005151683, "learning_rate": 2.3617987794693358e-06, "loss": 0.0291, "step": 5328 }, { "epoch": 2.128594249201278, "grad_norm": 0.9512354460837069, "learning_rate": 2.3578521648463414e-06, "loss": 0.0326, "step": 5330 }, { "epoch": 2.1293929712460065, "grad_norm": 0.9842083956952964, "learning_rate": 2.3539078327897846e-06, "loss": 0.036, "step": 5332 }, { "epoch": 2.130191693290735, "grad_norm": 0.8378931235387028, "learning_rate": 2.3499657867071973e-06, "loss": 0.0289, "step": 5334 }, { "epoch": 2.130990415335463, "grad_norm": 0.9684963805154867, "learning_rate": 2.3460260300041355e-06, "loss": 0.0319, "step": 5336 }, { "epoch": 2.1317891373801916, "grad_norm": 0.9200136986577663, "learning_rate": 2.342088566084177e-06, "loss": 0.0345, "step": 5338 }, { "epoch": 2.13258785942492, "grad_norm": 0.9761484143353245, "learning_rate": 2.3381533983489213e-06, "loss": 0.0296, "step": 5340 }, { "epoch": 2.1333865814696487, "grad_norm": 0.8882033903773601, "learning_rate": 2.334220530197979e-06, "loss": 0.0305, "step": 5342 }, { "epoch": 2.134185303514377, "grad_norm": 0.8114312897596324, "learning_rate": 2.3302899650289773e-06, "loss": 0.0317, "step": 5344 }, { "epoch": 2.1349840255591053, "grad_norm": 0.9453542646442153, "learning_rate": 2.3263617062375556e-06, "loss": 0.0381, "step": 5346 }, { "epoch": 2.135782747603834, "grad_norm": 1.0338865546370732, "learning_rate": 2.322435757217357e-06, "loss": 0.0361, "step": 5348 }, { "epoch": 2.1365814696485623, "grad_norm": 0.9901385988788762, "learning_rate": 2.3185121213600328e-06, "loss": 0.0391, "step": 5350 }, { "epoch": 2.137380191693291, "grad_norm": 0.9091912381004705, "learning_rate": 2.314590802055232e-06, "loss": 0.0316, "step": 5352 }, { "epoch": 2.1381789137380194, "grad_norm": 0.9808804213459485, "learning_rate": 2.3106718026906073e-06, "loss": 0.0383, "step": 5354 }, { "epoch": 2.1389776357827475, "grad_norm": 0.8455611288247308, "learning_rate": 2.306755126651804e-06, "loss": 0.0345, "step": 5356 }, { "epoch": 2.139776357827476, "grad_norm": 0.8955449128754519, "learning_rate": 2.3028407773224576e-06, "loss": 0.0349, "step": 5358 }, { "epoch": 2.1405750798722045, "grad_norm": 0.9436017969146995, "learning_rate": 2.2989287580841985e-06, "loss": 0.0399, "step": 5360 }, { "epoch": 2.141373801916933, "grad_norm": 0.980164453106363, "learning_rate": 2.2950190723166427e-06, "loss": 0.0412, "step": 5362 }, { "epoch": 2.142172523961661, "grad_norm": 0.8957004905788968, "learning_rate": 2.291111723397391e-06, "loss": 0.0327, "step": 5364 }, { "epoch": 2.1429712460063897, "grad_norm": 0.8853186556177322, "learning_rate": 2.2872067147020204e-06, "loss": 0.0368, "step": 5366 }, { "epoch": 2.143769968051118, "grad_norm": 0.9295930702988132, "learning_rate": 2.2833040496040925e-06, "loss": 0.0375, "step": 5368 }, { "epoch": 2.1445686900958467, "grad_norm": 0.8269626905882466, "learning_rate": 2.2794037314751412e-06, "loss": 0.0377, "step": 5370 }, { "epoch": 2.1453674121405752, "grad_norm": 0.9022026161705567, "learning_rate": 2.275505763684674e-06, "loss": 0.0377, "step": 5372 }, { "epoch": 2.1461661341853033, "grad_norm": 0.867448073493281, "learning_rate": 2.2716101496001663e-06, "loss": 0.0385, "step": 5374 }, { "epoch": 2.146964856230032, "grad_norm": 0.8312737054421381, "learning_rate": 2.267716892587062e-06, "loss": 0.0307, "step": 5376 }, { "epoch": 2.1477635782747604, "grad_norm": 0.881602657629581, "learning_rate": 2.2638259960087665e-06, "loss": 0.0307, "step": 5378 }, { "epoch": 2.148562300319489, "grad_norm": 0.9967389956242162, "learning_rate": 2.2599374632266514e-06, "loss": 0.0356, "step": 5380 }, { "epoch": 2.1493610223642174, "grad_norm": 0.8461757104444423, "learning_rate": 2.2560512976000366e-06, "loss": 0.0347, "step": 5382 }, { "epoch": 2.1501597444089455, "grad_norm": 1.0004876530323552, "learning_rate": 2.252167502486205e-06, "loss": 0.0353, "step": 5384 }, { "epoch": 2.150958466453674, "grad_norm": 0.9652946295733036, "learning_rate": 2.2482860812403887e-06, "loss": 0.0322, "step": 5386 }, { "epoch": 2.1517571884984026, "grad_norm": 0.8292517773055327, "learning_rate": 2.2444070372157724e-06, "loss": 0.0339, "step": 5388 }, { "epoch": 2.152555910543131, "grad_norm": 0.8955686140212332, "learning_rate": 2.2405303737634794e-06, "loss": 0.0339, "step": 5390 }, { "epoch": 2.1533546325878596, "grad_norm": 0.8707856960045441, "learning_rate": 2.2366560942325833e-06, "loss": 0.0378, "step": 5392 }, { "epoch": 2.1541533546325877, "grad_norm": 0.9343304509307997, "learning_rate": 2.232784201970094e-06, "loss": 0.0329, "step": 5394 }, { "epoch": 2.1549520766773163, "grad_norm": 0.8050623246924522, "learning_rate": 2.228914700320967e-06, "loss": 0.0316, "step": 5396 }, { "epoch": 2.155750798722045, "grad_norm": 0.9979211632823265, "learning_rate": 2.2250475926280814e-06, "loss": 0.039, "step": 5398 }, { "epoch": 2.1565495207667733, "grad_norm": 1.1270330722228508, "learning_rate": 2.2211828822322547e-06, "loss": 0.0391, "step": 5400 }, { "epoch": 2.1573482428115014, "grad_norm": 1.1140624044951437, "learning_rate": 2.217320572472232e-06, "loss": 0.0357, "step": 5402 }, { "epoch": 2.15814696485623, "grad_norm": 1.0128344684936457, "learning_rate": 2.2134606666846863e-06, "loss": 0.0402, "step": 5404 }, { "epoch": 2.1589456869009584, "grad_norm": 0.8869317328324887, "learning_rate": 2.209603168204209e-06, "loss": 0.0301, "step": 5406 }, { "epoch": 2.159744408945687, "grad_norm": 0.8719598409358753, "learning_rate": 2.205748080363316e-06, "loss": 0.0327, "step": 5408 }, { "epoch": 2.1605431309904155, "grad_norm": 0.8425272637471012, "learning_rate": 2.2018954064924392e-06, "loss": 0.0317, "step": 5410 }, { "epoch": 2.1613418530351436, "grad_norm": 0.9849794944821841, "learning_rate": 2.1980451499199262e-06, "loss": 0.0352, "step": 5412 }, { "epoch": 2.162140575079872, "grad_norm": 1.0062068145536687, "learning_rate": 2.1941973139720368e-06, "loss": 0.034, "step": 5414 }, { "epoch": 2.1629392971246006, "grad_norm": 1.025042092696479, "learning_rate": 2.190351901972935e-06, "loss": 0.0364, "step": 5416 }, { "epoch": 2.163738019169329, "grad_norm": 0.9038868061338211, "learning_rate": 2.1865089172446928e-06, "loss": 0.0342, "step": 5418 }, { "epoch": 2.1645367412140577, "grad_norm": 0.9811870087519612, "learning_rate": 2.1826683631072932e-06, "loss": 0.0354, "step": 5420 }, { "epoch": 2.165335463258786, "grad_norm": 0.9159807996481562, "learning_rate": 2.1788302428786057e-06, "loss": 0.0343, "step": 5422 }, { "epoch": 2.1661341853035143, "grad_norm": 1.065699305014696, "learning_rate": 2.1749945598744076e-06, "loss": 0.0384, "step": 5424 }, { "epoch": 2.166932907348243, "grad_norm": 0.9974570265355018, "learning_rate": 2.171161317408366e-06, "loss": 0.0333, "step": 5426 }, { "epoch": 2.1677316293929714, "grad_norm": 1.0782436820025358, "learning_rate": 2.1673305187920422e-06, "loss": 0.036, "step": 5428 }, { "epoch": 2.1685303514377, "grad_norm": 1.0684005095258051, "learning_rate": 2.163502167334882e-06, "loss": 0.0406, "step": 5430 }, { "epoch": 2.169329073482428, "grad_norm": 0.933155992724429, "learning_rate": 2.159676266344222e-06, "loss": 0.0385, "step": 5432 }, { "epoch": 2.1701277955271565, "grad_norm": 0.9059016726528943, "learning_rate": 2.155852819125278e-06, "loss": 0.0304, "step": 5434 }, { "epoch": 2.170926517571885, "grad_norm": 0.8433132037155296, "learning_rate": 2.1520318289811493e-06, "loss": 0.0311, "step": 5436 }, { "epoch": 2.1717252396166136, "grad_norm": 0.8527052161750599, "learning_rate": 2.1482132992128125e-06, "loss": 0.0325, "step": 5438 }, { "epoch": 2.1725239616613417, "grad_norm": 0.9111474137493859, "learning_rate": 2.144397233119112e-06, "loss": 0.0326, "step": 5440 }, { "epoch": 2.17332268370607, "grad_norm": 0.8278663259091216, "learning_rate": 2.1405836339967707e-06, "loss": 0.0316, "step": 5442 }, { "epoch": 2.1741214057507987, "grad_norm": 0.8126851634140201, "learning_rate": 2.136772505140382e-06, "loss": 0.0297, "step": 5444 }, { "epoch": 2.1749201277955272, "grad_norm": 0.8533464184624501, "learning_rate": 2.1329638498423978e-06, "loss": 0.0311, "step": 5446 }, { "epoch": 2.1757188498402558, "grad_norm": 0.853939722117267, "learning_rate": 2.1291576713931382e-06, "loss": 0.0367, "step": 5448 }, { "epoch": 2.176517571884984, "grad_norm": 1.1560454061905685, "learning_rate": 2.125353973080782e-06, "loss": 0.0383, "step": 5450 }, { "epoch": 2.1773162939297124, "grad_norm": 1.013070043627522, "learning_rate": 2.121552758191366e-06, "loss": 0.0323, "step": 5452 }, { "epoch": 2.178115015974441, "grad_norm": 1.011960788493202, "learning_rate": 2.117754030008783e-06, "loss": 0.0364, "step": 5454 }, { "epoch": 2.1789137380191694, "grad_norm": 0.8655045405461433, "learning_rate": 2.1139577918147715e-06, "loss": 0.0333, "step": 5456 }, { "epoch": 2.179712460063898, "grad_norm": 0.8272557585170817, "learning_rate": 2.1101640468889255e-06, "loss": 0.0354, "step": 5458 }, { "epoch": 2.180511182108626, "grad_norm": 1.0574986102956363, "learning_rate": 2.1063727985086827e-06, "loss": 0.0333, "step": 5460 }, { "epoch": 2.1813099041533546, "grad_norm": 1.0764551576276693, "learning_rate": 2.102584049949326e-06, "loss": 0.0307, "step": 5462 }, { "epoch": 2.182108626198083, "grad_norm": 0.9248131636959693, "learning_rate": 2.0987978044839707e-06, "loss": 0.0331, "step": 5464 }, { "epoch": 2.1829073482428116, "grad_norm": 0.9555477735825861, "learning_rate": 2.0950140653835814e-06, "loss": 0.032, "step": 5466 }, { "epoch": 2.18370607028754, "grad_norm": 1.0306732674470207, "learning_rate": 2.0912328359169498e-06, "loss": 0.0356, "step": 5468 }, { "epoch": 2.1845047923322682, "grad_norm": 1.0189431478737734, "learning_rate": 2.087454119350703e-06, "loss": 0.0317, "step": 5470 }, { "epoch": 2.1853035143769968, "grad_norm": 0.8526696540231634, "learning_rate": 2.0836779189492925e-06, "loss": 0.034, "step": 5472 }, { "epoch": 2.1861022364217253, "grad_norm": 0.8741979901728469, "learning_rate": 2.079904237975e-06, "loss": 0.0313, "step": 5474 }, { "epoch": 2.186900958466454, "grad_norm": 0.9921256355991658, "learning_rate": 2.0761330796879307e-06, "loss": 0.0356, "step": 5476 }, { "epoch": 2.187699680511182, "grad_norm": 1.0651481599446444, "learning_rate": 2.0723644473460114e-06, "loss": 0.0376, "step": 5478 }, { "epoch": 2.1884984025559104, "grad_norm": 0.9968183069089577, "learning_rate": 2.068598344204981e-06, "loss": 0.0317, "step": 5480 }, { "epoch": 2.189297124600639, "grad_norm": 1.0274581864731451, "learning_rate": 2.064834773518399e-06, "loss": 0.0384, "step": 5482 }, { "epoch": 2.1900958466453675, "grad_norm": 0.7933724201731249, "learning_rate": 2.061073738537635e-06, "loss": 0.0335, "step": 5484 }, { "epoch": 2.190894568690096, "grad_norm": 0.8535934658038107, "learning_rate": 2.0573152425118703e-06, "loss": 0.0366, "step": 5486 }, { "epoch": 2.191693290734824, "grad_norm": 0.8643572127104457, "learning_rate": 2.053559288688086e-06, "loss": 0.0308, "step": 5488 }, { "epoch": 2.1924920127795526, "grad_norm": 0.9310105796168394, "learning_rate": 2.0498058803110775e-06, "loss": 0.0327, "step": 5490 }, { "epoch": 2.193290734824281, "grad_norm": 0.9425151428347762, "learning_rate": 2.0460550206234324e-06, "loss": 0.0367, "step": 5492 }, { "epoch": 2.1940894568690097, "grad_norm": 0.8903710371632394, "learning_rate": 2.042306712865543e-06, "loss": 0.0332, "step": 5494 }, { "epoch": 2.194888178913738, "grad_norm": 0.9619998941867759, "learning_rate": 2.0385609602755878e-06, "loss": 0.0342, "step": 5496 }, { "epoch": 2.1956869009584663, "grad_norm": 0.8331065075000925, "learning_rate": 2.0348177660895473e-06, "loss": 0.0268, "step": 5498 }, { "epoch": 2.196485623003195, "grad_norm": 0.7812360489536925, "learning_rate": 2.031077133541188e-06, "loss": 0.0261, "step": 5500 }, { "epoch": 2.196485623003195, "eval_loss": 0.17987604439258575, "eval_runtime": 417.2456, "eval_samples_per_second": 42.678, "eval_steps_per_second": 5.335, "step": 5500 }, { "epoch": 2.1972843450479234, "grad_norm": 1.0053912019916695, "learning_rate": 2.027339065862064e-06, "loss": 0.0331, "step": 5502 }, { "epoch": 2.198083067092652, "grad_norm": 0.9961342952209272, "learning_rate": 2.02360356628151e-06, "loss": 0.0321, "step": 5504 }, { "epoch": 2.1988817891373804, "grad_norm": 1.1371974183191713, "learning_rate": 2.019870638026648e-06, "loss": 0.0357, "step": 5506 }, { "epoch": 2.1996805111821085, "grad_norm": 1.0401230360921, "learning_rate": 2.016140284322375e-06, "loss": 0.0346, "step": 5508 }, { "epoch": 2.200479233226837, "grad_norm": 0.8835551794696535, "learning_rate": 2.0124125083913636e-06, "loss": 0.0334, "step": 5510 }, { "epoch": 2.2012779552715656, "grad_norm": 0.8819429072389985, "learning_rate": 2.0086873134540626e-06, "loss": 0.0336, "step": 5512 }, { "epoch": 2.202076677316294, "grad_norm": 0.87779251719051, "learning_rate": 2.004964702728688e-06, "loss": 0.0328, "step": 5514 }, { "epoch": 2.202875399361022, "grad_norm": 0.8894119758104073, "learning_rate": 2.0012446794312236e-06, "loss": 0.0284, "step": 5516 }, { "epoch": 2.2036741214057507, "grad_norm": 1.1793198313761428, "learning_rate": 1.997527246775421e-06, "loss": 0.0375, "step": 5518 }, { "epoch": 2.2044728434504792, "grad_norm": 0.9932553166480073, "learning_rate": 1.9938124079727874e-06, "loss": 0.0345, "step": 5520 }, { "epoch": 2.2052715654952078, "grad_norm": 1.0489913796282613, "learning_rate": 1.9901001662325946e-06, "loss": 0.0339, "step": 5522 }, { "epoch": 2.2060702875399363, "grad_norm": 0.9036884868208198, "learning_rate": 1.9863905247618702e-06, "loss": 0.0318, "step": 5524 }, { "epoch": 2.2068690095846644, "grad_norm": 1.0039658882019877, "learning_rate": 1.9826834867653956e-06, "loss": 0.0378, "step": 5526 }, { "epoch": 2.207667731629393, "grad_norm": 0.9584370299885461, "learning_rate": 1.9789790554456977e-06, "loss": 0.0351, "step": 5528 }, { "epoch": 2.2084664536741214, "grad_norm": 1.0693519590795688, "learning_rate": 1.9752772340030584e-06, "loss": 0.033, "step": 5530 }, { "epoch": 2.20926517571885, "grad_norm": 0.9311182383118676, "learning_rate": 1.9715780256355014e-06, "loss": 0.0315, "step": 5532 }, { "epoch": 2.2100638977635785, "grad_norm": 0.886315430031371, "learning_rate": 1.967881433538795e-06, "loss": 0.0321, "step": 5534 }, { "epoch": 2.2108626198083066, "grad_norm": 0.9667837860549522, "learning_rate": 1.9641874609064443e-06, "loss": 0.0298, "step": 5536 }, { "epoch": 2.211661341853035, "grad_norm": 0.9696824443915598, "learning_rate": 1.960496110929694e-06, "loss": 0.0348, "step": 5538 }, { "epoch": 2.2124600638977636, "grad_norm": 0.9405160870011481, "learning_rate": 1.9568073867975217e-06, "loss": 0.0307, "step": 5540 }, { "epoch": 2.213258785942492, "grad_norm": 0.9078061671374033, "learning_rate": 1.9531212916966395e-06, "loss": 0.0337, "step": 5542 }, { "epoch": 2.2140575079872207, "grad_norm": 0.8927963016211171, "learning_rate": 1.9494378288114816e-06, "loss": 0.0318, "step": 5544 }, { "epoch": 2.2148562300319488, "grad_norm": 0.8652473614478016, "learning_rate": 1.945757001324215e-06, "loss": 0.029, "step": 5546 }, { "epoch": 2.2156549520766773, "grad_norm": 1.0041756596183897, "learning_rate": 1.9420788124147266e-06, "loss": 0.0353, "step": 5548 }, { "epoch": 2.216453674121406, "grad_norm": 0.8814650902754025, "learning_rate": 1.938403265260625e-06, "loss": 0.0325, "step": 5550 }, { "epoch": 2.2172523961661343, "grad_norm": 0.9335249441231224, "learning_rate": 1.9347303630372373e-06, "loss": 0.0349, "step": 5552 }, { "epoch": 2.2180511182108624, "grad_norm": 0.889201823881265, "learning_rate": 1.931060108917601e-06, "loss": 0.0344, "step": 5554 }, { "epoch": 2.218849840255591, "grad_norm": 0.8902317631970373, "learning_rate": 1.92739250607247e-06, "loss": 0.0342, "step": 5556 }, { "epoch": 2.2196485623003195, "grad_norm": 0.8614037453548934, "learning_rate": 1.9237275576703125e-06, "loss": 0.0346, "step": 5558 }, { "epoch": 2.220447284345048, "grad_norm": 1.121859600487758, "learning_rate": 1.9200652668772924e-06, "loss": 0.0357, "step": 5560 }, { "epoch": 2.2212460063897765, "grad_norm": 0.9204383985383575, "learning_rate": 1.9164056368572847e-06, "loss": 0.0327, "step": 5562 }, { "epoch": 2.2220447284345046, "grad_norm": 1.069244881184746, "learning_rate": 1.912748670771865e-06, "loss": 0.035, "step": 5564 }, { "epoch": 2.222843450479233, "grad_norm": 0.9412541774316218, "learning_rate": 1.909094371780309e-06, "loss": 0.034, "step": 5566 }, { "epoch": 2.2236421725239617, "grad_norm": 1.1270763500131213, "learning_rate": 1.9054427430395828e-06, "loss": 0.0376, "step": 5568 }, { "epoch": 2.22444089456869, "grad_norm": 1.0152263236453896, "learning_rate": 1.9017937877043496e-06, "loss": 0.035, "step": 5570 }, { "epoch": 2.2252396166134187, "grad_norm": 1.0576930750635203, "learning_rate": 1.8981475089269641e-06, "loss": 0.0348, "step": 5572 }, { "epoch": 2.226038338658147, "grad_norm": 0.9331195909365617, "learning_rate": 1.8945039098574658e-06, "loss": 0.0322, "step": 5574 }, { "epoch": 2.2268370607028753, "grad_norm": 0.9101266572569631, "learning_rate": 1.890862993643583e-06, "loss": 0.0367, "step": 5576 }, { "epoch": 2.227635782747604, "grad_norm": 0.958445615782898, "learning_rate": 1.8872247634307205e-06, "loss": 0.0357, "step": 5578 }, { "epoch": 2.2284345047923324, "grad_norm": 0.9925711413975752, "learning_rate": 1.883589222361965e-06, "loss": 0.0369, "step": 5580 }, { "epoch": 2.229233226837061, "grad_norm": 1.0786281449106017, "learning_rate": 1.8799563735780873e-06, "loss": 0.0329, "step": 5582 }, { "epoch": 2.230031948881789, "grad_norm": 0.924349636618338, "learning_rate": 1.8763262202175204e-06, "loss": 0.031, "step": 5584 }, { "epoch": 2.2308306709265175, "grad_norm": 0.8786179873798488, "learning_rate": 1.8726987654163753e-06, "loss": 0.0284, "step": 5586 }, { "epoch": 2.231629392971246, "grad_norm": 0.8699008504990049, "learning_rate": 1.8690740123084316e-06, "loss": 0.0329, "step": 5588 }, { "epoch": 2.2324281150159746, "grad_norm": 0.9317629329233429, "learning_rate": 1.8654519640251334e-06, "loss": 0.0334, "step": 5590 }, { "epoch": 2.2332268370607027, "grad_norm": 0.858865932937765, "learning_rate": 1.8618326236955908e-06, "loss": 0.032, "step": 5592 }, { "epoch": 2.234025559105431, "grad_norm": 0.9990448382210804, "learning_rate": 1.858215994446569e-06, "loss": 0.0358, "step": 5594 }, { "epoch": 2.2348242811501597, "grad_norm": 0.9366419998033949, "learning_rate": 1.8546020794024955e-06, "loss": 0.032, "step": 5596 }, { "epoch": 2.2356230031948883, "grad_norm": 0.9425282547585793, "learning_rate": 1.8509908816854527e-06, "loss": 0.0348, "step": 5598 }, { "epoch": 2.236421725239617, "grad_norm": 0.9345193697622752, "learning_rate": 1.8473824044151762e-06, "loss": 0.0348, "step": 5600 }, { "epoch": 2.237220447284345, "grad_norm": 0.9048364378406388, "learning_rate": 1.843776650709046e-06, "loss": 0.0292, "step": 5602 }, { "epoch": 2.2380191693290734, "grad_norm": 1.0024652573185826, "learning_rate": 1.8401736236820933e-06, "loss": 0.0287, "step": 5604 }, { "epoch": 2.238817891373802, "grad_norm": 1.141968574912131, "learning_rate": 1.836573326446997e-06, "loss": 0.0359, "step": 5606 }, { "epoch": 2.2396166134185305, "grad_norm": 1.00331976516427, "learning_rate": 1.8329757621140748e-06, "loss": 0.0341, "step": 5608 }, { "epoch": 2.2404153354632586, "grad_norm": 0.9747936880200038, "learning_rate": 1.8293809337912789e-06, "loss": 0.0379, "step": 5610 }, { "epoch": 2.241214057507987, "grad_norm": 0.9903639773982147, "learning_rate": 1.8257888445842026e-06, "loss": 0.0335, "step": 5612 }, { "epoch": 2.2420127795527156, "grad_norm": 0.9391122747126337, "learning_rate": 1.8221994975960739e-06, "loss": 0.028, "step": 5614 }, { "epoch": 2.242811501597444, "grad_norm": 0.9322142611011831, "learning_rate": 1.81861289592775e-06, "loss": 0.0332, "step": 5616 }, { "epoch": 2.2436102236421727, "grad_norm": 0.926413396097988, "learning_rate": 1.815029042677714e-06, "loss": 0.0363, "step": 5618 }, { "epoch": 2.244408945686901, "grad_norm": 0.9401248489200065, "learning_rate": 1.8114479409420783e-06, "loss": 0.0349, "step": 5620 }, { "epoch": 2.2452076677316293, "grad_norm": 0.9208704620021309, "learning_rate": 1.8078695938145768e-06, "loss": 0.0315, "step": 5622 }, { "epoch": 2.246006389776358, "grad_norm": 0.9916844141557866, "learning_rate": 1.8042940043865658e-06, "loss": 0.0314, "step": 5624 }, { "epoch": 2.2468051118210863, "grad_norm": 1.0110707106239305, "learning_rate": 1.8007211757470117e-06, "loss": 0.0371, "step": 5626 }, { "epoch": 2.247603833865815, "grad_norm": 1.1436090733823392, "learning_rate": 1.7971511109825064e-06, "loss": 0.0347, "step": 5628 }, { "epoch": 2.248402555910543, "grad_norm": 0.9259112088416946, "learning_rate": 1.7935838131772481e-06, "loss": 0.0338, "step": 5630 }, { "epoch": 2.2492012779552715, "grad_norm": 0.9880736506122585, "learning_rate": 1.7900192854130465e-06, "loss": 0.035, "step": 5632 }, { "epoch": 2.25, "grad_norm": 0.8024555556579349, "learning_rate": 1.786457530769314e-06, "loss": 0.0328, "step": 5634 }, { "epoch": 2.2507987220447285, "grad_norm": 0.8899527470220918, "learning_rate": 1.7828985523230725e-06, "loss": 0.0334, "step": 5636 }, { "epoch": 2.251597444089457, "grad_norm": 0.9363903078372263, "learning_rate": 1.779342353148943e-06, "loss": 0.0348, "step": 5638 }, { "epoch": 2.252396166134185, "grad_norm": 0.9060811105083338, "learning_rate": 1.7757889363191484e-06, "loss": 0.034, "step": 5640 }, { "epoch": 2.2531948881789137, "grad_norm": 1.025351909976564, "learning_rate": 1.7722383049035019e-06, "loss": 0.0391, "step": 5642 }, { "epoch": 2.253993610223642, "grad_norm": 1.0426242945439066, "learning_rate": 1.7686904619694156e-06, "loss": 0.0356, "step": 5644 }, { "epoch": 2.2547923322683707, "grad_norm": 0.8919813927820728, "learning_rate": 1.7651454105818915e-06, "loss": 0.0306, "step": 5646 }, { "epoch": 2.255591054313099, "grad_norm": 0.956484596890347, "learning_rate": 1.7616031538035189e-06, "loss": 0.0354, "step": 5648 }, { "epoch": 2.2563897763578273, "grad_norm": 0.90756103740582, "learning_rate": 1.758063694694474e-06, "loss": 0.0336, "step": 5650 }, { "epoch": 2.257188498402556, "grad_norm": 0.8099489111484716, "learning_rate": 1.7545270363125155e-06, "loss": 0.029, "step": 5652 }, { "epoch": 2.2579872204472844, "grad_norm": 1.0364198349070215, "learning_rate": 1.7509931817129821e-06, "loss": 0.0343, "step": 5654 }, { "epoch": 2.258785942492013, "grad_norm": 0.87676918992303, "learning_rate": 1.7474621339487925e-06, "loss": 0.032, "step": 5656 }, { "epoch": 2.2595846645367414, "grad_norm": 0.968068180478562, "learning_rate": 1.7439338960704355e-06, "loss": 0.0318, "step": 5658 }, { "epoch": 2.2603833865814695, "grad_norm": 0.9143786360310998, "learning_rate": 1.7404084711259777e-06, "loss": 0.0306, "step": 5660 }, { "epoch": 2.261182108626198, "grad_norm": 0.983712060636582, "learning_rate": 1.736885862161054e-06, "loss": 0.0334, "step": 5662 }, { "epoch": 2.2619808306709266, "grad_norm": 0.9626625864043248, "learning_rate": 1.7333660722188667e-06, "loss": 0.034, "step": 5664 }, { "epoch": 2.262779552715655, "grad_norm": 0.9609884877774749, "learning_rate": 1.7298491043401794e-06, "loss": 0.0297, "step": 5666 }, { "epoch": 2.263578274760383, "grad_norm": 0.9396750240874573, "learning_rate": 1.7263349615633228e-06, "loss": 0.0332, "step": 5668 }, { "epoch": 2.2643769968051117, "grad_norm": 0.9290418992667302, "learning_rate": 1.7228236469241837e-06, "loss": 0.0298, "step": 5670 }, { "epoch": 2.2651757188498403, "grad_norm": 0.9095375069866244, "learning_rate": 1.7193151634562071e-06, "loss": 0.0304, "step": 5672 }, { "epoch": 2.265974440894569, "grad_norm": 1.010681342107459, "learning_rate": 1.715809514190392e-06, "loss": 0.0307, "step": 5674 }, { "epoch": 2.2667731629392973, "grad_norm": 1.0154601770035847, "learning_rate": 1.712306702155288e-06, "loss": 0.0306, "step": 5676 }, { "epoch": 2.2675718849840254, "grad_norm": 1.0108750470875814, "learning_rate": 1.7088067303769946e-06, "loss": 0.0325, "step": 5678 }, { "epoch": 2.268370607028754, "grad_norm": 1.0636729615862845, "learning_rate": 1.7053096018791588e-06, "loss": 0.0409, "step": 5680 }, { "epoch": 2.2691693290734825, "grad_norm": 0.9202813920561608, "learning_rate": 1.7018153196829662e-06, "loss": 0.0332, "step": 5682 }, { "epoch": 2.269968051118211, "grad_norm": 1.214698931969398, "learning_rate": 1.6983238868071489e-06, "loss": 0.0338, "step": 5684 }, { "epoch": 2.270766773162939, "grad_norm": 0.8519680135270461, "learning_rate": 1.6948353062679752e-06, "loss": 0.0309, "step": 5686 }, { "epoch": 2.2715654952076676, "grad_norm": 0.8609963780131156, "learning_rate": 1.691349581079249e-06, "loss": 0.0316, "step": 5688 }, { "epoch": 2.272364217252396, "grad_norm": 1.03764079744846, "learning_rate": 1.687866714252311e-06, "loss": 0.035, "step": 5690 }, { "epoch": 2.2731629392971247, "grad_norm": 0.9903945590247691, "learning_rate": 1.6843867087960252e-06, "loss": 0.0338, "step": 5692 }, { "epoch": 2.273961661341853, "grad_norm": 0.9835600544969014, "learning_rate": 1.6809095677167897e-06, "loss": 0.033, "step": 5694 }, { "epoch": 2.2747603833865817, "grad_norm": 0.9107299972526909, "learning_rate": 1.6774352940185269e-06, "loss": 0.0329, "step": 5696 }, { "epoch": 2.27555910543131, "grad_norm": 1.0179931131158924, "learning_rate": 1.6739638907026806e-06, "loss": 0.0324, "step": 5698 }, { "epoch": 2.2763578274760383, "grad_norm": 0.8564341030978901, "learning_rate": 1.6704953607682161e-06, "loss": 0.0371, "step": 5700 }, { "epoch": 2.277156549520767, "grad_norm": 0.9606560664042086, "learning_rate": 1.6670297072116165e-06, "loss": 0.0357, "step": 5702 }, { "epoch": 2.2779552715654954, "grad_norm": 0.970530456375879, "learning_rate": 1.663566933026879e-06, "loss": 0.0368, "step": 5704 }, { "epoch": 2.2787539936102235, "grad_norm": 0.823503415647808, "learning_rate": 1.6601070412055154e-06, "loss": 0.0308, "step": 5706 }, { "epoch": 2.279552715654952, "grad_norm": 0.8849782867422896, "learning_rate": 1.6566500347365421e-06, "loss": 0.0339, "step": 5708 }, { "epoch": 2.2803514376996805, "grad_norm": 0.9443290922040906, "learning_rate": 1.6531959166064893e-06, "loss": 0.0382, "step": 5710 }, { "epoch": 2.281150159744409, "grad_norm": 0.9814437843516525, "learning_rate": 1.6497446897993885e-06, "loss": 0.0314, "step": 5712 }, { "epoch": 2.2819488817891376, "grad_norm": 0.9443631962649643, "learning_rate": 1.6462963572967756e-06, "loss": 0.0329, "step": 5714 }, { "epoch": 2.2827476038338657, "grad_norm": 0.9206008196409816, "learning_rate": 1.6428509220776812e-06, "loss": 0.0327, "step": 5716 }, { "epoch": 2.283546325878594, "grad_norm": 0.9303152959167551, "learning_rate": 1.6394083871186362e-06, "loss": 0.0326, "step": 5718 }, { "epoch": 2.2843450479233227, "grad_norm": 0.9131910544498667, "learning_rate": 1.6359687553936714e-06, "loss": 0.0326, "step": 5720 }, { "epoch": 2.2851437699680512, "grad_norm": 1.0967870027836903, "learning_rate": 1.6325320298742986e-06, "loss": 0.0395, "step": 5722 }, { "epoch": 2.2859424920127793, "grad_norm": 0.8427528083071858, "learning_rate": 1.6290982135295269e-06, "loss": 0.0331, "step": 5724 }, { "epoch": 2.286741214057508, "grad_norm": 0.8644733263443983, "learning_rate": 1.6256673093258485e-06, "loss": 0.0345, "step": 5726 }, { "epoch": 2.2875399361022364, "grad_norm": 0.9392972386925895, "learning_rate": 1.6222393202272414e-06, "loss": 0.0304, "step": 5728 }, { "epoch": 2.288338658146965, "grad_norm": 0.9017990006181082, "learning_rate": 1.618814249195167e-06, "loss": 0.0314, "step": 5730 }, { "epoch": 2.2891373801916934, "grad_norm": 0.7993416720434479, "learning_rate": 1.6153920991885591e-06, "loss": 0.0286, "step": 5732 }, { "epoch": 2.289936102236422, "grad_norm": 0.9176139011413571, "learning_rate": 1.6119728731638345e-06, "loss": 0.0369, "step": 5734 }, { "epoch": 2.29073482428115, "grad_norm": 0.874001701788666, "learning_rate": 1.6085565740748825e-06, "loss": 0.033, "step": 5736 }, { "epoch": 2.2915335463258786, "grad_norm": 0.9359227443696032, "learning_rate": 1.605143204873064e-06, "loss": 0.0377, "step": 5738 }, { "epoch": 2.292332268370607, "grad_norm": 0.8690038620673686, "learning_rate": 1.6017327685072047e-06, "loss": 0.031, "step": 5740 }, { "epoch": 2.2931309904153356, "grad_norm": 1.055903086127581, "learning_rate": 1.5983252679236006e-06, "loss": 0.0355, "step": 5742 }, { "epoch": 2.2939297124600637, "grad_norm": 1.1058038934128054, "learning_rate": 1.5949207060660138e-06, "loss": 0.0328, "step": 5744 }, { "epoch": 2.2947284345047922, "grad_norm": 0.9784683752210179, "learning_rate": 1.5915190858756635e-06, "loss": 0.0334, "step": 5746 }, { "epoch": 2.2955271565495208, "grad_norm": 0.9248930707330709, "learning_rate": 1.5881204102912262e-06, "loss": 0.0334, "step": 5748 }, { "epoch": 2.2963258785942493, "grad_norm": 1.028126750218338, "learning_rate": 1.5847246822488388e-06, "loss": 0.0344, "step": 5750 }, { "epoch": 2.297124600638978, "grad_norm": 0.8999392063400926, "learning_rate": 1.581331904682089e-06, "loss": 0.0293, "step": 5752 }, { "epoch": 2.297923322683706, "grad_norm": 0.96073417050456, "learning_rate": 1.5779420805220185e-06, "loss": 0.0332, "step": 5754 }, { "epoch": 2.2987220447284344, "grad_norm": 0.9826008607081279, "learning_rate": 1.574555212697113e-06, "loss": 0.0338, "step": 5756 }, { "epoch": 2.299520766773163, "grad_norm": 0.9259548407344154, "learning_rate": 1.5711713041333077e-06, "loss": 0.0355, "step": 5758 }, { "epoch": 2.3003194888178915, "grad_norm": 0.999628943497661, "learning_rate": 1.5677903577539806e-06, "loss": 0.0353, "step": 5760 }, { "epoch": 2.3011182108626196, "grad_norm": 1.004302579075343, "learning_rate": 1.5644123764799517e-06, "loss": 0.0289, "step": 5762 }, { "epoch": 2.301916932907348, "grad_norm": 1.0927223195156304, "learning_rate": 1.561037363229475e-06, "loss": 0.0378, "step": 5764 }, { "epoch": 2.3027156549520766, "grad_norm": 0.9410698542998797, "learning_rate": 1.5576653209182436e-06, "loss": 0.0331, "step": 5766 }, { "epoch": 2.303514376996805, "grad_norm": 0.8643954286211598, "learning_rate": 1.5542962524593869e-06, "loss": 0.0317, "step": 5768 }, { "epoch": 2.3043130990415337, "grad_norm": 1.0178268360718439, "learning_rate": 1.550930160763462e-06, "loss": 0.0333, "step": 5770 }, { "epoch": 2.3051118210862622, "grad_norm": 1.0794526119781374, "learning_rate": 1.547567048738452e-06, "loss": 0.037, "step": 5772 }, { "epoch": 2.3059105431309903, "grad_norm": 0.9306102687878339, "learning_rate": 1.5442069192897695e-06, "loss": 0.0319, "step": 5774 }, { "epoch": 2.306709265175719, "grad_norm": 0.9959647755317999, "learning_rate": 1.54084977532025e-06, "loss": 0.0304, "step": 5776 }, { "epoch": 2.3075079872204474, "grad_norm": 1.178192165622762, "learning_rate": 1.5374956197301494e-06, "loss": 0.0359, "step": 5778 }, { "epoch": 2.308306709265176, "grad_norm": 0.970629316470673, "learning_rate": 1.5341444554171397e-06, "loss": 0.0323, "step": 5780 }, { "epoch": 2.309105431309904, "grad_norm": 0.9995642863840597, "learning_rate": 1.5307962852763115e-06, "loss": 0.0314, "step": 5782 }, { "epoch": 2.3099041533546325, "grad_norm": 0.8635977267044507, "learning_rate": 1.5274511122001684e-06, "loss": 0.0292, "step": 5784 }, { "epoch": 2.310702875399361, "grad_norm": 0.941234774919912, "learning_rate": 1.524108939078624e-06, "loss": 0.0335, "step": 5786 }, { "epoch": 2.3115015974440896, "grad_norm": 0.9670030899801637, "learning_rate": 1.5207697687990004e-06, "loss": 0.0355, "step": 5788 }, { "epoch": 2.312300319488818, "grad_norm": 1.0863404667557413, "learning_rate": 1.5174336042460264e-06, "loss": 0.0359, "step": 5790 }, { "epoch": 2.313099041533546, "grad_norm": 0.9487702530128579, "learning_rate": 1.5141004483018323e-06, "loss": 0.0303, "step": 5792 }, { "epoch": 2.3138977635782747, "grad_norm": 0.8581962375915709, "learning_rate": 1.5107703038459531e-06, "loss": 0.0329, "step": 5794 }, { "epoch": 2.3146964856230032, "grad_norm": 0.9463896237746494, "learning_rate": 1.5074431737553158e-06, "loss": 0.0344, "step": 5796 }, { "epoch": 2.3154952076677318, "grad_norm": 1.0247492412011727, "learning_rate": 1.5041190609042477e-06, "loss": 0.0322, "step": 5798 }, { "epoch": 2.31629392971246, "grad_norm": 0.8881026553630956, "learning_rate": 1.5007979681644696e-06, "loss": 0.033, "step": 5800 }, { "epoch": 2.3170926517571884, "grad_norm": 0.9632852088358282, "learning_rate": 1.4974798984050941e-06, "loss": 0.0348, "step": 5802 }, { "epoch": 2.317891373801917, "grad_norm": 0.8335404581898861, "learning_rate": 1.4941648544926164e-06, "loss": 0.0312, "step": 5804 }, { "epoch": 2.3186900958466454, "grad_norm": 0.9511476448808138, "learning_rate": 1.4908528392909233e-06, "loss": 0.0343, "step": 5806 }, { "epoch": 2.319488817891374, "grad_norm": 0.9453004464927225, "learning_rate": 1.4875438556612836e-06, "loss": 0.0305, "step": 5808 }, { "epoch": 2.3202875399361025, "grad_norm": 0.9770824243294399, "learning_rate": 1.4842379064623474e-06, "loss": 0.0316, "step": 5810 }, { "epoch": 2.3210862619808306, "grad_norm": 0.9354726575321289, "learning_rate": 1.4809349945501422e-06, "loss": 0.03, "step": 5812 }, { "epoch": 2.321884984025559, "grad_norm": 0.9809930908311559, "learning_rate": 1.4776351227780732e-06, "loss": 0.0327, "step": 5814 }, { "epoch": 2.3226837060702876, "grad_norm": 0.9812165421073059, "learning_rate": 1.474338293996917e-06, "loss": 0.0291, "step": 5816 }, { "epoch": 2.323482428115016, "grad_norm": 1.0065291780758312, "learning_rate": 1.4710445110548255e-06, "loss": 0.0349, "step": 5818 }, { "epoch": 2.3242811501597442, "grad_norm": 1.0293319955302287, "learning_rate": 1.467753776797312e-06, "loss": 0.0331, "step": 5820 }, { "epoch": 2.3250798722044728, "grad_norm": 0.9835892465385753, "learning_rate": 1.4644660940672628e-06, "loss": 0.0328, "step": 5822 }, { "epoch": 2.3258785942492013, "grad_norm": 0.8998383284814865, "learning_rate": 1.4611814657049257e-06, "loss": 0.0329, "step": 5824 }, { "epoch": 2.32667731629393, "grad_norm": 0.9572453991906558, "learning_rate": 1.4578998945479084e-06, "loss": 0.0328, "step": 5826 }, { "epoch": 2.3274760383386583, "grad_norm": 1.0608318737189215, "learning_rate": 1.4546213834311823e-06, "loss": 0.0362, "step": 5828 }, { "epoch": 2.3282747603833864, "grad_norm": 0.9194197731835649, "learning_rate": 1.4513459351870669e-06, "loss": 0.0313, "step": 5830 }, { "epoch": 2.329073482428115, "grad_norm": 0.9174893410490872, "learning_rate": 1.4480735526452427e-06, "loss": 0.0325, "step": 5832 }, { "epoch": 2.3298722044728435, "grad_norm": 0.8639598370634627, "learning_rate": 1.4448042386327394e-06, "loss": 0.0309, "step": 5834 }, { "epoch": 2.330670926517572, "grad_norm": 1.0738100152461907, "learning_rate": 1.4415379959739356e-06, "loss": 0.0376, "step": 5836 }, { "epoch": 2.3314696485623, "grad_norm": 0.8696561197568158, "learning_rate": 1.4382748274905573e-06, "loss": 0.0296, "step": 5838 }, { "epoch": 2.3322683706070286, "grad_norm": 1.0538751425130783, "learning_rate": 1.4350147360016743e-06, "loss": 0.0368, "step": 5840 }, { "epoch": 2.333067092651757, "grad_norm": 1.0257476904138636, "learning_rate": 1.4317577243236968e-06, "loss": 0.0365, "step": 5842 }, { "epoch": 2.3338658146964857, "grad_norm": 0.8938198309680814, "learning_rate": 1.4285037952703785e-06, "loss": 0.0357, "step": 5844 }, { "epoch": 2.334664536741214, "grad_norm": 0.9464804274403688, "learning_rate": 1.425252951652803e-06, "loss": 0.0293, "step": 5846 }, { "epoch": 2.3354632587859427, "grad_norm": 0.9175488309023649, "learning_rate": 1.4220051962793952e-06, "loss": 0.038, "step": 5848 }, { "epoch": 2.336261980830671, "grad_norm": 0.88533327061595, "learning_rate": 1.4187605319559078e-06, "loss": 0.0301, "step": 5850 }, { "epoch": 2.3370607028753994, "grad_norm": 0.9127303239271386, "learning_rate": 1.4155189614854275e-06, "loss": 0.0319, "step": 5852 }, { "epoch": 2.337859424920128, "grad_norm": 0.9168807782532327, "learning_rate": 1.4122804876683616e-06, "loss": 0.0308, "step": 5854 }, { "epoch": 2.3386581469648564, "grad_norm": 1.1257679223953572, "learning_rate": 1.4090451133024473e-06, "loss": 0.0388, "step": 5856 }, { "epoch": 2.3394568690095845, "grad_norm": 0.8696474054620578, "learning_rate": 1.4058128411827432e-06, "loss": 0.0355, "step": 5858 }, { "epoch": 2.340255591054313, "grad_norm": 0.9000167630617952, "learning_rate": 1.4025836741016274e-06, "loss": 0.0341, "step": 5860 }, { "epoch": 2.3410543130990416, "grad_norm": 0.9240511974633537, "learning_rate": 1.399357614848796e-06, "loss": 0.0307, "step": 5862 }, { "epoch": 2.34185303514377, "grad_norm": 0.8909434883498069, "learning_rate": 1.3961346662112585e-06, "loss": 0.0341, "step": 5864 }, { "epoch": 2.3426517571884986, "grad_norm": 1.0135872043495942, "learning_rate": 1.3929148309733392e-06, "loss": 0.0286, "step": 5866 }, { "epoch": 2.3434504792332267, "grad_norm": 0.9580825595304057, "learning_rate": 1.3896981119166741e-06, "loss": 0.0346, "step": 5868 }, { "epoch": 2.344249201277955, "grad_norm": 0.8615986543919892, "learning_rate": 1.3864845118202013e-06, "loss": 0.034, "step": 5870 }, { "epoch": 2.3450479233226837, "grad_norm": 1.1873836274839265, "learning_rate": 1.3832740334601692e-06, "loss": 0.0317, "step": 5872 }, { "epoch": 2.3458466453674123, "grad_norm": 0.8711721095623617, "learning_rate": 1.3800666796101291e-06, "loss": 0.0331, "step": 5874 }, { "epoch": 2.3466453674121404, "grad_norm": 1.010832936665381, "learning_rate": 1.3768624530409324e-06, "loss": 0.0353, "step": 5876 }, { "epoch": 2.347444089456869, "grad_norm": 0.9937261665959263, "learning_rate": 1.373661356520727e-06, "loss": 0.0362, "step": 5878 }, { "epoch": 2.3482428115015974, "grad_norm": 0.9184012307526624, "learning_rate": 1.3704633928149575e-06, "loss": 0.0287, "step": 5880 }, { "epoch": 2.349041533546326, "grad_norm": 0.9982535009466615, "learning_rate": 1.3672685646863653e-06, "loss": 0.033, "step": 5882 }, { "epoch": 2.3498402555910545, "grad_norm": 0.9164440397933561, "learning_rate": 1.3640768748949811e-06, "loss": 0.0293, "step": 5884 }, { "epoch": 2.3506389776357826, "grad_norm": 1.002185223742666, "learning_rate": 1.360888326198121e-06, "loss": 0.0341, "step": 5886 }, { "epoch": 2.351437699680511, "grad_norm": 0.9987751966058985, "learning_rate": 1.3577029213503911e-06, "loss": 0.0321, "step": 5888 }, { "epoch": 2.3522364217252396, "grad_norm": 0.9634859351636175, "learning_rate": 1.354520663103681e-06, "loss": 0.033, "step": 5890 }, { "epoch": 2.353035143769968, "grad_norm": 1.0094199782296815, "learning_rate": 1.351341554207163e-06, "loss": 0.0356, "step": 5892 }, { "epoch": 2.3538338658146967, "grad_norm": 0.953311272783066, "learning_rate": 1.3481655974072845e-06, "loss": 0.0333, "step": 5894 }, { "epoch": 2.3546325878594248, "grad_norm": 0.9161467503492303, "learning_rate": 1.3449927954477732e-06, "loss": 0.0305, "step": 5896 }, { "epoch": 2.3554313099041533, "grad_norm": 1.0625770728133241, "learning_rate": 1.3418231510696312e-06, "loss": 0.0335, "step": 5898 }, { "epoch": 2.356230031948882, "grad_norm": 0.9368140797679172, "learning_rate": 1.3386566670111339e-06, "loss": 0.0351, "step": 5900 }, { "epoch": 2.3570287539936103, "grad_norm": 0.9236886045000889, "learning_rate": 1.3354933460078217e-06, "loss": 0.0314, "step": 5902 }, { "epoch": 2.357827476038339, "grad_norm": 1.137543124830593, "learning_rate": 1.3323331907925046e-06, "loss": 0.038, "step": 5904 }, { "epoch": 2.358626198083067, "grad_norm": 0.9008576498883231, "learning_rate": 1.3291762040952626e-06, "loss": 0.0273, "step": 5906 }, { "epoch": 2.3594249201277955, "grad_norm": 0.9264640068095965, "learning_rate": 1.3260223886434342e-06, "loss": 0.0284, "step": 5908 }, { "epoch": 2.360223642172524, "grad_norm": 1.0330425296416088, "learning_rate": 1.3228717471616153e-06, "loss": 0.0359, "step": 5910 }, { "epoch": 2.3610223642172525, "grad_norm": 0.8006806552776662, "learning_rate": 1.319724282371664e-06, "loss": 0.0328, "step": 5912 }, { "epoch": 2.3618210862619806, "grad_norm": 0.9719478594390628, "learning_rate": 1.3165799969926928e-06, "loss": 0.0359, "step": 5914 }, { "epoch": 2.362619808306709, "grad_norm": 0.9089803281939999, "learning_rate": 1.3134388937410697e-06, "loss": 0.0296, "step": 5916 }, { "epoch": 2.3634185303514377, "grad_norm": 1.0859836987195648, "learning_rate": 1.3103009753304085e-06, "loss": 0.0289, "step": 5918 }, { "epoch": 2.364217252396166, "grad_norm": 0.9338786842895157, "learning_rate": 1.307166244471576e-06, "loss": 0.0303, "step": 5920 }, { "epoch": 2.3650159744408947, "grad_norm": 0.9352879035371544, "learning_rate": 1.3040347038726831e-06, "loss": 0.0313, "step": 5922 }, { "epoch": 2.365814696485623, "grad_norm": 0.9193300328510401, "learning_rate": 1.3009063562390866e-06, "loss": 0.0308, "step": 5924 }, { "epoch": 2.3666134185303513, "grad_norm": 0.8330378897517992, "learning_rate": 1.297781204273385e-06, "loss": 0.0277, "step": 5926 }, { "epoch": 2.36741214057508, "grad_norm": 1.0098776366020665, "learning_rate": 1.2946592506754097e-06, "loss": 0.0292, "step": 5928 }, { "epoch": 2.3682108626198084, "grad_norm": 0.9244538147214044, "learning_rate": 1.2915404981422386e-06, "loss": 0.0358, "step": 5930 }, { "epoch": 2.369009584664537, "grad_norm": 1.009009426826755, "learning_rate": 1.28842494936818e-06, "loss": 0.0307, "step": 5932 }, { "epoch": 2.369808306709265, "grad_norm": 0.8599683395682384, "learning_rate": 1.2853126070447709e-06, "loss": 0.0312, "step": 5934 }, { "epoch": 2.3706070287539935, "grad_norm": 1.2504223376409314, "learning_rate": 1.282203473860783e-06, "loss": 0.0324, "step": 5936 }, { "epoch": 2.371405750798722, "grad_norm": 0.9600194803305533, "learning_rate": 1.2790975525022136e-06, "loss": 0.0332, "step": 5938 }, { "epoch": 2.3722044728434506, "grad_norm": 0.8579303834114889, "learning_rate": 1.275994845652288e-06, "loss": 0.028, "step": 5940 }, { "epoch": 2.373003194888179, "grad_norm": 0.9767263492376159, "learning_rate": 1.2728953559914486e-06, "loss": 0.0343, "step": 5942 }, { "epoch": 2.373801916932907, "grad_norm": 0.9810633979289892, "learning_rate": 1.2697990861973635e-06, "loss": 0.0283, "step": 5944 }, { "epoch": 2.3746006389776357, "grad_norm": 0.8914061929408517, "learning_rate": 1.2667060389449182e-06, "loss": 0.0318, "step": 5946 }, { "epoch": 2.3753993610223643, "grad_norm": 1.1915603298866675, "learning_rate": 1.2636162169062133e-06, "loss": 0.0402, "step": 5948 }, { "epoch": 2.376198083067093, "grad_norm": 0.836740461461381, "learning_rate": 1.260529622750563e-06, "loss": 0.0311, "step": 5950 }, { "epoch": 2.376996805111821, "grad_norm": 0.9210823534934908, "learning_rate": 1.257446259144494e-06, "loss": 0.0314, "step": 5952 }, { "epoch": 2.3777955271565494, "grad_norm": 0.9674053346660237, "learning_rate": 1.2543661287517423e-06, "loss": 0.0301, "step": 5954 }, { "epoch": 2.378594249201278, "grad_norm": 0.9378146372928756, "learning_rate": 1.25128923423325e-06, "loss": 0.028, "step": 5956 }, { "epoch": 2.3793929712460065, "grad_norm": 0.882404097137147, "learning_rate": 1.2482155782471612e-06, "loss": 0.0289, "step": 5958 }, { "epoch": 2.380191693290735, "grad_norm": 1.0244194815400358, "learning_rate": 1.2451451634488264e-06, "loss": 0.0319, "step": 5960 }, { "epoch": 2.380990415335463, "grad_norm": 0.9513392453303519, "learning_rate": 1.242077992490794e-06, "loss": 0.0291, "step": 5962 }, { "epoch": 2.3817891373801916, "grad_norm": 0.8933825432617372, "learning_rate": 1.2390140680228107e-06, "loss": 0.0324, "step": 5964 }, { "epoch": 2.38258785942492, "grad_norm": 0.9739451587254204, "learning_rate": 1.2359533926918193e-06, "loss": 0.0325, "step": 5966 }, { "epoch": 2.3833865814696487, "grad_norm": 1.0284624062364278, "learning_rate": 1.2328959691419517e-06, "loss": 0.0377, "step": 5968 }, { "epoch": 2.384185303514377, "grad_norm": 0.9808325264631864, "learning_rate": 1.2298418000145345e-06, "loss": 0.0311, "step": 5970 }, { "epoch": 2.3849840255591053, "grad_norm": 0.9709468792899423, "learning_rate": 1.2267908879480822e-06, "loss": 0.0333, "step": 5972 }, { "epoch": 2.385782747603834, "grad_norm": 1.033875589705149, "learning_rate": 1.2237432355782947e-06, "loss": 0.0374, "step": 5974 }, { "epoch": 2.3865814696485623, "grad_norm": 1.078337957714353, "learning_rate": 1.2206988455380558e-06, "loss": 0.0326, "step": 5976 }, { "epoch": 2.387380191693291, "grad_norm": 1.0758642600387078, "learning_rate": 1.2176577204574318e-06, "loss": 0.0313, "step": 5978 }, { "epoch": 2.3881789137380194, "grad_norm": 0.901933168392248, "learning_rate": 1.214619862963668e-06, "loss": 0.0319, "step": 5980 }, { "epoch": 2.3889776357827475, "grad_norm": 1.0229488021571116, "learning_rate": 1.2115852756811875e-06, "loss": 0.0367, "step": 5982 }, { "epoch": 2.389776357827476, "grad_norm": 0.8928180495678354, "learning_rate": 1.2085539612315844e-06, "loss": 0.0307, "step": 5984 }, { "epoch": 2.3905750798722045, "grad_norm": 0.9981047761041587, "learning_rate": 1.2055259222336303e-06, "loss": 0.0352, "step": 5986 }, { "epoch": 2.391373801916933, "grad_norm": 0.7912725756162461, "learning_rate": 1.202501161303265e-06, "loss": 0.0272, "step": 5988 }, { "epoch": 2.392172523961661, "grad_norm": 0.9543956236269545, "learning_rate": 1.1994796810535981e-06, "loss": 0.0337, "step": 5990 }, { "epoch": 2.3929712460063897, "grad_norm": 0.8444542368501661, "learning_rate": 1.1964614840949002e-06, "loss": 0.028, "step": 5992 }, { "epoch": 2.393769968051118, "grad_norm": 0.9428028723910442, "learning_rate": 1.1934465730346106e-06, "loss": 0.0373, "step": 5994 }, { "epoch": 2.3945686900958467, "grad_norm": 0.8624521055571825, "learning_rate": 1.1904349504773276e-06, "loss": 0.0317, "step": 5996 }, { "epoch": 2.3953674121405752, "grad_norm": 0.9861824702284993, "learning_rate": 1.1874266190248095e-06, "loss": 0.0334, "step": 5998 }, { "epoch": 2.3961661341853033, "grad_norm": 1.09155387358314, "learning_rate": 1.1844215812759708e-06, "loss": 0.0349, "step": 6000 }, { "epoch": 2.3961661341853033, "eval_loss": 0.1782146841287613, "eval_runtime": 418.6183, "eval_samples_per_second": 42.538, "eval_steps_per_second": 5.317, "step": 6000 }, { "epoch": 2.396964856230032, "grad_norm": 0.8415845821620274, "learning_rate": 1.1814198398268794e-06, "loss": 0.0284, "step": 6002 }, { "epoch": 2.3977635782747604, "grad_norm": 0.8937417948243025, "learning_rate": 1.1784213972707581e-06, "loss": 0.0294, "step": 6004 }, { "epoch": 2.398562300319489, "grad_norm": 0.9462060632845546, "learning_rate": 1.175426256197979e-06, "loss": 0.0339, "step": 6006 }, { "epoch": 2.3993610223642174, "grad_norm": 0.9979933172942926, "learning_rate": 1.1724344191960591e-06, "loss": 0.0329, "step": 6008 }, { "epoch": 2.4001597444089455, "grad_norm": 0.8510571914760195, "learning_rate": 1.169445888849664e-06, "loss": 0.0285, "step": 6010 }, { "epoch": 2.400958466453674, "grad_norm": 0.8796317934735823, "learning_rate": 1.1664606677406025e-06, "loss": 0.0285, "step": 6012 }, { "epoch": 2.4017571884984026, "grad_norm": 1.1059376491146813, "learning_rate": 1.1634787584478257e-06, "loss": 0.0286, "step": 6014 }, { "epoch": 2.402555910543131, "grad_norm": 1.0839859430790064, "learning_rate": 1.1605001635474183e-06, "loss": 0.031, "step": 6016 }, { "epoch": 2.4033546325878596, "grad_norm": 1.0568594830906386, "learning_rate": 1.157524885612607e-06, "loss": 0.0336, "step": 6018 }, { "epoch": 2.4041533546325877, "grad_norm": 0.8544876905209444, "learning_rate": 1.1545529272137496e-06, "loss": 0.0293, "step": 6020 }, { "epoch": 2.4049520766773163, "grad_norm": 0.9202306605617924, "learning_rate": 1.1515842909183422e-06, "loss": 0.0305, "step": 6022 }, { "epoch": 2.405750798722045, "grad_norm": 1.2779812066659912, "learning_rate": 1.1486189792910024e-06, "loss": 0.0327, "step": 6024 }, { "epoch": 2.4065495207667733, "grad_norm": 0.8423188795687913, "learning_rate": 1.1456569948934804e-06, "loss": 0.0323, "step": 6026 }, { "epoch": 2.4073482428115014, "grad_norm": 0.9219710260036055, "learning_rate": 1.142698340284652e-06, "loss": 0.0367, "step": 6028 }, { "epoch": 2.40814696485623, "grad_norm": 0.8836691940601481, "learning_rate": 1.139743018020517e-06, "loss": 0.031, "step": 6030 }, { "epoch": 2.4089456869009584, "grad_norm": 1.009462941023179, "learning_rate": 1.1367910306541918e-06, "loss": 0.0313, "step": 6032 }, { "epoch": 2.409744408945687, "grad_norm": 0.9333397367826699, "learning_rate": 1.133842380735916e-06, "loss": 0.0289, "step": 6034 }, { "epoch": 2.4105431309904155, "grad_norm": 0.9518139468601376, "learning_rate": 1.1308970708130458e-06, "loss": 0.0329, "step": 6036 }, { "epoch": 2.4113418530351436, "grad_norm": 1.0093135425496413, "learning_rate": 1.1279551034300523e-06, "loss": 0.0293, "step": 6038 }, { "epoch": 2.412140575079872, "grad_norm": 0.8766410436250559, "learning_rate": 1.1250164811285148e-06, "loss": 0.0314, "step": 6040 }, { "epoch": 2.4129392971246006, "grad_norm": 0.7888244567724153, "learning_rate": 1.1220812064471248e-06, "loss": 0.0302, "step": 6042 }, { "epoch": 2.413738019169329, "grad_norm": 0.9752687497406552, "learning_rate": 1.119149281921687e-06, "loss": 0.0338, "step": 6044 }, { "epoch": 2.4145367412140573, "grad_norm": 1.015706721424374, "learning_rate": 1.1162207100851069e-06, "loss": 0.0316, "step": 6046 }, { "epoch": 2.415335463258786, "grad_norm": 0.9186099679113181, "learning_rate": 1.1132954934673911e-06, "loss": 0.0348, "step": 6048 }, { "epoch": 2.4161341853035143, "grad_norm": 0.9014882003540872, "learning_rate": 1.110373634595653e-06, "loss": 0.0279, "step": 6050 }, { "epoch": 2.416932907348243, "grad_norm": 0.8554968617937979, "learning_rate": 1.1074551359941022e-06, "loss": 0.0274, "step": 6052 }, { "epoch": 2.4177316293929714, "grad_norm": 1.023254004503027, "learning_rate": 1.1045400001840474e-06, "loss": 0.0346, "step": 6054 }, { "epoch": 2.4185303514377, "grad_norm": 1.0199229730833819, "learning_rate": 1.1016282296838887e-06, "loss": 0.0372, "step": 6056 }, { "epoch": 2.419329073482428, "grad_norm": 0.8141953630986705, "learning_rate": 1.0987198270091225e-06, "loss": 0.0297, "step": 6058 }, { "epoch": 2.4201277955271565, "grad_norm": 0.9824196187903327, "learning_rate": 1.0958147946723341e-06, "loss": 0.0295, "step": 6060 }, { "epoch": 2.420926517571885, "grad_norm": 1.116417391297416, "learning_rate": 1.0929131351831974e-06, "loss": 0.0339, "step": 6062 }, { "epoch": 2.4217252396166136, "grad_norm": 0.9588644560235544, "learning_rate": 1.090014851048473e-06, "loss": 0.0322, "step": 6064 }, { "epoch": 2.4225239616613417, "grad_norm": 0.876473284974182, "learning_rate": 1.0871199447720022e-06, "loss": 0.0337, "step": 6066 }, { "epoch": 2.42332268370607, "grad_norm": 0.8818096541129938, "learning_rate": 1.0842284188547142e-06, "loss": 0.0299, "step": 6068 }, { "epoch": 2.4241214057507987, "grad_norm": 0.9093312218826864, "learning_rate": 1.0813402757946145e-06, "loss": 0.0326, "step": 6070 }, { "epoch": 2.4249201277955272, "grad_norm": 0.8911104729900233, "learning_rate": 1.078455518086784e-06, "loss": 0.0311, "step": 6072 }, { "epoch": 2.4257188498402558, "grad_norm": 0.9634405906730059, "learning_rate": 1.0755741482233822e-06, "loss": 0.0321, "step": 6074 }, { "epoch": 2.426517571884984, "grad_norm": 0.8619867335268869, "learning_rate": 1.0726961686936406e-06, "loss": 0.0274, "step": 6076 }, { "epoch": 2.4273162939297124, "grad_norm": 0.8467036432543594, "learning_rate": 1.069821581983862e-06, "loss": 0.0308, "step": 6078 }, { "epoch": 2.428115015974441, "grad_norm": 0.9553299696013161, "learning_rate": 1.0669503905774198e-06, "loss": 0.0305, "step": 6080 }, { "epoch": 2.4289137380191694, "grad_norm": 0.9434220839422669, "learning_rate": 1.0640825969547498e-06, "loss": 0.032, "step": 6082 }, { "epoch": 2.4297124600638975, "grad_norm": 0.907723226631111, "learning_rate": 1.061218203593356e-06, "loss": 0.0292, "step": 6084 }, { "epoch": 2.430511182108626, "grad_norm": 1.0082338096229233, "learning_rate": 1.0583572129678043e-06, "loss": 0.0345, "step": 6086 }, { "epoch": 2.4313099041533546, "grad_norm": 0.9128983473815063, "learning_rate": 1.055499627549722e-06, "loss": 0.0312, "step": 6088 }, { "epoch": 2.432108626198083, "grad_norm": 0.9047232760560915, "learning_rate": 1.0526454498077892e-06, "loss": 0.0333, "step": 6090 }, { "epoch": 2.4329073482428116, "grad_norm": 0.9748694127638722, "learning_rate": 1.0497946822077504e-06, "loss": 0.0301, "step": 6092 }, { "epoch": 2.43370607028754, "grad_norm": 0.9900482589709648, "learning_rate": 1.0469473272123998e-06, "loss": 0.0312, "step": 6094 }, { "epoch": 2.4345047923322682, "grad_norm": 1.0325102686584386, "learning_rate": 1.0441033872815804e-06, "loss": 0.0334, "step": 6096 }, { "epoch": 2.4353035143769968, "grad_norm": 0.8638115070891277, "learning_rate": 1.0412628648721895e-06, "loss": 0.0296, "step": 6098 }, { "epoch": 2.4361022364217253, "grad_norm": 1.1255119248973784, "learning_rate": 1.0384257624381705e-06, "loss": 0.0305, "step": 6100 }, { "epoch": 2.436900958466454, "grad_norm": 0.9446843609413517, "learning_rate": 1.0355920824305127e-06, "loss": 0.0323, "step": 6102 }, { "epoch": 2.437699680511182, "grad_norm": 0.9313934818835989, "learning_rate": 1.0327618272972484e-06, "loss": 0.0322, "step": 6104 }, { "epoch": 2.4384984025559104, "grad_norm": 1.1986399439946025, "learning_rate": 1.0299349994834497e-06, "loss": 0.0369, "step": 6106 }, { "epoch": 2.439297124600639, "grad_norm": 1.0849811906791489, "learning_rate": 1.0271116014312293e-06, "loss": 0.0274, "step": 6108 }, { "epoch": 2.4400958466453675, "grad_norm": 0.9885919632765612, "learning_rate": 1.0242916355797372e-06, "loss": 0.0294, "step": 6110 }, { "epoch": 2.440894568690096, "grad_norm": 1.1666414296370116, "learning_rate": 1.0214751043651582e-06, "loss": 0.0327, "step": 6112 }, { "epoch": 2.441693290734824, "grad_norm": 1.0373974615204293, "learning_rate": 1.018662010220709e-06, "loss": 0.0345, "step": 6114 }, { "epoch": 2.4424920127795526, "grad_norm": 0.8806182333785598, "learning_rate": 1.0158523555766375e-06, "loss": 0.03, "step": 6116 }, { "epoch": 2.443290734824281, "grad_norm": 0.8377676265174371, "learning_rate": 1.0130461428602206e-06, "loss": 0.0286, "step": 6118 }, { "epoch": 2.4440894568690097, "grad_norm": 0.9542688751670757, "learning_rate": 1.010243374495763e-06, "loss": 0.0296, "step": 6120 }, { "epoch": 2.4448881789137378, "grad_norm": 0.845828630003871, "learning_rate": 1.0074440529045882e-06, "loss": 0.0299, "step": 6122 }, { "epoch": 2.4456869009584663, "grad_norm": 0.838659411317932, "learning_rate": 1.0046481805050484e-06, "loss": 0.0286, "step": 6124 }, { "epoch": 2.446485623003195, "grad_norm": 0.964366696594686, "learning_rate": 1.001855759712513e-06, "loss": 0.0309, "step": 6126 }, { "epoch": 2.4472843450479234, "grad_norm": 1.0550865306282715, "learning_rate": 9.990667929393715e-07, "loss": 0.0357, "step": 6128 }, { "epoch": 2.448083067092652, "grad_norm": 0.9665555146057448, "learning_rate": 9.962812825950252e-07, "loss": 0.0334, "step": 6130 }, { "epoch": 2.4488817891373804, "grad_norm": 1.1013658647610531, "learning_rate": 9.934992310858944e-07, "loss": 0.0358, "step": 6132 }, { "epoch": 2.4496805111821085, "grad_norm": 0.9451008760428278, "learning_rate": 9.90720640815408e-07, "loss": 0.0319, "step": 6134 }, { "epoch": 2.450479233226837, "grad_norm": 0.9344422669847151, "learning_rate": 9.879455141840067e-07, "loss": 0.0298, "step": 6136 }, { "epoch": 2.4512779552715656, "grad_norm": 0.8508316348278806, "learning_rate": 9.851738535891375e-07, "loss": 0.0266, "step": 6138 }, { "epoch": 2.452076677316294, "grad_norm": 0.9862824857730245, "learning_rate": 9.824056614252542e-07, "loss": 0.0314, "step": 6140 }, { "epoch": 2.452875399361022, "grad_norm": 0.8320616360939587, "learning_rate": 9.79640940083813e-07, "loss": 0.0299, "step": 6142 }, { "epoch": 2.4536741214057507, "grad_norm": 0.9792614234160211, "learning_rate": 9.768796919532742e-07, "loss": 0.0314, "step": 6144 }, { "epoch": 2.4544728434504792, "grad_norm": 0.9293003315650749, "learning_rate": 9.741219194190925e-07, "loss": 0.0338, "step": 6146 }, { "epoch": 2.4552715654952078, "grad_norm": 0.9682041377572445, "learning_rate": 9.71367624863725e-07, "loss": 0.0329, "step": 6148 }, { "epoch": 2.4560702875399363, "grad_norm": 0.8325832618656944, "learning_rate": 9.686168106666216e-07, "loss": 0.031, "step": 6150 }, { "epoch": 2.4568690095846644, "grad_norm": 0.9350572549104323, "learning_rate": 9.658694792042284e-07, "loss": 0.0288, "step": 6152 }, { "epoch": 2.457667731629393, "grad_norm": 0.8277288114049663, "learning_rate": 9.631256328499772e-07, "loss": 0.031, "step": 6154 }, { "epoch": 2.4584664536741214, "grad_norm": 0.9213528535489943, "learning_rate": 9.603852739742941e-07, "loss": 0.0284, "step": 6156 }, { "epoch": 2.45926517571885, "grad_norm": 0.8009301927639629, "learning_rate": 9.576484049445895e-07, "loss": 0.029, "step": 6158 }, { "epoch": 2.460063897763578, "grad_norm": 0.8921062677918278, "learning_rate": 9.549150281252633e-07, "loss": 0.0354, "step": 6160 }, { "epoch": 2.4608626198083066, "grad_norm": 0.8850346677550753, "learning_rate": 9.521851458776915e-07, "loss": 0.0297, "step": 6162 }, { "epoch": 2.461661341853035, "grad_norm": 0.9779735465184336, "learning_rate": 9.494587605602368e-07, "loss": 0.0289, "step": 6164 }, { "epoch": 2.4624600638977636, "grad_norm": 0.9498573454203734, "learning_rate": 9.467358745282379e-07, "loss": 0.0279, "step": 6166 }, { "epoch": 2.463258785942492, "grad_norm": 0.9196211437633159, "learning_rate": 9.440164901340127e-07, "loss": 0.0311, "step": 6168 }, { "epoch": 2.4640575079872207, "grad_norm": 0.8765231457695842, "learning_rate": 9.413006097268512e-07, "loss": 0.0243, "step": 6170 }, { "epoch": 2.4648562300319488, "grad_norm": 0.853315794604095, "learning_rate": 9.385882356530179e-07, "loss": 0.026, "step": 6172 }, { "epoch": 2.4656549520766773, "grad_norm": 0.9844044641457295, "learning_rate": 9.358793702557489e-07, "loss": 0.0293, "step": 6174 }, { "epoch": 2.466453674121406, "grad_norm": 0.9636119419245541, "learning_rate": 9.331740158752495e-07, "loss": 0.0334, "step": 6176 }, { "epoch": 2.4672523961661343, "grad_norm": 1.1029243201298184, "learning_rate": 9.304721748486878e-07, "loss": 0.0344, "step": 6178 }, { "epoch": 2.4680511182108624, "grad_norm": 0.9591374043106669, "learning_rate": 9.277738495102012e-07, "loss": 0.0308, "step": 6180 }, { "epoch": 2.468849840255591, "grad_norm": 0.9656785120322073, "learning_rate": 9.250790421908862e-07, "loss": 0.0261, "step": 6182 }, { "epoch": 2.4696485623003195, "grad_norm": 1.0686970243259972, "learning_rate": 9.223877552188065e-07, "loss": 0.0325, "step": 6184 }, { "epoch": 2.470447284345048, "grad_norm": 0.9422767101505445, "learning_rate": 9.196999909189764e-07, "loss": 0.0293, "step": 6186 }, { "epoch": 2.4712460063897765, "grad_norm": 0.9128608441727557, "learning_rate": 9.17015751613371e-07, "loss": 0.0278, "step": 6188 }, { "epoch": 2.4720447284345046, "grad_norm": 1.0182227621831177, "learning_rate": 9.14335039620921e-07, "loss": 0.0291, "step": 6190 }, { "epoch": 2.472843450479233, "grad_norm": 1.0323032210841876, "learning_rate": 9.116578572575091e-07, "loss": 0.0381, "step": 6192 }, { "epoch": 2.4736421725239617, "grad_norm": 1.007977038062956, "learning_rate": 9.089842068359661e-07, "loss": 0.0328, "step": 6194 }, { "epoch": 2.47444089456869, "grad_norm": 1.0276724023266843, "learning_rate": 9.06314090666075e-07, "loss": 0.0353, "step": 6196 }, { "epoch": 2.4752396166134183, "grad_norm": 1.033358203994531, "learning_rate": 9.03647511054564e-07, "loss": 0.0291, "step": 6198 }, { "epoch": 2.476038338658147, "grad_norm": 0.8075474991970624, "learning_rate": 9.009844703051063e-07, "loss": 0.0276, "step": 6200 }, { "epoch": 2.4768370607028753, "grad_norm": 0.9186532720625925, "learning_rate": 8.98324970718319e-07, "loss": 0.0283, "step": 6202 }, { "epoch": 2.477635782747604, "grad_norm": 0.9745581052474273, "learning_rate": 8.956690145917557e-07, "loss": 0.0337, "step": 6204 }, { "epoch": 2.4784345047923324, "grad_norm": 0.8345103534692212, "learning_rate": 8.930166042199146e-07, "loss": 0.0256, "step": 6206 }, { "epoch": 2.479233226837061, "grad_norm": 0.9005598945512714, "learning_rate": 8.903677418942292e-07, "loss": 0.0285, "step": 6208 }, { "epoch": 2.480031948881789, "grad_norm": 1.0271916802595848, "learning_rate": 8.877224299030629e-07, "loss": 0.0317, "step": 6210 }, { "epoch": 2.4808306709265175, "grad_norm": 1.075693558366458, "learning_rate": 8.850806705317183e-07, "loss": 0.0368, "step": 6212 }, { "epoch": 2.481629392971246, "grad_norm": 0.8722330378555192, "learning_rate": 8.824424660624247e-07, "loss": 0.029, "step": 6214 }, { "epoch": 2.4824281150159746, "grad_norm": 1.0601895552621947, "learning_rate": 8.79807818774343e-07, "loss": 0.0293, "step": 6216 }, { "epoch": 2.4832268370607027, "grad_norm": 0.8707900195092941, "learning_rate": 8.771767309435614e-07, "loss": 0.0304, "step": 6218 }, { "epoch": 2.484025559105431, "grad_norm": 0.9733820547813553, "learning_rate": 8.745492048430876e-07, "loss": 0.0343, "step": 6220 }, { "epoch": 2.4848242811501597, "grad_norm": 0.9317695338486248, "learning_rate": 8.719252427428582e-07, "loss": 0.0303, "step": 6222 }, { "epoch": 2.4856230031948883, "grad_norm": 1.033404563883472, "learning_rate": 8.693048469097293e-07, "loss": 0.0325, "step": 6224 }, { "epoch": 2.486421725239617, "grad_norm": 0.9136265608554973, "learning_rate": 8.666880196074767e-07, "loss": 0.0308, "step": 6226 }, { "epoch": 2.487220447284345, "grad_norm": 1.1596479673050444, "learning_rate": 8.640747630967883e-07, "loss": 0.0294, "step": 6228 }, { "epoch": 2.4880191693290734, "grad_norm": 1.0426530374304792, "learning_rate": 8.614650796352747e-07, "loss": 0.0306, "step": 6230 }, { "epoch": 2.488817891373802, "grad_norm": 0.9242364015680927, "learning_rate": 8.58858971477457e-07, "loss": 0.0327, "step": 6232 }, { "epoch": 2.4896166134185305, "grad_norm": 0.9313580110161238, "learning_rate": 8.562564408747637e-07, "loss": 0.0306, "step": 6234 }, { "epoch": 2.4904153354632586, "grad_norm": 1.067403189080394, "learning_rate": 8.536574900755367e-07, "loss": 0.0317, "step": 6236 }, { "epoch": 2.491214057507987, "grad_norm": 0.9285372176981973, "learning_rate": 8.510621213250248e-07, "loss": 0.0305, "step": 6238 }, { "epoch": 2.4920127795527156, "grad_norm": 0.8346074025633845, "learning_rate": 8.484703368653812e-07, "loss": 0.0273, "step": 6240 }, { "epoch": 2.492811501597444, "grad_norm": 0.9152125909961968, "learning_rate": 8.458821389356647e-07, "loss": 0.0303, "step": 6242 }, { "epoch": 2.4936102236421727, "grad_norm": 0.9838042565841876, "learning_rate": 8.432975297718321e-07, "loss": 0.0299, "step": 6244 }, { "epoch": 2.494408945686901, "grad_norm": 0.9237463657264794, "learning_rate": 8.407165116067423e-07, "loss": 0.0304, "step": 6246 }, { "epoch": 2.4952076677316293, "grad_norm": 1.1889649482379565, "learning_rate": 8.381390866701517e-07, "loss": 0.0354, "step": 6248 }, { "epoch": 2.496006389776358, "grad_norm": 0.9830687823209033, "learning_rate": 8.355652571887135e-07, "loss": 0.0317, "step": 6250 }, { "epoch": 2.4968051118210863, "grad_norm": 0.8992861344610856, "learning_rate": 8.329950253859703e-07, "loss": 0.0299, "step": 6252 }, { "epoch": 2.497603833865815, "grad_norm": 1.0201169404321446, "learning_rate": 8.304283934823626e-07, "loss": 0.0337, "step": 6254 }, { "epoch": 2.498402555910543, "grad_norm": 0.996273309697261, "learning_rate": 8.278653636952177e-07, "loss": 0.0309, "step": 6256 }, { "epoch": 2.4992012779552715, "grad_norm": 0.8898734510036184, "learning_rate": 8.25305938238753e-07, "loss": 0.031, "step": 6258 }, { "epoch": 2.5, "grad_norm": 0.8809017767144187, "learning_rate": 8.227501193240673e-07, "loss": 0.0344, "step": 6260 }, { "epoch": 2.5007987220447285, "grad_norm": 0.9543528237861113, "learning_rate": 8.201979091591488e-07, "loss": 0.0282, "step": 6262 }, { "epoch": 2.501597444089457, "grad_norm": 1.111241667863821, "learning_rate": 8.176493099488664e-07, "loss": 0.0358, "step": 6264 }, { "epoch": 2.502396166134185, "grad_norm": 0.869787895901821, "learning_rate": 8.151043238949697e-07, "loss": 0.0272, "step": 6266 }, { "epoch": 2.5031948881789137, "grad_norm": 0.9762765697943674, "learning_rate": 8.125629531960849e-07, "loss": 0.0299, "step": 6268 }, { "epoch": 2.503993610223642, "grad_norm": 0.9008988645129132, "learning_rate": 8.100252000477177e-07, "loss": 0.0295, "step": 6270 }, { "epoch": 2.5047923322683707, "grad_norm": 0.9858370192539189, "learning_rate": 8.074910666422475e-07, "loss": 0.0316, "step": 6272 }, { "epoch": 2.505591054313099, "grad_norm": 0.9950028382350193, "learning_rate": 8.049605551689255e-07, "loss": 0.0314, "step": 6274 }, { "epoch": 2.5063897763578273, "grad_norm": 0.8706385388900013, "learning_rate": 8.024336678138761e-07, "loss": 0.03, "step": 6276 }, { "epoch": 2.507188498402556, "grad_norm": 0.9317584954844236, "learning_rate": 7.999104067600904e-07, "loss": 0.0319, "step": 6278 }, { "epoch": 2.5079872204472844, "grad_norm": 0.9028386865531697, "learning_rate": 7.973907741874287e-07, "loss": 0.0319, "step": 6280 }, { "epoch": 2.508785942492013, "grad_norm": 1.0429573535057794, "learning_rate": 7.948747722726169e-07, "loss": 0.0362, "step": 6282 }, { "epoch": 2.5095846645367414, "grad_norm": 0.9075149519955287, "learning_rate": 7.923624031892402e-07, "loss": 0.0297, "step": 6284 }, { "epoch": 2.5103833865814695, "grad_norm": 0.9244441640851673, "learning_rate": 7.898536691077508e-07, "loss": 0.0356, "step": 6286 }, { "epoch": 2.511182108626198, "grad_norm": 0.9718862084825739, "learning_rate": 7.873485721954572e-07, "loss": 0.0327, "step": 6288 }, { "epoch": 2.5119808306709266, "grad_norm": 0.8694408724126972, "learning_rate": 7.848471146165287e-07, "loss": 0.0297, "step": 6290 }, { "epoch": 2.512779552715655, "grad_norm": 0.8938047550855578, "learning_rate": 7.823492985319858e-07, "loss": 0.0275, "step": 6292 }, { "epoch": 2.513578274760383, "grad_norm": 0.8065785132438869, "learning_rate": 7.798551260997067e-07, "loss": 0.0312, "step": 6294 }, { "epoch": 2.5143769968051117, "grad_norm": 0.929264264045834, "learning_rate": 7.773645994744222e-07, "loss": 0.0256, "step": 6296 }, { "epoch": 2.5151757188498403, "grad_norm": 1.3557337175166702, "learning_rate": 7.748777208077118e-07, "loss": 0.0288, "step": 6298 }, { "epoch": 2.515974440894569, "grad_norm": 1.1432470693302257, "learning_rate": 7.723944922480037e-07, "loss": 0.0343, "step": 6300 }, { "epoch": 2.5167731629392973, "grad_norm": 1.1285169628245737, "learning_rate": 7.699149159405734e-07, "loss": 0.0344, "step": 6302 }, { "epoch": 2.5175718849840254, "grad_norm": 0.9264387543909826, "learning_rate": 7.674389940275406e-07, "loss": 0.0299, "step": 6304 }, { "epoch": 2.518370607028754, "grad_norm": 0.9885710066492656, "learning_rate": 7.649667286478696e-07, "loss": 0.0349, "step": 6306 }, { "epoch": 2.5191693290734825, "grad_norm": 0.8115719655821809, "learning_rate": 7.624981219373623e-07, "loss": 0.0301, "step": 6308 }, { "epoch": 2.519968051118211, "grad_norm": 0.8265082284522801, "learning_rate": 7.600331760286627e-07, "loss": 0.0291, "step": 6310 }, { "epoch": 2.520766773162939, "grad_norm": 0.8510973944883193, "learning_rate": 7.575718930512516e-07, "loss": 0.0259, "step": 6312 }, { "epoch": 2.5215654952076676, "grad_norm": 0.9955720237715131, "learning_rate": 7.551142751314455e-07, "loss": 0.0309, "step": 6314 }, { "epoch": 2.522364217252396, "grad_norm": 0.94544787132954, "learning_rate": 7.526603243923958e-07, "loss": 0.0278, "step": 6316 }, { "epoch": 2.5231629392971247, "grad_norm": 0.7689821541255585, "learning_rate": 7.502100429540815e-07, "loss": 0.0283, "step": 6318 }, { "epoch": 2.523961661341853, "grad_norm": 0.8326325372584937, "learning_rate": 7.47763432933315e-07, "loss": 0.0288, "step": 6320 }, { "epoch": 2.5247603833865817, "grad_norm": 0.8468951802399899, "learning_rate": 7.453204964437394e-07, "loss": 0.0261, "step": 6322 }, { "epoch": 2.52555910543131, "grad_norm": 0.9023769818457569, "learning_rate": 7.428812355958181e-07, "loss": 0.0316, "step": 6324 }, { "epoch": 2.5263578274760383, "grad_norm": 0.7923001953812997, "learning_rate": 7.404456524968445e-07, "loss": 0.0246, "step": 6326 }, { "epoch": 2.527156549520767, "grad_norm": 0.8829690105277411, "learning_rate": 7.380137492509309e-07, "loss": 0.0262, "step": 6328 }, { "epoch": 2.527955271565495, "grad_norm": 0.7804684297568552, "learning_rate": 7.355855279590146e-07, "loss": 0.0243, "step": 6330 }, { "epoch": 2.5287539936102235, "grad_norm": 1.0507641034180348, "learning_rate": 7.33160990718847e-07, "loss": 0.0318, "step": 6332 }, { "epoch": 2.529552715654952, "grad_norm": 0.9578888020234547, "learning_rate": 7.307401396250008e-07, "loss": 0.0255, "step": 6334 }, { "epoch": 2.5303514376996805, "grad_norm": 0.9269807171184239, "learning_rate": 7.283229767688627e-07, "loss": 0.0308, "step": 6336 }, { "epoch": 2.531150159744409, "grad_norm": 0.8790871744926378, "learning_rate": 7.259095042386338e-07, "loss": 0.0301, "step": 6338 }, { "epoch": 2.5319488817891376, "grad_norm": 0.9895066251185489, "learning_rate": 7.23499724119327e-07, "loss": 0.0291, "step": 6340 }, { "epoch": 2.5327476038338657, "grad_norm": 0.8899644518567821, "learning_rate": 7.210936384927631e-07, "loss": 0.0311, "step": 6342 }, { "epoch": 2.533546325878594, "grad_norm": 1.0693235257658584, "learning_rate": 7.186912494375736e-07, "loss": 0.0323, "step": 6344 }, { "epoch": 2.5343450479233227, "grad_norm": 1.044274627458564, "learning_rate": 7.162925590291986e-07, "loss": 0.0294, "step": 6346 }, { "epoch": 2.5351437699680512, "grad_norm": 0.9703071429232554, "learning_rate": 7.13897569339877e-07, "loss": 0.0292, "step": 6348 }, { "epoch": 2.5359424920127793, "grad_norm": 0.9567409636353577, "learning_rate": 7.115062824386554e-07, "loss": 0.0262, "step": 6350 }, { "epoch": 2.536741214057508, "grad_norm": 0.8461463315662133, "learning_rate": 7.091187003913802e-07, "loss": 0.0298, "step": 6352 }, { "epoch": 2.5375399361022364, "grad_norm": 0.9718174385536262, "learning_rate": 7.067348252606965e-07, "loss": 0.031, "step": 6354 }, { "epoch": 2.538338658146965, "grad_norm": 0.8730259562016701, "learning_rate": 7.043546591060485e-07, "loss": 0.0299, "step": 6356 }, { "epoch": 2.5391373801916934, "grad_norm": 0.9333674604962541, "learning_rate": 7.019782039836737e-07, "loss": 0.0295, "step": 6358 }, { "epoch": 2.539936102236422, "grad_norm": 0.888933835128418, "learning_rate": 6.996054619466053e-07, "loss": 0.0322, "step": 6360 }, { "epoch": 2.54073482428115, "grad_norm": 0.9530139786909172, "learning_rate": 6.972364350446698e-07, "loss": 0.032, "step": 6362 }, { "epoch": 2.5415335463258786, "grad_norm": 0.8887252902742203, "learning_rate": 6.948711253244827e-07, "loss": 0.0312, "step": 6364 }, { "epoch": 2.542332268370607, "grad_norm": 0.8352734988241693, "learning_rate": 6.92509534829447e-07, "loss": 0.0272, "step": 6366 }, { "epoch": 2.543130990415335, "grad_norm": 0.9214285326384319, "learning_rate": 6.901516655997536e-07, "loss": 0.0279, "step": 6368 }, { "epoch": 2.5439297124600637, "grad_norm": 1.0207519839454122, "learning_rate": 6.877975196723824e-07, "loss": 0.0295, "step": 6370 }, { "epoch": 2.5447284345047922, "grad_norm": 1.0846736644854258, "learning_rate": 6.854470990810907e-07, "loss": 0.0371, "step": 6372 }, { "epoch": 2.5455271565495208, "grad_norm": 0.929449666083272, "learning_rate": 6.831004058564211e-07, "loss": 0.03, "step": 6374 }, { "epoch": 2.5463258785942493, "grad_norm": 0.9865284947555943, "learning_rate": 6.80757442025694e-07, "loss": 0.0311, "step": 6376 }, { "epoch": 2.547124600638978, "grad_norm": 0.9826417414313666, "learning_rate": 6.784182096130104e-07, "loss": 0.0285, "step": 6378 }, { "epoch": 2.547923322683706, "grad_norm": 0.9343640996748256, "learning_rate": 6.76082710639247e-07, "loss": 0.0292, "step": 6380 }, { "epoch": 2.5487220447284344, "grad_norm": 0.9782586764772694, "learning_rate": 6.737509471220527e-07, "loss": 0.0327, "step": 6382 }, { "epoch": 2.549520766773163, "grad_norm": 0.9736964982154661, "learning_rate": 6.714229210758516e-07, "loss": 0.0292, "step": 6384 }, { "epoch": 2.5503194888178915, "grad_norm": 0.9366260036987168, "learning_rate": 6.690986345118389e-07, "loss": 0.0308, "step": 6386 }, { "epoch": 2.5511182108626196, "grad_norm": 1.052050189264594, "learning_rate": 6.667780894379799e-07, "loss": 0.0308, "step": 6388 }, { "epoch": 2.551916932907348, "grad_norm": 0.9827812854065209, "learning_rate": 6.644612878590034e-07, "loss": 0.0316, "step": 6390 }, { "epoch": 2.5527156549520766, "grad_norm": 0.8386586055394432, "learning_rate": 6.621482317764105e-07, "loss": 0.0305, "step": 6392 }, { "epoch": 2.553514376996805, "grad_norm": 0.9894645955986272, "learning_rate": 6.598389231884628e-07, "loss": 0.0297, "step": 6394 }, { "epoch": 2.5543130990415337, "grad_norm": 0.8839682059943562, "learning_rate": 6.575333640901855e-07, "loss": 0.0285, "step": 6396 }, { "epoch": 2.5551118210862622, "grad_norm": 0.8887779152215409, "learning_rate": 6.552315564733625e-07, "loss": 0.0285, "step": 6398 }, { "epoch": 2.5559105431309903, "grad_norm": 1.0490578369091348, "learning_rate": 6.529335023265387e-07, "loss": 0.0289, "step": 6400 }, { "epoch": 2.556709265175719, "grad_norm": 1.0894646563356374, "learning_rate": 6.506392036350168e-07, "loss": 0.0322, "step": 6402 }, { "epoch": 2.5575079872204474, "grad_norm": 0.9316306126563935, "learning_rate": 6.483486623808555e-07, "loss": 0.0282, "step": 6404 }, { "epoch": 2.5583067092651754, "grad_norm": 1.0138875975414274, "learning_rate": 6.460618805428637e-07, "loss": 0.029, "step": 6406 }, { "epoch": 2.559105431309904, "grad_norm": 0.9409011790958446, "learning_rate": 6.437788600966066e-07, "loss": 0.0275, "step": 6408 }, { "epoch": 2.5599041533546325, "grad_norm": 1.0950975759175423, "learning_rate": 6.414996030143982e-07, "loss": 0.026, "step": 6410 }, { "epoch": 2.560702875399361, "grad_norm": 0.8621907379188767, "learning_rate": 6.392241112653031e-07, "loss": 0.0263, "step": 6412 }, { "epoch": 2.5615015974440896, "grad_norm": 1.0987324235841187, "learning_rate": 6.369523868151278e-07, "loss": 0.03, "step": 6414 }, { "epoch": 2.562300319488818, "grad_norm": 1.0180249667885073, "learning_rate": 6.346844316264312e-07, "loss": 0.0337, "step": 6416 }, { "epoch": 2.563099041533546, "grad_norm": 0.918284409010965, "learning_rate": 6.324202476585112e-07, "loss": 0.0293, "step": 6418 }, { "epoch": 2.5638977635782747, "grad_norm": 0.8900922129741325, "learning_rate": 6.301598368674106e-07, "loss": 0.0262, "step": 6420 }, { "epoch": 2.5646964856230032, "grad_norm": 0.8698502363272664, "learning_rate": 6.279032012059089e-07, "loss": 0.0279, "step": 6422 }, { "epoch": 2.5654952076677318, "grad_norm": 0.7647567338632194, "learning_rate": 6.256503426235277e-07, "loss": 0.0259, "step": 6424 }, { "epoch": 2.56629392971246, "grad_norm": 0.9236140855263162, "learning_rate": 6.234012630665237e-07, "loss": 0.0309, "step": 6426 }, { "epoch": 2.5670926517571884, "grad_norm": 0.9418851113359151, "learning_rate": 6.211559644778908e-07, "loss": 0.0287, "step": 6428 }, { "epoch": 2.567891373801917, "grad_norm": 0.9410493586581912, "learning_rate": 6.189144487973531e-07, "loss": 0.0245, "step": 6430 }, { "epoch": 2.5686900958466454, "grad_norm": 0.8715415469545029, "learning_rate": 6.166767179613691e-07, "loss": 0.0272, "step": 6432 }, { "epoch": 2.569488817891374, "grad_norm": 1.0011250141948451, "learning_rate": 6.144427739031284e-07, "loss": 0.0336, "step": 6434 }, { "epoch": 2.5702875399361025, "grad_norm": 0.8930124959505643, "learning_rate": 6.122126185525462e-07, "loss": 0.0287, "step": 6436 }, { "epoch": 2.5710862619808306, "grad_norm": 0.8345328277096289, "learning_rate": 6.099862538362678e-07, "loss": 0.0274, "step": 6438 }, { "epoch": 2.571884984025559, "grad_norm": 0.8429043604050549, "learning_rate": 6.077636816776611e-07, "loss": 0.0259, "step": 6440 }, { "epoch": 2.5726837060702876, "grad_norm": 0.9282997534416909, "learning_rate": 6.055449039968197e-07, "loss": 0.0308, "step": 6442 }, { "epoch": 2.5734824281150157, "grad_norm": 1.078810276103343, "learning_rate": 6.033299227105588e-07, "loss": 0.03, "step": 6444 }, { "epoch": 2.5742811501597442, "grad_norm": 0.9006283103160382, "learning_rate": 6.011187397324114e-07, "loss": 0.0277, "step": 6446 }, { "epoch": 2.5750798722044728, "grad_norm": 1.0725617589003342, "learning_rate": 5.989113569726312e-07, "loss": 0.0312, "step": 6448 }, { "epoch": 2.5758785942492013, "grad_norm": 1.0044942875803178, "learning_rate": 5.967077763381895e-07, "loss": 0.0269, "step": 6450 }, { "epoch": 2.57667731629393, "grad_norm": 1.0483372526983028, "learning_rate": 5.945079997327713e-07, "loss": 0.0303, "step": 6452 }, { "epoch": 2.5774760383386583, "grad_norm": 0.885373336335105, "learning_rate": 5.923120290567779e-07, "loss": 0.0292, "step": 6454 }, { "epoch": 2.5782747603833864, "grad_norm": 0.9919843795418669, "learning_rate": 5.901198662073188e-07, "loss": 0.0372, "step": 6456 }, { "epoch": 2.579073482428115, "grad_norm": 1.300703442861457, "learning_rate": 5.87931513078216e-07, "loss": 0.0246, "step": 6458 }, { "epoch": 2.5798722044728435, "grad_norm": 1.0888147527555645, "learning_rate": 5.8574697156e-07, "loss": 0.0368, "step": 6460 }, { "epoch": 2.580670926517572, "grad_norm": 1.183957007842984, "learning_rate": 5.835662435399098e-07, "loss": 0.0345, "step": 6462 }, { "epoch": 2.5814696485623, "grad_norm": 0.9387448812321908, "learning_rate": 5.813893309018881e-07, "loss": 0.026, "step": 6464 }, { "epoch": 2.5822683706070286, "grad_norm": 0.9875605986563711, "learning_rate": 5.792162355265812e-07, "loss": 0.0323, "step": 6466 }, { "epoch": 2.583067092651757, "grad_norm": 0.8843843609089413, "learning_rate": 5.770469592913408e-07, "loss": 0.029, "step": 6468 }, { "epoch": 2.5838658146964857, "grad_norm": 0.9754923443275886, "learning_rate": 5.748815040702138e-07, "loss": 0.0307, "step": 6470 }, { "epoch": 2.584664536741214, "grad_norm": 1.005644213615307, "learning_rate": 5.727198717339511e-07, "loss": 0.0324, "step": 6472 }, { "epoch": 2.5854632587859427, "grad_norm": 0.9932008626732668, "learning_rate": 5.705620641499981e-07, "loss": 0.0306, "step": 6474 }, { "epoch": 2.586261980830671, "grad_norm": 0.9873244519751645, "learning_rate": 5.684080831824978e-07, "loss": 0.0331, "step": 6476 }, { "epoch": 2.5870607028753994, "grad_norm": 0.9241878530410826, "learning_rate": 5.662579306922872e-07, "loss": 0.0322, "step": 6478 }, { "epoch": 2.587859424920128, "grad_norm": 1.0995821963687364, "learning_rate": 5.641116085368931e-07, "loss": 0.0308, "step": 6480 }, { "epoch": 2.588658146964856, "grad_norm": 0.8378812796834532, "learning_rate": 5.619691185705356e-07, "loss": 0.0285, "step": 6482 }, { "epoch": 2.5894568690095845, "grad_norm": 0.8605425287906566, "learning_rate": 5.598304626441264e-07, "loss": 0.0251, "step": 6484 }, { "epoch": 2.590255591054313, "grad_norm": 1.0050366580290326, "learning_rate": 5.576956426052605e-07, "loss": 0.0287, "step": 6486 }, { "epoch": 2.5910543130990416, "grad_norm": 1.065340889189702, "learning_rate": 5.555646602982207e-07, "loss": 0.0336, "step": 6488 }, { "epoch": 2.59185303514377, "grad_norm": 0.9304491062820247, "learning_rate": 5.53437517563975e-07, "loss": 0.0282, "step": 6490 }, { "epoch": 2.5926517571884986, "grad_norm": 0.9396575473059782, "learning_rate": 5.513142162401746e-07, "loss": 0.0258, "step": 6492 }, { "epoch": 2.5934504792332267, "grad_norm": 0.9393766036210395, "learning_rate": 5.491947581611517e-07, "loss": 0.0266, "step": 6494 }, { "epoch": 2.594249201277955, "grad_norm": 0.9435384740988189, "learning_rate": 5.470791451579172e-07, "loss": 0.0317, "step": 6496 }, { "epoch": 2.5950479233226837, "grad_norm": 0.9321052021346842, "learning_rate": 5.449673790581611e-07, "loss": 0.0299, "step": 6498 }, { "epoch": 2.5958466453674123, "grad_norm": 0.9112996056101765, "learning_rate": 5.428594616862504e-07, "loss": 0.0269, "step": 6500 }, { "epoch": 2.5958466453674123, "eval_loss": 0.18101496994495392, "eval_runtime": 417.6423, "eval_samples_per_second": 42.637, "eval_steps_per_second": 5.33, "step": 6500 }, { "epoch": 2.5966453674121404, "grad_norm": 0.9226194319100446, "learning_rate": 5.407553948632277e-07, "loss": 0.0304, "step": 6502 }, { "epoch": 2.597444089456869, "grad_norm": 0.9405971080377886, "learning_rate": 5.386551804068063e-07, "loss": 0.0314, "step": 6504 }, { "epoch": 2.5982428115015974, "grad_norm": 0.9385959964411323, "learning_rate": 5.365588201313737e-07, "loss": 0.0297, "step": 6506 }, { "epoch": 2.599041533546326, "grad_norm": 0.84541282801553, "learning_rate": 5.344663158479901e-07, "loss": 0.0268, "step": 6508 }, { "epoch": 2.5998402555910545, "grad_norm": 0.9713712244776086, "learning_rate": 5.323776693643784e-07, "loss": 0.0306, "step": 6510 }, { "epoch": 2.600638977635783, "grad_norm": 0.8383820244075577, "learning_rate": 5.302928824849335e-07, "loss": 0.0265, "step": 6512 }, { "epoch": 2.601437699680511, "grad_norm": 1.124289996932428, "learning_rate": 5.282119570107147e-07, "loss": 0.0322, "step": 6514 }, { "epoch": 2.6022364217252396, "grad_norm": 0.9423889108993881, "learning_rate": 5.261348947394451e-07, "loss": 0.0281, "step": 6516 }, { "epoch": 2.603035143769968, "grad_norm": 0.9129093506596754, "learning_rate": 5.240616974655116e-07, "loss": 0.0279, "step": 6518 }, { "epoch": 2.6038338658146962, "grad_norm": 0.7842673666707356, "learning_rate": 5.219923669799587e-07, "loss": 0.0241, "step": 6520 }, { "epoch": 2.6046325878594248, "grad_norm": 1.1436169228498474, "learning_rate": 5.199269050704935e-07, "loss": 0.0325, "step": 6522 }, { "epoch": 2.6054313099041533, "grad_norm": 0.8690653883300302, "learning_rate": 5.178653135214811e-07, "loss": 0.0298, "step": 6524 }, { "epoch": 2.606230031948882, "grad_norm": 0.8819467380201907, "learning_rate": 5.158075941139429e-07, "loss": 0.0291, "step": 6526 }, { "epoch": 2.6070287539936103, "grad_norm": 0.9898669074800155, "learning_rate": 5.137537486255517e-07, "loss": 0.0306, "step": 6528 }, { "epoch": 2.607827476038339, "grad_norm": 1.129969353766448, "learning_rate": 5.117037788306367e-07, "loss": 0.029, "step": 6530 }, { "epoch": 2.608626198083067, "grad_norm": 1.18791037882053, "learning_rate": 5.096576865001802e-07, "loss": 0.0308, "step": 6532 }, { "epoch": 2.6094249201277955, "grad_norm": 0.9477100539787796, "learning_rate": 5.07615473401813e-07, "loss": 0.0277, "step": 6534 }, { "epoch": 2.610223642172524, "grad_norm": 0.9068341807452427, "learning_rate": 5.055771412998122e-07, "loss": 0.0289, "step": 6536 }, { "epoch": 2.6110223642172525, "grad_norm": 0.895582110683765, "learning_rate": 5.035426919551062e-07, "loss": 0.0268, "step": 6538 }, { "epoch": 2.6118210862619806, "grad_norm": 0.9258207040673696, "learning_rate": 5.015121271252659e-07, "loss": 0.0287, "step": 6540 }, { "epoch": 2.612619808306709, "grad_norm": 0.8949264605397048, "learning_rate": 4.994854485645106e-07, "loss": 0.0291, "step": 6542 }, { "epoch": 2.6134185303514377, "grad_norm": 1.0785382270595747, "learning_rate": 4.974626580236957e-07, "loss": 0.0302, "step": 6544 }, { "epoch": 2.614217252396166, "grad_norm": 1.0483244361602544, "learning_rate": 4.954437572503235e-07, "loss": 0.031, "step": 6546 }, { "epoch": 2.6150159744408947, "grad_norm": 1.028765681064945, "learning_rate": 4.934287479885336e-07, "loss": 0.0303, "step": 6548 }, { "epoch": 2.6158146964856233, "grad_norm": 0.9557621074130701, "learning_rate": 4.914176319791037e-07, "loss": 0.025, "step": 6550 }, { "epoch": 2.6166134185303513, "grad_norm": 0.8620869517168456, "learning_rate": 4.894104109594466e-07, "loss": 0.0275, "step": 6552 }, { "epoch": 2.61741214057508, "grad_norm": 0.8437067964701872, "learning_rate": 4.874070866636149e-07, "loss": 0.0274, "step": 6554 }, { "epoch": 2.6182108626198084, "grad_norm": 0.8006419089172357, "learning_rate": 4.854076608222901e-07, "loss": 0.0303, "step": 6556 }, { "epoch": 2.6190095846645365, "grad_norm": 0.8823397281540788, "learning_rate": 4.834121351627885e-07, "loss": 0.0254, "step": 6558 }, { "epoch": 2.619808306709265, "grad_norm": 0.8499615378338659, "learning_rate": 4.814205114090543e-07, "loss": 0.0239, "step": 6560 }, { "epoch": 2.6206070287539935, "grad_norm": 1.03836923861178, "learning_rate": 4.794327912816637e-07, "loss": 0.0337, "step": 6562 }, { "epoch": 2.621405750798722, "grad_norm": 1.064067294335829, "learning_rate": 4.774489764978185e-07, "loss": 0.0267, "step": 6564 }, { "epoch": 2.6222044728434506, "grad_norm": 0.9357645207898694, "learning_rate": 4.754690687713498e-07, "loss": 0.0281, "step": 6566 }, { "epoch": 2.623003194888179, "grad_norm": 0.9735104123969658, "learning_rate": 4.734930698127077e-07, "loss": 0.0294, "step": 6568 }, { "epoch": 2.623801916932907, "grad_norm": 1.0247460222068252, "learning_rate": 4.715209813289706e-07, "loss": 0.0307, "step": 6570 }, { "epoch": 2.6246006389776357, "grad_norm": 0.9357520689720981, "learning_rate": 4.695528050238368e-07, "loss": 0.031, "step": 6572 }, { "epoch": 2.6253993610223643, "grad_norm": 0.9787665294252466, "learning_rate": 4.675885425976251e-07, "loss": 0.031, "step": 6574 }, { "epoch": 2.626198083067093, "grad_norm": 1.0180864802210927, "learning_rate": 4.6562819574727304e-07, "loss": 0.0281, "step": 6576 }, { "epoch": 2.626996805111821, "grad_norm": 0.8926200488656815, "learning_rate": 4.6367176616633426e-07, "loss": 0.0295, "step": 6578 }, { "epoch": 2.6277955271565494, "grad_norm": 1.0500600630048418, "learning_rate": 4.6171925554498066e-07, "loss": 0.0333, "step": 6580 }, { "epoch": 2.628594249201278, "grad_norm": 1.182205096455108, "learning_rate": 4.597706655699974e-07, "loss": 0.0343, "step": 6582 }, { "epoch": 2.6293929712460065, "grad_norm": 0.8380383559912178, "learning_rate": 4.578259979247801e-07, "loss": 0.0249, "step": 6584 }, { "epoch": 2.630191693290735, "grad_norm": 0.9217249683454195, "learning_rate": 4.558852542893405e-07, "loss": 0.0274, "step": 6586 }, { "epoch": 2.6309904153354635, "grad_norm": 0.9654985885991184, "learning_rate": 4.539484363402963e-07, "loss": 0.0313, "step": 6588 }, { "epoch": 2.6317891373801916, "grad_norm": 0.9416775486966591, "learning_rate": 4.520155457508768e-07, "loss": 0.0278, "step": 6590 }, { "epoch": 2.63258785942492, "grad_norm": 1.0149290675646854, "learning_rate": 4.500865841909169e-07, "loss": 0.0283, "step": 6592 }, { "epoch": 2.6333865814696487, "grad_norm": 1.0066112830843752, "learning_rate": 4.4816155332685687e-07, "loss": 0.0321, "step": 6594 }, { "epoch": 2.6341853035143767, "grad_norm": 0.9399055283517556, "learning_rate": 4.462404548217414e-07, "loss": 0.0274, "step": 6596 }, { "epoch": 2.6349840255591053, "grad_norm": 0.9221492425778504, "learning_rate": 4.4432329033521903e-07, "loss": 0.0295, "step": 6598 }, { "epoch": 2.635782747603834, "grad_norm": 0.890695263755349, "learning_rate": 4.4241006152353885e-07, "loss": 0.0253, "step": 6600 }, { "epoch": 2.6365814696485623, "grad_norm": 0.9178158597505024, "learning_rate": 4.405007700395497e-07, "loss": 0.0281, "step": 6602 }, { "epoch": 2.637380191693291, "grad_norm": 1.0639158204341026, "learning_rate": 4.385954175326995e-07, "loss": 0.0264, "step": 6604 }, { "epoch": 2.6381789137380194, "grad_norm": 1.012323859666103, "learning_rate": 4.366940056490343e-07, "loss": 0.0308, "step": 6606 }, { "epoch": 2.6389776357827475, "grad_norm": 0.9727641969282357, "learning_rate": 4.3479653603119287e-07, "loss": 0.0271, "step": 6608 }, { "epoch": 2.639776357827476, "grad_norm": 0.8772595568384365, "learning_rate": 4.329030103184095e-07, "loss": 0.0259, "step": 6610 }, { "epoch": 2.6405750798722045, "grad_norm": 0.8227589976875681, "learning_rate": 4.3101343014651356e-07, "loss": 0.0277, "step": 6612 }, { "epoch": 2.641373801916933, "grad_norm": 0.9510948759982237, "learning_rate": 4.2912779714792296e-07, "loss": 0.0265, "step": 6614 }, { "epoch": 2.642172523961661, "grad_norm": 1.0097839001611448, "learning_rate": 4.2724611295164755e-07, "loss": 0.026, "step": 6616 }, { "epoch": 2.6429712460063897, "grad_norm": 0.9098862544268895, "learning_rate": 4.2536837918328353e-07, "loss": 0.0306, "step": 6618 }, { "epoch": 2.643769968051118, "grad_norm": 0.8802136378624498, "learning_rate": 4.2349459746501674e-07, "loss": 0.0249, "step": 6620 }, { "epoch": 2.6445686900958467, "grad_norm": 0.9041537710345738, "learning_rate": 4.2162476941561723e-07, "loss": 0.0291, "step": 6622 }, { "epoch": 2.6453674121405752, "grad_norm": 1.007668717045686, "learning_rate": 4.197588966504401e-07, "loss": 0.0269, "step": 6624 }, { "epoch": 2.6461661341853038, "grad_norm": 0.9464297822234001, "learning_rate": 4.178969807814237e-07, "loss": 0.0292, "step": 6626 }, { "epoch": 2.646964856230032, "grad_norm": 0.9545384276330572, "learning_rate": 4.1603902341708804e-07, "loss": 0.0315, "step": 6628 }, { "epoch": 2.6477635782747604, "grad_norm": 0.9924287706034195, "learning_rate": 4.1418502616253185e-07, "loss": 0.0293, "step": 6630 }, { "epoch": 2.648562300319489, "grad_norm": 0.8072318680535587, "learning_rate": 4.123349906194357e-07, "loss": 0.0281, "step": 6632 }, { "epoch": 2.649361022364217, "grad_norm": 0.8755579059821392, "learning_rate": 4.1048891838605386e-07, "loss": 0.0243, "step": 6634 }, { "epoch": 2.6501597444089455, "grad_norm": 1.1069033803045907, "learning_rate": 4.0864681105721895e-07, "loss": 0.031, "step": 6636 }, { "epoch": 2.650958466453674, "grad_norm": 0.9327831704930124, "learning_rate": 4.068086702243379e-07, "loss": 0.0283, "step": 6638 }, { "epoch": 2.6517571884984026, "grad_norm": 1.0816491366183623, "learning_rate": 4.0497449747539217e-07, "loss": 0.0324, "step": 6640 }, { "epoch": 2.652555910543131, "grad_norm": 0.9653480667623733, "learning_rate": 4.031442943949321e-07, "loss": 0.0271, "step": 6642 }, { "epoch": 2.6533546325878596, "grad_norm": 1.025462257232587, "learning_rate": 4.013180625640811e-07, "loss": 0.0268, "step": 6644 }, { "epoch": 2.6541533546325877, "grad_norm": 1.0073308408827422, "learning_rate": 3.994958035605323e-07, "loss": 0.0335, "step": 6646 }, { "epoch": 2.6549520766773163, "grad_norm": 0.7759996809586879, "learning_rate": 3.9767751895854467e-07, "loss": 0.0278, "step": 6648 }, { "epoch": 2.655750798722045, "grad_norm": 0.9648450344658905, "learning_rate": 3.958632103289439e-07, "loss": 0.0323, "step": 6650 }, { "epoch": 2.6565495207667733, "grad_norm": 0.8621409106150557, "learning_rate": 3.940528792391224e-07, "loss": 0.0223, "step": 6652 }, { "epoch": 2.6573482428115014, "grad_norm": 0.8191488899403426, "learning_rate": 3.9224652725303514e-07, "loss": 0.0286, "step": 6654 }, { "epoch": 2.65814696485623, "grad_norm": 1.0056039386415438, "learning_rate": 3.904441559312006e-07, "loss": 0.0311, "step": 6656 }, { "epoch": 2.6589456869009584, "grad_norm": 1.0006415392629169, "learning_rate": 3.886457668306959e-07, "loss": 0.031, "step": 6658 }, { "epoch": 2.659744408945687, "grad_norm": 1.01436188125872, "learning_rate": 3.8685136150516056e-07, "loss": 0.032, "step": 6660 }, { "epoch": 2.6605431309904155, "grad_norm": 0.9807096116942686, "learning_rate": 3.8506094150479125e-07, "loss": 0.0266, "step": 6662 }, { "epoch": 2.661341853035144, "grad_norm": 1.0204227532844772, "learning_rate": 3.8327450837634284e-07, "loss": 0.0283, "step": 6664 }, { "epoch": 2.662140575079872, "grad_norm": 0.8193622993500249, "learning_rate": 3.8149206366312365e-07, "loss": 0.0269, "step": 6666 }, { "epoch": 2.6629392971246006, "grad_norm": 0.9734485918188198, "learning_rate": 3.7971360890499686e-07, "loss": 0.0266, "step": 6668 }, { "epoch": 2.663738019169329, "grad_norm": 0.8642796045293297, "learning_rate": 3.7793914563838187e-07, "loss": 0.0251, "step": 6670 }, { "epoch": 2.6645367412140573, "grad_norm": 1.033909020021237, "learning_rate": 3.7616867539624733e-07, "loss": 0.0352, "step": 6672 }, { "epoch": 2.665335463258786, "grad_norm": 0.9998346566813847, "learning_rate": 3.7440219970811155e-07, "loss": 0.0321, "step": 6674 }, { "epoch": 2.6661341853035143, "grad_norm": 1.1423057215989467, "learning_rate": 3.7263972010004256e-07, "loss": 0.0324, "step": 6676 }, { "epoch": 2.666932907348243, "grad_norm": 0.8695749763174104, "learning_rate": 3.708812380946569e-07, "loss": 0.0256, "step": 6678 }, { "epoch": 2.6677316293929714, "grad_norm": 0.9934106793723311, "learning_rate": 3.691267552111183e-07, "loss": 0.0302, "step": 6680 }, { "epoch": 2.6685303514377, "grad_norm": 1.0381623580396713, "learning_rate": 3.67376272965132e-07, "loss": 0.0281, "step": 6682 }, { "epoch": 2.669329073482428, "grad_norm": 0.8905369484200444, "learning_rate": 3.6562979286895115e-07, "loss": 0.0255, "step": 6684 }, { "epoch": 2.6701277955271565, "grad_norm": 0.8948659099592012, "learning_rate": 3.6388731643136944e-07, "loss": 0.0252, "step": 6686 }, { "epoch": 2.670926517571885, "grad_norm": 0.9124920896558937, "learning_rate": 3.621488451577221e-07, "loss": 0.0293, "step": 6688 }, { "epoch": 2.6717252396166136, "grad_norm": 0.7777444783097159, "learning_rate": 3.60414380549885e-07, "loss": 0.0251, "step": 6690 }, { "epoch": 2.6725239616613417, "grad_norm": 0.9000009491244071, "learning_rate": 3.586839241062695e-07, "loss": 0.0254, "step": 6692 }, { "epoch": 2.67332268370607, "grad_norm": 0.9763457977314538, "learning_rate": 3.5695747732182873e-07, "loss": 0.0307, "step": 6694 }, { "epoch": 2.6741214057507987, "grad_norm": 1.1307665134525087, "learning_rate": 3.552350416880507e-07, "loss": 0.0297, "step": 6696 }, { "epoch": 2.6749201277955272, "grad_norm": 0.864801567907232, "learning_rate": 3.535166186929556e-07, "loss": 0.0296, "step": 6698 }, { "epoch": 2.6757188498402558, "grad_norm": 0.8870307933812397, "learning_rate": 3.518022098210988e-07, "loss": 0.0292, "step": 6700 }, { "epoch": 2.6765175718849843, "grad_norm": 1.0409321048802573, "learning_rate": 3.500918165535683e-07, "loss": 0.0345, "step": 6702 }, { "epoch": 2.6773162939297124, "grad_norm": 0.9716769842609737, "learning_rate": 3.483854403679832e-07, "loss": 0.0296, "step": 6704 }, { "epoch": 2.678115015974441, "grad_norm": 1.257329352898907, "learning_rate": 3.4668308273848985e-07, "loss": 0.0267, "step": 6706 }, { "epoch": 2.6789137380191694, "grad_norm": 1.0441757060258385, "learning_rate": 3.4498474513576574e-07, "loss": 0.0317, "step": 6708 }, { "epoch": 2.6797124600638975, "grad_norm": 1.1129495022310048, "learning_rate": 3.432904290270139e-07, "loss": 0.0286, "step": 6710 }, { "epoch": 2.680511182108626, "grad_norm": 0.9584352042603548, "learning_rate": 3.416001358759635e-07, "loss": 0.028, "step": 6712 }, { "epoch": 2.6813099041533546, "grad_norm": 1.1251450657365984, "learning_rate": 3.3991386714286924e-07, "loss": 0.0278, "step": 6714 }, { "epoch": 2.682108626198083, "grad_norm": 0.9969919886738752, "learning_rate": 3.382316242845074e-07, "loss": 0.0328, "step": 6716 }, { "epoch": 2.6829073482428116, "grad_norm": 1.1126055490955078, "learning_rate": 3.365534087541772e-07, "loss": 0.0277, "step": 6718 }, { "epoch": 2.68370607028754, "grad_norm": 1.0455467801276128, "learning_rate": 3.3487922200169944e-07, "loss": 0.0314, "step": 6720 }, { "epoch": 2.6845047923322682, "grad_norm": 0.9840651799523558, "learning_rate": 3.332090654734116e-07, "loss": 0.0325, "step": 6722 }, { "epoch": 2.6853035143769968, "grad_norm": 0.9993440868747537, "learning_rate": 3.315429406121723e-07, "loss": 0.0285, "step": 6724 }, { "epoch": 2.6861022364217253, "grad_norm": 1.041727821587971, "learning_rate": 3.2988084885735684e-07, "loss": 0.0351, "step": 6726 }, { "epoch": 2.686900958466454, "grad_norm": 0.979266522959519, "learning_rate": 3.2822279164485494e-07, "loss": 0.0267, "step": 6728 }, { "epoch": 2.687699680511182, "grad_norm": 0.8894076061044695, "learning_rate": 3.2656877040707247e-07, "loss": 0.0306, "step": 6730 }, { "epoch": 2.6884984025559104, "grad_norm": 1.1099161112289966, "learning_rate": 3.2491878657292643e-07, "loss": 0.029, "step": 6732 }, { "epoch": 2.689297124600639, "grad_norm": 0.7905372930822995, "learning_rate": 3.2327284156784765e-07, "loss": 0.0248, "step": 6734 }, { "epoch": 2.6900958466453675, "grad_norm": 0.8972437845079343, "learning_rate": 3.2163093681377765e-07, "loss": 0.0322, "step": 6736 }, { "epoch": 2.690894568690096, "grad_norm": 0.7811198103953365, "learning_rate": 3.1999307372916675e-07, "loss": 0.0211, "step": 6738 }, { "epoch": 2.6916932907348246, "grad_norm": 0.8481986944549397, "learning_rate": 3.183592537289748e-07, "loss": 0.0258, "step": 6740 }, { "epoch": 2.6924920127795526, "grad_norm": 0.8974957564028379, "learning_rate": 3.1672947822466714e-07, "loss": 0.0307, "step": 6742 }, { "epoch": 2.693290734824281, "grad_norm": 0.8069628364489961, "learning_rate": 3.151037486242181e-07, "loss": 0.0253, "step": 6744 }, { "epoch": 2.6940894568690097, "grad_norm": 1.0619428207925592, "learning_rate": 3.13482066332102e-07, "loss": 0.0328, "step": 6746 }, { "epoch": 2.6948881789137378, "grad_norm": 1.0230784961867097, "learning_rate": 3.1186443274930035e-07, "loss": 0.033, "step": 6748 }, { "epoch": 2.6956869009584663, "grad_norm": 0.9466613888969522, "learning_rate": 3.102508492732964e-07, "loss": 0.0271, "step": 6750 }, { "epoch": 2.696485623003195, "grad_norm": 0.9853478032894241, "learning_rate": 3.08641317298074e-07, "loss": 0.0321, "step": 6752 }, { "epoch": 2.6972843450479234, "grad_norm": 1.0460709064794675, "learning_rate": 3.07035838214117e-07, "loss": 0.0354, "step": 6754 }, { "epoch": 2.698083067092652, "grad_norm": 0.9909587345397065, "learning_rate": 3.0543441340840696e-07, "loss": 0.0324, "step": 6756 }, { "epoch": 2.6988817891373804, "grad_norm": 1.0147873081466183, "learning_rate": 3.0383704426442396e-07, "loss": 0.0325, "step": 6758 }, { "epoch": 2.6996805111821085, "grad_norm": 0.9477745651098728, "learning_rate": 3.022437321621452e-07, "loss": 0.0307, "step": 6760 }, { "epoch": 2.700479233226837, "grad_norm": 0.8576252217343889, "learning_rate": 3.006544784780413e-07, "loss": 0.0261, "step": 6762 }, { "epoch": 2.7012779552715656, "grad_norm": 0.929114631572117, "learning_rate": 2.9906928458507735e-07, "loss": 0.0298, "step": 6764 }, { "epoch": 2.702076677316294, "grad_norm": 0.8245393411728102, "learning_rate": 2.9748815185271174e-07, "loss": 0.0273, "step": 6766 }, { "epoch": 2.702875399361022, "grad_norm": 1.0143221518300005, "learning_rate": 2.959110816468935e-07, "loss": 0.0294, "step": 6768 }, { "epoch": 2.7036741214057507, "grad_norm": 1.1519123024674627, "learning_rate": 2.94338075330064e-07, "loss": 0.0287, "step": 6770 }, { "epoch": 2.7044728434504792, "grad_norm": 0.9244500535568194, "learning_rate": 2.927691342611505e-07, "loss": 0.032, "step": 6772 }, { "epoch": 2.7052715654952078, "grad_norm": 0.8604757808842944, "learning_rate": 2.9120425979557e-07, "loss": 0.0244, "step": 6774 }, { "epoch": 2.7060702875399363, "grad_norm": 0.8443290403852196, "learning_rate": 2.896434532852277e-07, "loss": 0.0266, "step": 6776 }, { "epoch": 2.706869009584665, "grad_norm": 0.9923984530744449, "learning_rate": 2.880867160785128e-07, "loss": 0.0299, "step": 6778 }, { "epoch": 2.707667731629393, "grad_norm": 0.9535512491511288, "learning_rate": 2.865340495202984e-07, "loss": 0.0289, "step": 6780 }, { "epoch": 2.7084664536741214, "grad_norm": 0.9510201264153861, "learning_rate": 2.849854549519426e-07, "loss": 0.0294, "step": 6782 }, { "epoch": 2.70926517571885, "grad_norm": 0.9033130404655279, "learning_rate": 2.834409337112842e-07, "loss": 0.0273, "step": 6784 }, { "epoch": 2.710063897763578, "grad_norm": 1.0548477729978563, "learning_rate": 2.8190048713264586e-07, "loss": 0.0311, "step": 6786 }, { "epoch": 2.7108626198083066, "grad_norm": 0.9948422314656845, "learning_rate": 2.8036411654682627e-07, "loss": 0.0304, "step": 6788 }, { "epoch": 2.711661341853035, "grad_norm": 0.8822930619200026, "learning_rate": 2.7883182328110494e-07, "loss": 0.0281, "step": 6790 }, { "epoch": 2.7124600638977636, "grad_norm": 1.0705668513390736, "learning_rate": 2.7730360865923954e-07, "loss": 0.0291, "step": 6792 }, { "epoch": 2.713258785942492, "grad_norm": 0.9950549300332445, "learning_rate": 2.75779474001463e-07, "loss": 0.0306, "step": 6794 }, { "epoch": 2.7140575079872207, "grad_norm": 0.9852425718587328, "learning_rate": 2.7425942062448254e-07, "loss": 0.0291, "step": 6796 }, { "epoch": 2.7148562300319488, "grad_norm": 0.9530301493388222, "learning_rate": 2.727434498414827e-07, "loss": 0.0305, "step": 6798 }, { "epoch": 2.7156549520766773, "grad_norm": 0.9617585154543935, "learning_rate": 2.712315629621176e-07, "loss": 0.0304, "step": 6800 }, { "epoch": 2.716453674121406, "grad_norm": 0.8160240654532492, "learning_rate": 2.697237612925169e-07, "loss": 0.0254, "step": 6802 }, { "epoch": 2.7172523961661343, "grad_norm": 1.0732849620331193, "learning_rate": 2.682200461352763e-07, "loss": 0.029, "step": 6804 }, { "epoch": 2.7180511182108624, "grad_norm": 1.1148345917930245, "learning_rate": 2.6672041878946507e-07, "loss": 0.0364, "step": 6806 }, { "epoch": 2.718849840255591, "grad_norm": 0.9845373546577002, "learning_rate": 2.6522488055062076e-07, "loss": 0.0323, "step": 6808 }, { "epoch": 2.7196485623003195, "grad_norm": 0.9694597420527851, "learning_rate": 2.6373343271074657e-07, "loss": 0.0305, "step": 6810 }, { "epoch": 2.720447284345048, "grad_norm": 0.8865423882611635, "learning_rate": 2.6224607655831236e-07, "loss": 0.0301, "step": 6812 }, { "epoch": 2.7212460063897765, "grad_norm": 1.007509492035804, "learning_rate": 2.607628133782536e-07, "loss": 0.0306, "step": 6814 }, { "epoch": 2.722044728434505, "grad_norm": 0.8870706562836438, "learning_rate": 2.5928364445196975e-07, "loss": 0.0294, "step": 6816 }, { "epoch": 2.722843450479233, "grad_norm": 0.9708567758541311, "learning_rate": 2.578085710573247e-07, "loss": 0.028, "step": 6818 }, { "epoch": 2.7236421725239617, "grad_norm": 0.8934830296672568, "learning_rate": 2.563375944686397e-07, "loss": 0.0306, "step": 6820 }, { "epoch": 2.72444089456869, "grad_norm": 1.072610973986754, "learning_rate": 2.548707159567021e-07, "loss": 0.0288, "step": 6822 }, { "epoch": 2.7252396166134183, "grad_norm": 0.8222963836412455, "learning_rate": 2.534079367887549e-07, "loss": 0.0278, "step": 6824 }, { "epoch": 2.726038338658147, "grad_norm": 0.9085074348725313, "learning_rate": 2.519492582285027e-07, "loss": 0.0304, "step": 6826 }, { "epoch": 2.7268370607028753, "grad_norm": 0.9177610792802552, "learning_rate": 2.504946815361065e-07, "loss": 0.0331, "step": 6828 }, { "epoch": 2.727635782747604, "grad_norm": 1.094130825800624, "learning_rate": 2.4904420796818097e-07, "loss": 0.0303, "step": 6830 }, { "epoch": 2.7284345047923324, "grad_norm": 1.0338426208891864, "learning_rate": 2.475978387778e-07, "loss": 0.0311, "step": 6832 }, { "epoch": 2.729233226837061, "grad_norm": 0.8384072672060133, "learning_rate": 2.461555752144912e-07, "loss": 0.0292, "step": 6834 }, { "epoch": 2.730031948881789, "grad_norm": 1.015906477269788, "learning_rate": 2.447174185242324e-07, "loss": 0.0277, "step": 6836 }, { "epoch": 2.7308306709265175, "grad_norm": 0.9215903866822917, "learning_rate": 2.432833699494558e-07, "loss": 0.0277, "step": 6838 }, { "epoch": 2.731629392971246, "grad_norm": 0.913267265555896, "learning_rate": 2.4185343072904376e-07, "loss": 0.0248, "step": 6840 }, { "epoch": 2.7324281150159746, "grad_norm": 0.9333269872247636, "learning_rate": 2.404276020983304e-07, "loss": 0.0292, "step": 6842 }, { "epoch": 2.7332268370607027, "grad_norm": 1.0187672549108133, "learning_rate": 2.3900588528909475e-07, "loss": 0.0344, "step": 6844 }, { "epoch": 2.734025559105431, "grad_norm": 0.9419705337564647, "learning_rate": 2.375882815295677e-07, "loss": 0.0286, "step": 6846 }, { "epoch": 2.7348242811501597, "grad_norm": 0.918071642194504, "learning_rate": 2.3617479204442462e-07, "loss": 0.028, "step": 6848 }, { "epoch": 2.7356230031948883, "grad_norm": 1.0220943478986433, "learning_rate": 2.3476541805478647e-07, "loss": 0.0287, "step": 6850 }, { "epoch": 2.736421725239617, "grad_norm": 1.0904072022439066, "learning_rate": 2.3336016077822154e-07, "loss": 0.0296, "step": 6852 }, { "epoch": 2.737220447284345, "grad_norm": 0.9166980890980327, "learning_rate": 2.3195902142873593e-07, "loss": 0.0269, "step": 6854 }, { "epoch": 2.7380191693290734, "grad_norm": 0.9240202525664635, "learning_rate": 2.305620012167853e-07, "loss": 0.028, "step": 6856 }, { "epoch": 2.738817891373802, "grad_norm": 0.9499828140577764, "learning_rate": 2.2916910134926197e-07, "loss": 0.0273, "step": 6858 }, { "epoch": 2.7396166134185305, "grad_norm": 1.0046665742939103, "learning_rate": 2.2778032302949948e-07, "loss": 0.0299, "step": 6860 }, { "epoch": 2.7404153354632586, "grad_norm": 0.9346433823459767, "learning_rate": 2.2639566745727203e-07, "loss": 0.0301, "step": 6862 }, { "epoch": 2.741214057507987, "grad_norm": 1.0474839299661065, "learning_rate": 2.2501513582879108e-07, "loss": 0.035, "step": 6864 }, { "epoch": 2.7420127795527156, "grad_norm": 0.9326384987743596, "learning_rate": 2.236387293367054e-07, "loss": 0.0267, "step": 6866 }, { "epoch": 2.742811501597444, "grad_norm": 0.9513219854071373, "learning_rate": 2.2226644917010153e-07, "loss": 0.0299, "step": 6868 }, { "epoch": 2.7436102236421727, "grad_norm": 0.8426342593529366, "learning_rate": 2.208982965144979e-07, "loss": 0.0245, "step": 6870 }, { "epoch": 2.744408945686901, "grad_norm": 0.967982836150597, "learning_rate": 2.1953427255185122e-07, "loss": 0.0333, "step": 6872 }, { "epoch": 2.7452076677316293, "grad_norm": 0.9628141955997789, "learning_rate": 2.1817437846054889e-07, "loss": 0.032, "step": 6874 }, { "epoch": 2.746006389776358, "grad_norm": 0.9095860682951128, "learning_rate": 2.1681861541541117e-07, "loss": 0.027, "step": 6876 }, { "epoch": 2.7468051118210863, "grad_norm": 0.9999018913590277, "learning_rate": 2.1546698458768888e-07, "loss": 0.0266, "step": 6878 }, { "epoch": 2.747603833865815, "grad_norm": 0.9155500721532669, "learning_rate": 2.1411948714506414e-07, "loss": 0.0248, "step": 6880 }, { "epoch": 2.748402555910543, "grad_norm": 0.899083057917597, "learning_rate": 2.1277612425164796e-07, "loss": 0.0291, "step": 6882 }, { "epoch": 2.7492012779552715, "grad_norm": 1.0507417274391886, "learning_rate": 2.1143689706797809e-07, "loss": 0.0304, "step": 6884 }, { "epoch": 2.75, "grad_norm": 0.9237015927908853, "learning_rate": 2.101018067510213e-07, "loss": 0.0288, "step": 6886 }, { "epoch": 2.7507987220447285, "grad_norm": 0.9396213034157446, "learning_rate": 2.0877085445416889e-07, "loss": 0.0313, "step": 6888 }, { "epoch": 2.751597444089457, "grad_norm": 0.893230887911447, "learning_rate": 2.0744404132723882e-07, "loss": 0.0271, "step": 6890 }, { "epoch": 2.752396166134185, "grad_norm": 1.1227883203425046, "learning_rate": 2.0612136851647258e-07, "loss": 0.028, "step": 6892 }, { "epoch": 2.7531948881789137, "grad_norm": 1.1000266312956004, "learning_rate": 2.0480283716453388e-07, "loss": 0.0316, "step": 6894 }, { "epoch": 2.753993610223642, "grad_norm": 1.0036859264242648, "learning_rate": 2.034884484105093e-07, "loss": 0.032, "step": 6896 }, { "epoch": 2.7547923322683707, "grad_norm": 0.8664572975169675, "learning_rate": 2.0217820338990723e-07, "loss": 0.0293, "step": 6898 }, { "epoch": 2.755591054313099, "grad_norm": 1.152106392290739, "learning_rate": 2.0087210323465555e-07, "loss": 0.0271, "step": 6900 }, { "epoch": 2.7563897763578273, "grad_norm": 0.9072929731158895, "learning_rate": 1.9957014907310224e-07, "loss": 0.0297, "step": 6902 }, { "epoch": 2.757188498402556, "grad_norm": 0.899746462852363, "learning_rate": 1.98272342030012e-07, "loss": 0.0269, "step": 6904 }, { "epoch": 2.7579872204472844, "grad_norm": 0.9831117034435982, "learning_rate": 1.96978683226568e-07, "loss": 0.0286, "step": 6906 }, { "epoch": 2.758785942492013, "grad_norm": 0.9654610196040313, "learning_rate": 1.9568917378037012e-07, "loss": 0.0286, "step": 6908 }, { "epoch": 2.7595846645367414, "grad_norm": 0.8702316945781633, "learning_rate": 1.9440381480543115e-07, "loss": 0.0292, "step": 6910 }, { "epoch": 2.7603833865814695, "grad_norm": 0.8701781601681666, "learning_rate": 1.9312260741218114e-07, "loss": 0.0296, "step": 6912 }, { "epoch": 2.761182108626198, "grad_norm": 1.0906874136348748, "learning_rate": 1.9184555270746198e-07, "loss": 0.032, "step": 6914 }, { "epoch": 2.7619808306709266, "grad_norm": 0.8789894864802608, "learning_rate": 1.9057265179452945e-07, "loss": 0.0283, "step": 6916 }, { "epoch": 2.762779552715655, "grad_norm": 0.8289933518344175, "learning_rate": 1.8930390577304836e-07, "loss": 0.0254, "step": 6918 }, { "epoch": 2.763578274760383, "grad_norm": 1.109173584946323, "learning_rate": 1.8803931573909584e-07, "loss": 0.0325, "step": 6920 }, { "epoch": 2.7643769968051117, "grad_norm": 0.8437516857169298, "learning_rate": 1.8677888278515854e-07, "loss": 0.0275, "step": 6922 }, { "epoch": 2.7651757188498403, "grad_norm": 0.957579586236523, "learning_rate": 1.8552260800013266e-07, "loss": 0.0324, "step": 6924 }, { "epoch": 2.765974440894569, "grad_norm": 1.133247804875847, "learning_rate": 1.8427049246932005e-07, "loss": 0.0303, "step": 6926 }, { "epoch": 2.7667731629392973, "grad_norm": 0.9042200415102326, "learning_rate": 1.8302253727443041e-07, "loss": 0.0261, "step": 6928 }, { "epoch": 2.7675718849840254, "grad_norm": 1.0109916146672329, "learning_rate": 1.817787434935797e-07, "loss": 0.0317, "step": 6930 }, { "epoch": 2.768370607028754, "grad_norm": 0.8886867588028137, "learning_rate": 1.805391122012884e-07, "loss": 0.0307, "step": 6932 }, { "epoch": 2.7691693290734825, "grad_norm": 0.9881453864615531, "learning_rate": 1.7930364446848035e-07, "loss": 0.0282, "step": 6934 }, { "epoch": 2.769968051118211, "grad_norm": 0.9753621627206346, "learning_rate": 1.7807234136248296e-07, "loss": 0.0316, "step": 6936 }, { "epoch": 2.770766773162939, "grad_norm": 0.9729298132394265, "learning_rate": 1.7684520394702697e-07, "loss": 0.0322, "step": 6938 }, { "epoch": 2.7715654952076676, "grad_norm": 1.0067163359277183, "learning_rate": 1.7562223328224327e-07, "loss": 0.0273, "step": 6940 }, { "epoch": 2.772364217252396, "grad_norm": 1.014978444523247, "learning_rate": 1.7440343042466225e-07, "loss": 0.0222, "step": 6942 }, { "epoch": 2.7731629392971247, "grad_norm": 0.9727814177226888, "learning_rate": 1.731887964272144e-07, "loss": 0.0284, "step": 6944 }, { "epoch": 2.773961661341853, "grad_norm": 0.8674877180013327, "learning_rate": 1.7197833233922933e-07, "loss": 0.0266, "step": 6946 }, { "epoch": 2.7747603833865817, "grad_norm": 0.92093939870497, "learning_rate": 1.7077203920643548e-07, "loss": 0.0277, "step": 6948 }, { "epoch": 2.77555910543131, "grad_norm": 1.0354340580208805, "learning_rate": 1.695699180709537e-07, "loss": 0.0318, "step": 6950 }, { "epoch": 2.7763578274760383, "grad_norm": 0.8959582446725475, "learning_rate": 1.6837196997130434e-07, "loss": 0.0273, "step": 6952 }, { "epoch": 2.777156549520767, "grad_norm": 0.9082092205680304, "learning_rate": 1.671781959424018e-07, "loss": 0.0283, "step": 6954 }, { "epoch": 2.777955271565495, "grad_norm": 0.8526839728174191, "learning_rate": 1.6598859701555448e-07, "loss": 0.0279, "step": 6956 }, { "epoch": 2.7787539936102235, "grad_norm": 0.9183824299916817, "learning_rate": 1.648031742184619e-07, "loss": 0.0221, "step": 6958 }, { "epoch": 2.779552715654952, "grad_norm": 0.8785486078394016, "learning_rate": 1.6362192857521942e-07, "loss": 0.028, "step": 6960 }, { "epoch": 2.7803514376996805, "grad_norm": 0.984747118195936, "learning_rate": 1.6244486110631062e-07, "loss": 0.0343, "step": 6962 }, { "epoch": 2.781150159744409, "grad_norm": 0.8833396654406691, "learning_rate": 1.6127197282861106e-07, "loss": 0.0275, "step": 6964 }, { "epoch": 2.7819488817891376, "grad_norm": 0.8586218999503099, "learning_rate": 1.6010326475538628e-07, "loss": 0.0267, "step": 6966 }, { "epoch": 2.7827476038338657, "grad_norm": 1.1132610814922481, "learning_rate": 1.5893873789628812e-07, "loss": 0.0357, "step": 6968 }, { "epoch": 2.783546325878594, "grad_norm": 1.0377705902735281, "learning_rate": 1.5777839325735955e-07, "loss": 0.0348, "step": 6970 }, { "epoch": 2.7843450479233227, "grad_norm": 0.8553477275906, "learning_rate": 1.5662223184102876e-07, "loss": 0.0282, "step": 6972 }, { "epoch": 2.7851437699680512, "grad_norm": 0.9887534929150167, "learning_rate": 1.55470254646109e-07, "loss": 0.0339, "step": 6974 }, { "epoch": 2.7859424920127793, "grad_norm": 0.8463180888040372, "learning_rate": 1.5432246266780083e-07, "loss": 0.0265, "step": 6976 }, { "epoch": 2.786741214057508, "grad_norm": 0.9279956838643539, "learning_rate": 1.5317885689768775e-07, "loss": 0.028, "step": 6978 }, { "epoch": 2.7875399361022364, "grad_norm": 0.898362447794585, "learning_rate": 1.520394383237378e-07, "loss": 0.0254, "step": 6980 }, { "epoch": 2.788338658146965, "grad_norm": 0.999242247390689, "learning_rate": 1.5090420793030025e-07, "loss": 0.026, "step": 6982 }, { "epoch": 2.7891373801916934, "grad_norm": 1.0069528441967504, "learning_rate": 1.4977316669810782e-07, "loss": 0.0299, "step": 6984 }, { "epoch": 2.789936102236422, "grad_norm": 0.9597592645048728, "learning_rate": 1.4864631560427277e-07, "loss": 0.0286, "step": 6986 }, { "epoch": 2.79073482428115, "grad_norm": 0.8133798258927686, "learning_rate": 1.4752365562228865e-07, "loss": 0.0256, "step": 6988 }, { "epoch": 2.7915335463258786, "grad_norm": 1.0299858930198784, "learning_rate": 1.4640518772202794e-07, "loss": 0.0287, "step": 6990 }, { "epoch": 2.792332268370607, "grad_norm": 0.8756875752154023, "learning_rate": 1.4529091286973994e-07, "loss": 0.0285, "step": 6992 }, { "epoch": 2.793130990415335, "grad_norm": 0.9409132612586905, "learning_rate": 1.4418083202805467e-07, "loss": 0.031, "step": 6994 }, { "epoch": 2.7939297124600637, "grad_norm": 0.9200301064033846, "learning_rate": 1.4307494615597716e-07, "loss": 0.0311, "step": 6996 }, { "epoch": 2.7947284345047922, "grad_norm": 0.8145829376320072, "learning_rate": 1.4197325620888714e-07, "loss": 0.0246, "step": 6998 }, { "epoch": 2.7955271565495208, "grad_norm": 0.9865390880493666, "learning_rate": 1.4087576313854212e-07, "loss": 0.0296, "step": 7000 }, { "epoch": 2.7955271565495208, "eval_loss": 0.1819150149822235, "eval_runtime": 417.4017, "eval_samples_per_second": 42.662, "eval_steps_per_second": 5.333, "step": 7000 }, { "epoch": 2.7963258785942493, "grad_norm": 1.0878362509592698, "learning_rate": 1.397824678930715e-07, "loss": 0.0269, "step": 7002 }, { "epoch": 2.797124600638978, "grad_norm": 0.8833659039797074, "learning_rate": 1.386933714169797e-07, "loss": 0.0295, "step": 7004 }, { "epoch": 2.797923322683706, "grad_norm": 1.198555362781041, "learning_rate": 1.3760847465114413e-07, "loss": 0.0231, "step": 7006 }, { "epoch": 2.7987220447284344, "grad_norm": 0.9766022877177375, "learning_rate": 1.365277785328123e-07, "loss": 0.0316, "step": 7008 }, { "epoch": 2.799520766773163, "grad_norm": 0.9931479598265185, "learning_rate": 1.3545128399560349e-07, "loss": 0.0307, "step": 7010 }, { "epoch": 2.8003194888178915, "grad_norm": 0.9470485404305614, "learning_rate": 1.3437899196950765e-07, "loss": 0.03, "step": 7012 }, { "epoch": 2.8011182108626196, "grad_norm": 1.0425827226683049, "learning_rate": 1.3331090338088437e-07, "loss": 0.0291, "step": 7014 }, { "epoch": 2.801916932907348, "grad_norm": 0.9479599981229176, "learning_rate": 1.3224701915246053e-07, "loss": 0.033, "step": 7016 }, { "epoch": 2.8027156549520766, "grad_norm": 1.0722352133209674, "learning_rate": 1.3118734020333257e-07, "loss": 0.0305, "step": 7018 }, { "epoch": 2.803514376996805, "grad_norm": 1.1283019296777128, "learning_rate": 1.3013186744896323e-07, "loss": 0.0322, "step": 7020 }, { "epoch": 2.8043130990415337, "grad_norm": 1.0443979108148196, "learning_rate": 1.2908060180118088e-07, "loss": 0.0279, "step": 7022 }, { "epoch": 2.8051118210862622, "grad_norm": 0.9204477323838135, "learning_rate": 1.280335441681796e-07, "loss": 0.026, "step": 7024 }, { "epoch": 2.8059105431309903, "grad_norm": 1.091376167660413, "learning_rate": 1.2699069545451858e-07, "loss": 0.0303, "step": 7026 }, { "epoch": 2.806709265175719, "grad_norm": 0.8186223224524324, "learning_rate": 1.2595205656112164e-07, "loss": 0.0251, "step": 7028 }, { "epoch": 2.8075079872204474, "grad_norm": 0.9426669872933783, "learning_rate": 1.2491762838527376e-07, "loss": 0.0278, "step": 7030 }, { "epoch": 2.8083067092651754, "grad_norm": 0.8562334740689089, "learning_rate": 1.2388741182062348e-07, "loss": 0.0254, "step": 7032 }, { "epoch": 2.809105431309904, "grad_norm": 1.0003749268195539, "learning_rate": 1.2286140775718048e-07, "loss": 0.0254, "step": 7034 }, { "epoch": 2.8099041533546325, "grad_norm": 0.9287476537267818, "learning_rate": 1.2183961708131574e-07, "loss": 0.0331, "step": 7036 }, { "epoch": 2.810702875399361, "grad_norm": 1.0100009275296533, "learning_rate": 1.2082204067576043e-07, "loss": 0.0339, "step": 7038 }, { "epoch": 2.8115015974440896, "grad_norm": 1.0019967241849932, "learning_rate": 1.198086794196035e-07, "loss": 0.034, "step": 7040 }, { "epoch": 2.812300319488818, "grad_norm": 1.0274376107046983, "learning_rate": 1.187995341882947e-07, "loss": 0.0296, "step": 7042 }, { "epoch": 2.813099041533546, "grad_norm": 0.9758492633679886, "learning_rate": 1.1779460585363945e-07, "loss": 0.0304, "step": 7044 }, { "epoch": 2.8138977635782747, "grad_norm": 1.0055929014699148, "learning_rate": 1.1679389528380159e-07, "loss": 0.0285, "step": 7046 }, { "epoch": 2.8146964856230032, "grad_norm": 0.9825352483369219, "learning_rate": 1.1579740334330014e-07, "loss": 0.0279, "step": 7048 }, { "epoch": 2.8154952076677318, "grad_norm": 0.9663886663279524, "learning_rate": 1.1480513089301037e-07, "loss": 0.0238, "step": 7050 }, { "epoch": 2.81629392971246, "grad_norm": 0.7632311003499087, "learning_rate": 1.1381707879016158e-07, "loss": 0.0228, "step": 7052 }, { "epoch": 2.8170926517571884, "grad_norm": 0.8411185807947373, "learning_rate": 1.1283324788833872e-07, "loss": 0.0261, "step": 7054 }, { "epoch": 2.817891373801917, "grad_norm": 1.0106383088967026, "learning_rate": 1.1185363903747748e-07, "loss": 0.0249, "step": 7056 }, { "epoch": 2.8186900958466454, "grad_norm": 1.0568473872652466, "learning_rate": 1.1087825308386812e-07, "loss": 0.0328, "step": 7058 }, { "epoch": 2.819488817891374, "grad_norm": 1.0600837943890258, "learning_rate": 1.0990709087015217e-07, "loss": 0.0288, "step": 7060 }, { "epoch": 2.8202875399361025, "grad_norm": 0.9659386588248836, "learning_rate": 1.0894015323532181e-07, "loss": 0.0294, "step": 7062 }, { "epoch": 2.8210862619808306, "grad_norm": 0.9822944918742355, "learning_rate": 1.0797744101472052e-07, "loss": 0.0298, "step": 7064 }, { "epoch": 2.821884984025559, "grad_norm": 0.9411392883906924, "learning_rate": 1.0701895504004022e-07, "loss": 0.0268, "step": 7066 }, { "epoch": 2.8226837060702876, "grad_norm": 0.8205930442237354, "learning_rate": 1.0606469613932247e-07, "loss": 0.0285, "step": 7068 }, { "epoch": 2.8234824281150157, "grad_norm": 0.9585034609165559, "learning_rate": 1.0511466513695778e-07, "loss": 0.0295, "step": 7070 }, { "epoch": 2.8242811501597442, "grad_norm": 1.017339428173637, "learning_rate": 1.0416886285368188e-07, "loss": 0.032, "step": 7072 }, { "epoch": 2.8250798722044728, "grad_norm": 0.9480053562427759, "learning_rate": 1.032272901065795e-07, "loss": 0.0276, "step": 7074 }, { "epoch": 2.8258785942492013, "grad_norm": 0.9241183013217804, "learning_rate": 1.0228994770908052e-07, "loss": 0.0313, "step": 7076 }, { "epoch": 2.82667731629393, "grad_norm": 0.8600565904245382, "learning_rate": 1.0135683647096107e-07, "loss": 0.0312, "step": 7078 }, { "epoch": 2.8274760383386583, "grad_norm": 0.857256000234177, "learning_rate": 1.0042795719833964e-07, "loss": 0.0304, "step": 7080 }, { "epoch": 2.8282747603833864, "grad_norm": 0.9257782290418327, "learning_rate": 9.950331069368102e-08, "loss": 0.029, "step": 7082 }, { "epoch": 2.829073482428115, "grad_norm": 0.9014008656073322, "learning_rate": 9.858289775579289e-08, "loss": 0.0277, "step": 7084 }, { "epoch": 2.8298722044728435, "grad_norm": 0.9335945085748736, "learning_rate": 9.766671917982529e-08, "loss": 0.0258, "step": 7086 }, { "epoch": 2.830670926517572, "grad_norm": 0.893406276113889, "learning_rate": 9.675477575726954e-08, "loss": 0.0286, "step": 7088 }, { "epoch": 2.8314696485623, "grad_norm": 0.9135616915468692, "learning_rate": 9.58470682759588e-08, "loss": 0.0267, "step": 7090 }, { "epoch": 2.8322683706070286, "grad_norm": 0.8914005786007674, "learning_rate": 9.494359752006687e-08, "loss": 0.0267, "step": 7092 }, { "epoch": 2.833067092651757, "grad_norm": 0.9868182622898214, "learning_rate": 9.404436427010777e-08, "loss": 0.0259, "step": 7094 }, { "epoch": 2.8338658146964857, "grad_norm": 0.8736365212101689, "learning_rate": 9.314936930293283e-08, "loss": 0.0238, "step": 7096 }, { "epoch": 2.834664536741214, "grad_norm": 0.8341663421385362, "learning_rate": 9.225861339173415e-08, "loss": 0.0296, "step": 7098 }, { "epoch": 2.8354632587859427, "grad_norm": 0.9404555958593751, "learning_rate": 9.137209730604113e-08, "loss": 0.029, "step": 7100 }, { "epoch": 2.836261980830671, "grad_norm": 0.9105942982702757, "learning_rate": 9.048982181171895e-08, "loss": 0.0289, "step": 7102 }, { "epoch": 2.8370607028753994, "grad_norm": 1.0814449020759576, "learning_rate": 8.961178767097178e-08, "loss": 0.0308, "step": 7104 }, { "epoch": 2.837859424920128, "grad_norm": 0.8932338308675243, "learning_rate": 8.873799564233676e-08, "loss": 0.0301, "step": 7106 }, { "epoch": 2.838658146964856, "grad_norm": 1.012162689411644, "learning_rate": 8.786844648068837e-08, "loss": 0.0313, "step": 7108 }, { "epoch": 2.8394568690095845, "grad_norm": 1.0902329946901537, "learning_rate": 8.700314093723572e-08, "loss": 0.0315, "step": 7110 }, { "epoch": 2.840255591054313, "grad_norm": 0.9095047254271645, "learning_rate": 8.614207975952083e-08, "loss": 0.0311, "step": 7112 }, { "epoch": 2.8410543130990416, "grad_norm": 0.9218237866635856, "learning_rate": 8.528526369141809e-08, "loss": 0.0302, "step": 7114 }, { "epoch": 2.84185303514377, "grad_norm": 0.8660219136797084, "learning_rate": 8.443269347313765e-08, "loss": 0.0265, "step": 7116 }, { "epoch": 2.8426517571884986, "grad_norm": 0.9364401145220109, "learning_rate": 8.358436984121865e-08, "loss": 0.0273, "step": 7118 }, { "epoch": 2.8434504792332267, "grad_norm": 1.023124638752514, "learning_rate": 8.274029352853264e-08, "loss": 0.0276, "step": 7120 }, { "epoch": 2.844249201277955, "grad_norm": 0.8561164210287661, "learning_rate": 8.190046526428241e-08, "loss": 0.0291, "step": 7122 }, { "epoch": 2.8450479233226837, "grad_norm": 0.8438795870319646, "learning_rate": 8.106488577399985e-08, "loss": 0.0264, "step": 7124 }, { "epoch": 2.8458466453674123, "grad_norm": 0.9601387385173464, "learning_rate": 8.02335557795464e-08, "loss": 0.0268, "step": 7126 }, { "epoch": 2.8466453674121404, "grad_norm": 1.0416344807221194, "learning_rate": 7.940647599911477e-08, "loss": 0.0274, "step": 7128 }, { "epoch": 2.847444089456869, "grad_norm": 1.030804565632412, "learning_rate": 7.858364714722122e-08, "loss": 0.0297, "step": 7130 }, { "epoch": 2.8482428115015974, "grad_norm": 0.848403943487444, "learning_rate": 7.776506993471323e-08, "loss": 0.0262, "step": 7132 }, { "epoch": 2.849041533546326, "grad_norm": 0.9248426395931749, "learning_rate": 7.695074506876566e-08, "loss": 0.0275, "step": 7134 }, { "epoch": 2.8498402555910545, "grad_norm": 1.0380277597027192, "learning_rate": 7.614067325287632e-08, "loss": 0.029, "step": 7136 }, { "epoch": 2.850638977635783, "grad_norm": 1.0174264329449134, "learning_rate": 7.533485518687211e-08, "loss": 0.0262, "step": 7138 }, { "epoch": 2.851437699680511, "grad_norm": 0.9035723830180614, "learning_rate": 7.453329156690337e-08, "loss": 0.0312, "step": 7140 }, { "epoch": 2.8522364217252396, "grad_norm": 0.8948154240149504, "learning_rate": 7.373598308544505e-08, "loss": 0.0286, "step": 7142 }, { "epoch": 2.853035143769968, "grad_norm": 0.8767549377755169, "learning_rate": 7.294293043129785e-08, "loss": 0.0271, "step": 7144 }, { "epoch": 2.8538338658146962, "grad_norm": 1.0359127327227582, "learning_rate": 7.215413428958263e-08, "loss": 0.0299, "step": 7146 }, { "epoch": 2.8546325878594248, "grad_norm": 1.054388868239098, "learning_rate": 7.136959534174592e-08, "loss": 0.0295, "step": 7148 }, { "epoch": 2.8554313099041533, "grad_norm": 0.9105886167898716, "learning_rate": 7.058931426555449e-08, "loss": 0.0278, "step": 7150 }, { "epoch": 2.856230031948882, "grad_norm": 0.9273333490997623, "learning_rate": 6.981329173509909e-08, "loss": 0.0287, "step": 7152 }, { "epoch": 2.8570287539936103, "grad_norm": 0.8079326225611323, "learning_rate": 6.904152842078848e-08, "loss": 0.0269, "step": 7154 }, { "epoch": 2.857827476038339, "grad_norm": 0.9892975697892658, "learning_rate": 6.827402498935377e-08, "loss": 0.0336, "step": 7156 }, { "epoch": 2.858626198083067, "grad_norm": 0.9535750532619426, "learning_rate": 6.75107821038462e-08, "loss": 0.029, "step": 7158 }, { "epoch": 2.8594249201277955, "grad_norm": 0.8407797191652399, "learning_rate": 6.675180042363505e-08, "loss": 0.0254, "step": 7160 }, { "epoch": 2.860223642172524, "grad_norm": 1.0055696204335296, "learning_rate": 6.599708060440857e-08, "loss": 0.0291, "step": 7162 }, { "epoch": 2.8610223642172525, "grad_norm": 0.860800590976241, "learning_rate": 6.524662329817411e-08, "loss": 0.0244, "step": 7164 }, { "epoch": 2.8618210862619806, "grad_norm": 0.9470630865172524, "learning_rate": 6.450042915325527e-08, "loss": 0.0288, "step": 7166 }, { "epoch": 2.862619808306709, "grad_norm": 0.917809030684146, "learning_rate": 6.375849881429418e-08, "loss": 0.0278, "step": 7168 }, { "epoch": 2.8634185303514377, "grad_norm": 0.9384143679170448, "learning_rate": 6.302083292224814e-08, "loss": 0.0312, "step": 7170 }, { "epoch": 2.864217252396166, "grad_norm": 0.8768613951600329, "learning_rate": 6.22874321143907e-08, "loss": 0.0249, "step": 7172 }, { "epoch": 2.8650159744408947, "grad_norm": 0.7174846049452049, "learning_rate": 6.15582970243117e-08, "loss": 0.0223, "step": 7174 }, { "epoch": 2.8658146964856233, "grad_norm": 1.0161897086560874, "learning_rate": 6.083342828191453e-08, "loss": 0.0269, "step": 7176 }, { "epoch": 2.8666134185303513, "grad_norm": 0.9854394127230478, "learning_rate": 6.011282651341655e-08, "loss": 0.0288, "step": 7178 }, { "epoch": 2.86741214057508, "grad_norm": 0.901696634188199, "learning_rate": 5.9396492341351475e-08, "loss": 0.0276, "step": 7180 }, { "epoch": 2.8682108626198084, "grad_norm": 0.9841680067526405, "learning_rate": 5.868442638456373e-08, "loss": 0.0288, "step": 7182 }, { "epoch": 2.8690095846645365, "grad_norm": 0.9134522070907273, "learning_rate": 5.797662925821068e-08, "loss": 0.03, "step": 7184 }, { "epoch": 2.869808306709265, "grad_norm": 1.0297579858038646, "learning_rate": 5.7273101573762644e-08, "loss": 0.0297, "step": 7186 }, { "epoch": 2.8706070287539935, "grad_norm": 1.002668230554201, "learning_rate": 5.6573843939001224e-08, "loss": 0.0321, "step": 7188 }, { "epoch": 2.871405750798722, "grad_norm": 0.8448908330163806, "learning_rate": 5.5878856958018755e-08, "loss": 0.0275, "step": 7190 }, { "epoch": 2.8722044728434506, "grad_norm": 0.8951090877529749, "learning_rate": 5.518814123121885e-08, "loss": 0.028, "step": 7192 }, { "epoch": 2.873003194888179, "grad_norm": 0.8887400334865615, "learning_rate": 5.450169735531419e-08, "loss": 0.0266, "step": 7194 }, { "epoch": 2.873801916932907, "grad_norm": 0.9527424771255477, "learning_rate": 5.381952592332762e-08, "loss": 0.0292, "step": 7196 }, { "epoch": 2.8746006389776357, "grad_norm": 0.9236990470777641, "learning_rate": 5.3141627524591066e-08, "loss": 0.0268, "step": 7198 }, { "epoch": 2.8753993610223643, "grad_norm": 0.99547523531349, "learning_rate": 5.246800274474439e-08, "loss": 0.0287, "step": 7200 }, { "epoch": 2.876198083067093, "grad_norm": 1.0395745492495505, "learning_rate": 5.179865216573654e-08, "loss": 0.0272, "step": 7202 }, { "epoch": 2.876996805111821, "grad_norm": 0.9239013292846188, "learning_rate": 5.1133576365823277e-08, "loss": 0.03, "step": 7204 }, { "epoch": 2.8777955271565494, "grad_norm": 0.8510701957695702, "learning_rate": 5.047277591956668e-08, "loss": 0.0237, "step": 7206 }, { "epoch": 2.878594249201278, "grad_norm": 1.0550009694911764, "learning_rate": 4.981625139783619e-08, "loss": 0.0264, "step": 7208 }, { "epoch": 2.8793929712460065, "grad_norm": 0.8651740589602106, "learning_rate": 4.916400336780758e-08, "loss": 0.0289, "step": 7210 }, { "epoch": 2.880191693290735, "grad_norm": 0.9833410681522953, "learning_rate": 4.851603239296065e-08, "loss": 0.0263, "step": 7212 }, { "epoch": 2.8809904153354635, "grad_norm": 0.9156057914045268, "learning_rate": 4.787233903308208e-08, "loss": 0.0266, "step": 7214 }, { "epoch": 2.8817891373801916, "grad_norm": 0.9881205371437639, "learning_rate": 4.723292384426203e-08, "loss": 0.0295, "step": 7216 }, { "epoch": 2.88258785942492, "grad_norm": 0.9124860856419786, "learning_rate": 4.65977873788942e-08, "loss": 0.0287, "step": 7218 }, { "epoch": 2.8833865814696487, "grad_norm": 0.9969073107367281, "learning_rate": 4.596693018567744e-08, "loss": 0.0288, "step": 7220 }, { "epoch": 2.8841853035143767, "grad_norm": 0.8841157317264496, "learning_rate": 4.534035280961191e-08, "loss": 0.0309, "step": 7222 }, { "epoch": 2.8849840255591053, "grad_norm": 1.0078494252596897, "learning_rate": 4.471805579200239e-08, "loss": 0.0312, "step": 7224 }, { "epoch": 2.885782747603834, "grad_norm": 0.9932395040774775, "learning_rate": 4.41000396704544e-08, "loss": 0.0283, "step": 7226 }, { "epoch": 2.8865814696485623, "grad_norm": 0.9777222302672404, "learning_rate": 4.3486304978875294e-08, "loss": 0.0286, "step": 7228 }, { "epoch": 2.887380191693291, "grad_norm": 0.9488138150582978, "learning_rate": 4.287685224747373e-08, "loss": 0.0287, "step": 7230 }, { "epoch": 2.8881789137380194, "grad_norm": 0.9515551976834303, "learning_rate": 4.227168200276077e-08, "loss": 0.0301, "step": 7232 }, { "epoch": 2.8889776357827475, "grad_norm": 1.0020004748739546, "learning_rate": 4.167079476754432e-08, "loss": 0.0303, "step": 7234 }, { "epoch": 2.889776357827476, "grad_norm": 0.9940040887434322, "learning_rate": 4.1074191060935794e-08, "loss": 0.0314, "step": 7236 }, { "epoch": 2.8905750798722045, "grad_norm": 0.9574123979978048, "learning_rate": 4.048187139834403e-08, "loss": 0.0266, "step": 7238 }, { "epoch": 2.891373801916933, "grad_norm": 0.9872954533945656, "learning_rate": 3.989383629147747e-08, "loss": 0.0305, "step": 7240 }, { "epoch": 2.892172523961661, "grad_norm": 1.093299647415339, "learning_rate": 3.9310086248342536e-08, "loss": 0.0309, "step": 7242 }, { "epoch": 2.8929712460063897, "grad_norm": 0.9207625460483227, "learning_rate": 3.873062177324472e-08, "loss": 0.0293, "step": 7244 }, { "epoch": 2.893769968051118, "grad_norm": 0.8613042189991472, "learning_rate": 3.8155443366785786e-08, "loss": 0.0278, "step": 7246 }, { "epoch": 2.8945686900958467, "grad_norm": 0.9093281736899324, "learning_rate": 3.758455152586715e-08, "loss": 0.0273, "step": 7248 }, { "epoch": 2.8953674121405752, "grad_norm": 0.8976668145886221, "learning_rate": 3.7017946743683754e-08, "loss": 0.0279, "step": 7250 }, { "epoch": 2.8961661341853038, "grad_norm": 0.9438489705868893, "learning_rate": 3.645562950973014e-08, "loss": 0.0288, "step": 7252 }, { "epoch": 2.896964856230032, "grad_norm": 0.9039303228109454, "learning_rate": 3.589760030979439e-08, "loss": 0.029, "step": 7254 }, { "epoch": 2.8977635782747604, "grad_norm": 0.8665135816874094, "learning_rate": 3.534385962596143e-08, "loss": 0.0257, "step": 7256 }, { "epoch": 2.898562300319489, "grad_norm": 0.9885112089994623, "learning_rate": 3.479440793661082e-08, "loss": 0.0287, "step": 7258 }, { "epoch": 2.899361022364217, "grad_norm": 0.9313125684039982, "learning_rate": 3.4249245716417303e-08, "loss": 0.0311, "step": 7260 }, { "epoch": 2.9001597444089455, "grad_norm": 1.0211354686014047, "learning_rate": 3.370837343634914e-08, "loss": 0.0281, "step": 7262 }, { "epoch": 2.900958466453674, "grad_norm": 0.9261447629517585, "learning_rate": 3.3171791563669785e-08, "loss": 0.0285, "step": 7264 }, { "epoch": 2.9017571884984026, "grad_norm": 0.9543015423065228, "learning_rate": 3.263950056193455e-08, "loss": 0.0285, "step": 7266 }, { "epoch": 2.902555910543131, "grad_norm": 0.876741688352455, "learning_rate": 3.211150089099224e-08, "loss": 0.0261, "step": 7268 }, { "epoch": 2.9033546325878596, "grad_norm": 0.9732009390231579, "learning_rate": 3.1587793006985224e-08, "loss": 0.0308, "step": 7270 }, { "epoch": 2.9041533546325877, "grad_norm": 0.924537711179532, "learning_rate": 3.10683773623488e-08, "loss": 0.028, "step": 7272 }, { "epoch": 2.9049520766773163, "grad_norm": 0.8328671231098053, "learning_rate": 3.055325440580736e-08, "loss": 0.0231, "step": 7274 }, { "epoch": 2.905750798722045, "grad_norm": 0.8481571211750596, "learning_rate": 3.004242458237994e-08, "loss": 0.029, "step": 7276 }, { "epoch": 2.9065495207667733, "grad_norm": 1.0264649887625952, "learning_rate": 2.9535888333374064e-08, "loss": 0.0316, "step": 7278 }, { "epoch": 2.9073482428115014, "grad_norm": 0.9904821324548735, "learning_rate": 2.9033646096390255e-08, "loss": 0.0317, "step": 7280 }, { "epoch": 2.90814696485623, "grad_norm": 0.9529343344181515, "learning_rate": 2.853569830531755e-08, "loss": 0.0294, "step": 7282 }, { "epoch": 2.9089456869009584, "grad_norm": 0.8950285806155511, "learning_rate": 2.8042045390336835e-08, "loss": 0.027, "step": 7284 }, { "epoch": 2.909744408945687, "grad_norm": 0.9915085372273561, "learning_rate": 2.7552687777916976e-08, "loss": 0.029, "step": 7286 }, { "epoch": 2.9105431309904155, "grad_norm": 1.0843075888116502, "learning_rate": 2.706762589081646e-08, "loss": 0.0329, "step": 7288 }, { "epoch": 2.911341853035144, "grad_norm": 1.0723208662350674, "learning_rate": 2.6586860148084537e-08, "loss": 0.0314, "step": 7290 }, { "epoch": 2.912140575079872, "grad_norm": 0.9855410494694304, "learning_rate": 2.6110390965055632e-08, "loss": 0.0332, "step": 7292 }, { "epoch": 2.9129392971246006, "grad_norm": 0.9456950838662169, "learning_rate": 2.563821875335437e-08, "loss": 0.0306, "step": 7294 }, { "epoch": 2.913738019169329, "grad_norm": 0.9431752923514247, "learning_rate": 2.517034392089446e-08, "loss": 0.0242, "step": 7296 }, { "epoch": 2.9145367412140573, "grad_norm": 0.9335986279046339, "learning_rate": 2.4706766871874232e-08, "loss": 0.029, "step": 7298 }, { "epoch": 2.915335463258786, "grad_norm": 0.772170384658901, "learning_rate": 2.4247488006781116e-08, "loss": 0.0259, "step": 7300 }, { "epoch": 2.9161341853035143, "grad_norm": 0.9779269003823648, "learning_rate": 2.3792507722388835e-08, "loss": 0.0301, "step": 7302 }, { "epoch": 2.916932907348243, "grad_norm": 0.9611794936794353, "learning_rate": 2.3341826411756863e-08, "loss": 0.0305, "step": 7304 }, { "epoch": 2.9177316293929714, "grad_norm": 0.8596768995810088, "learning_rate": 2.2895444464232087e-08, "loss": 0.0279, "step": 7306 }, { "epoch": 2.9185303514377, "grad_norm": 1.050550890425119, "learning_rate": 2.2453362265445477e-08, "loss": 0.0266, "step": 7308 }, { "epoch": 2.919329073482428, "grad_norm": 1.0172145059261652, "learning_rate": 2.2015580197314868e-08, "loss": 0.0295, "step": 7310 }, { "epoch": 2.9201277955271565, "grad_norm": 0.8631988032830935, "learning_rate": 2.158209863804217e-08, "loss": 0.0258, "step": 7312 }, { "epoch": 2.920926517571885, "grad_norm": 1.0172194609395517, "learning_rate": 2.1152917962115606e-08, "loss": 0.0311, "step": 7314 }, { "epoch": 2.9217252396166136, "grad_norm": 0.8656946266189468, "learning_rate": 2.0728038540305807e-08, "loss": 0.025, "step": 7316 }, { "epoch": 2.9225239616613417, "grad_norm": 1.2040154733832662, "learning_rate": 2.030746073966916e-08, "loss": 0.0299, "step": 7318 }, { "epoch": 2.92332268370607, "grad_norm": 0.981108582856345, "learning_rate": 1.9891184923544472e-08, "loss": 0.0312, "step": 7320 }, { "epoch": 2.9241214057507987, "grad_norm": 1.0050547681748903, "learning_rate": 1.9479211451555735e-08, "loss": 0.0306, "step": 7322 }, { "epoch": 2.9249201277955272, "grad_norm": 0.9723436269119015, "learning_rate": 1.9071540679608815e-08, "loss": 0.0274, "step": 7324 }, { "epoch": 2.9257188498402558, "grad_norm": 0.890152909346874, "learning_rate": 1.8668172959891985e-08, "loss": 0.0286, "step": 7326 }, { "epoch": 2.9265175718849843, "grad_norm": 1.0167534719731643, "learning_rate": 1.826910864087761e-08, "loss": 0.0293, "step": 7328 }, { "epoch": 2.9273162939297124, "grad_norm": 0.9875484380951924, "learning_rate": 1.7874348067319912e-08, "loss": 0.031, "step": 7330 }, { "epoch": 2.928115015974441, "grad_norm": 1.1309464473632505, "learning_rate": 1.7483891580253877e-08, "loss": 0.0304, "step": 7332 }, { "epoch": 2.9289137380191694, "grad_norm": 0.8279929400769935, "learning_rate": 1.7097739516997447e-08, "loss": 0.0268, "step": 7334 }, { "epoch": 2.9297124600638975, "grad_norm": 0.9993434381726024, "learning_rate": 1.6715892211150442e-08, "loss": 0.0286, "step": 7336 }, { "epoch": 2.930511182108626, "grad_norm": 1.007539806056888, "learning_rate": 1.6338349992591763e-08, "loss": 0.0294, "step": 7338 }, { "epoch": 2.9313099041533546, "grad_norm": 1.0452749714129563, "learning_rate": 1.5965113187482174e-08, "loss": 0.0328, "step": 7340 }, { "epoch": 2.932108626198083, "grad_norm": 0.8474586458371822, "learning_rate": 1.5596182118264303e-08, "loss": 0.0256, "step": 7342 }, { "epoch": 2.9329073482428116, "grad_norm": 0.9287019035012101, "learning_rate": 1.5231557103658755e-08, "loss": 0.0286, "step": 7344 }, { "epoch": 2.93370607028754, "grad_norm": 1.0230975340136683, "learning_rate": 1.4871238458667447e-08, "loss": 0.0308, "step": 7346 }, { "epoch": 2.9345047923322682, "grad_norm": 0.9679221553199847, "learning_rate": 1.4515226494571376e-08, "loss": 0.0279, "step": 7348 }, { "epoch": 2.9353035143769968, "grad_norm": 1.0129002341137234, "learning_rate": 1.41635215189323e-08, "loss": 0.0243, "step": 7350 }, { "epoch": 2.9361022364217253, "grad_norm": 1.159004848542131, "learning_rate": 1.3816123835588835e-08, "loss": 0.0309, "step": 7352 }, { "epoch": 2.936900958466454, "grad_norm": 1.0452986083737144, "learning_rate": 1.3473033744660358e-08, "loss": 0.0297, "step": 7354 }, { "epoch": 2.937699680511182, "grad_norm": 0.944187394021172, "learning_rate": 1.3134251542544774e-08, "loss": 0.0255, "step": 7356 }, { "epoch": 2.9384984025559104, "grad_norm": 1.1423229379322075, "learning_rate": 1.2799777521916856e-08, "loss": 0.0306, "step": 7358 }, { "epoch": 2.939297124600639, "grad_norm": 0.9042757983900201, "learning_rate": 1.2469611971731576e-08, "loss": 0.028, "step": 7360 }, { "epoch": 2.9400958466453675, "grad_norm": 1.219554574679699, "learning_rate": 1.2143755177220774e-08, "loss": 0.0292, "step": 7362 }, { "epoch": 2.940894568690096, "grad_norm": 1.0160171362099817, "learning_rate": 1.1822207419893151e-08, "loss": 0.0316, "step": 7364 }, { "epoch": 2.9416932907348246, "grad_norm": 0.9410768507733369, "learning_rate": 1.1504968977536502e-08, "loss": 0.0311, "step": 7366 }, { "epoch": 2.9424920127795526, "grad_norm": 0.9541185215999987, "learning_rate": 1.1192040124214931e-08, "loss": 0.0251, "step": 7368 }, { "epoch": 2.943290734824281, "grad_norm": 1.1401434841727258, "learning_rate": 1.0883421130268857e-08, "loss": 0.0389, "step": 7370 }, { "epoch": 2.9440894568690097, "grad_norm": 0.9102452503551742, "learning_rate": 1.0579112262316116e-08, "loss": 0.0299, "step": 7372 }, { "epoch": 2.9448881789137378, "grad_norm": 0.8297271919791628, "learning_rate": 1.027911378325086e-08, "loss": 0.0266, "step": 7374 }, { "epoch": 2.9456869009584663, "grad_norm": 0.977055359117076, "learning_rate": 9.983425952243552e-09, "loss": 0.0275, "step": 7376 }, { "epoch": 2.946485623003195, "grad_norm": 0.8856220307196366, "learning_rate": 9.692049024740968e-09, "loss": 0.0273, "step": 7378 }, { "epoch": 2.9472843450479234, "grad_norm": 1.0185133421121422, "learning_rate": 9.404983252464528e-09, "loss": 0.0308, "step": 7380 }, { "epoch": 2.948083067092652, "grad_norm": 0.9226722623416709, "learning_rate": 9.12222888341252e-09, "loss": 0.0278, "step": 7382 }, { "epoch": 2.9488817891373804, "grad_norm": 0.8681408122208755, "learning_rate": 8.84378616185788e-09, "loss": 0.0289, "step": 7384 }, { "epoch": 2.9496805111821085, "grad_norm": 0.9820066250150342, "learning_rate": 8.569655328349302e-09, "loss": 0.0306, "step": 7386 }, { "epoch": 2.950479233226837, "grad_norm": 0.9224388253014746, "learning_rate": 8.299836619709011e-09, "loss": 0.0252, "step": 7388 }, { "epoch": 2.9512779552715656, "grad_norm": 1.078281742555588, "learning_rate": 8.034330269034995e-09, "loss": 0.0339, "step": 7390 }, { "epoch": 2.952076677316294, "grad_norm": 1.1121638039606885, "learning_rate": 7.773136505700995e-09, "loss": 0.0315, "step": 7392 }, { "epoch": 2.952875399361022, "grad_norm": 0.8527091048044436, "learning_rate": 7.516255555352069e-09, "loss": 0.0277, "step": 7394 }, { "epoch": 2.9536741214057507, "grad_norm": 0.9156141320301706, "learning_rate": 7.2636876399107e-09, "loss": 0.0269, "step": 7396 }, { "epoch": 2.9544728434504792, "grad_norm": 1.3378705563366167, "learning_rate": 7.015432977570679e-09, "loss": 0.0277, "step": 7398 }, { "epoch": 2.9552715654952078, "grad_norm": 0.9035834422975698, "learning_rate": 6.7714917828004545e-09, "loss": 0.0281, "step": 7400 }, { "epoch": 2.9560702875399363, "grad_norm": 1.0788469833239718, "learning_rate": 6.531864266343113e-09, "loss": 0.0311, "step": 7402 }, { "epoch": 2.956869009584665, "grad_norm": 1.0030626593801413, "learning_rate": 6.296550635213616e-09, "loss": 0.0282, "step": 7404 }, { "epoch": 2.957667731629393, "grad_norm": 0.9736720908524453, "learning_rate": 6.0655510927010165e-09, "loss": 0.0309, "step": 7406 }, { "epoch": 2.9584664536741214, "grad_norm": 0.976953889137446, "learning_rate": 5.838865838366792e-09, "loss": 0.03, "step": 7408 }, { "epoch": 2.95926517571885, "grad_norm": 0.96316953344491, "learning_rate": 5.616495068046513e-09, "loss": 0.0306, "step": 7410 }, { "epoch": 2.960063897763578, "grad_norm": 0.8882325546800076, "learning_rate": 5.398438973845954e-09, "loss": 0.0328, "step": 7412 }, { "epoch": 2.9608626198083066, "grad_norm": 1.0434465347465718, "learning_rate": 5.184697744146094e-09, "loss": 0.0296, "step": 7414 }, { "epoch": 2.961661341853035, "grad_norm": 1.1676463120664913, "learning_rate": 4.975271563599227e-09, "loss": 0.0342, "step": 7416 }, { "epoch": 2.9624600638977636, "grad_norm": 1.125598033675333, "learning_rate": 4.770160613129515e-09, "loss": 0.0305, "step": 7418 }, { "epoch": 2.963258785942492, "grad_norm": 0.7862908310858415, "learning_rate": 4.569365069933551e-09, "loss": 0.0256, "step": 7420 }, { "epoch": 2.9640575079872207, "grad_norm": 0.9443569000718535, "learning_rate": 4.372885107479796e-09, "loss": 0.0306, "step": 7422 }, { "epoch": 2.9648562300319488, "grad_norm": 0.800637449098191, "learning_rate": 4.180720895508028e-09, "loss": 0.0293, "step": 7424 }, { "epoch": 2.9656549520766773, "grad_norm": 0.9430328576301563, "learning_rate": 3.992872600030451e-09, "loss": 0.0271, "step": 7426 }, { "epoch": 2.966453674121406, "grad_norm": 0.9058280169647789, "learning_rate": 3.809340383330584e-09, "loss": 0.0265, "step": 7428 }, { "epoch": 2.9672523961661343, "grad_norm": 0.976846038410627, "learning_rate": 3.630124403961599e-09, "loss": 0.0308, "step": 7430 }, { "epoch": 2.9680511182108624, "grad_norm": 0.9428661317946987, "learning_rate": 3.4552248167507576e-09, "loss": 0.0271, "step": 7432 }, { "epoch": 2.968849840255591, "grad_norm": 0.9860305014063514, "learning_rate": 3.284641772793862e-09, "loss": 0.0299, "step": 7434 }, { "epoch": 2.9696485623003195, "grad_norm": 0.8472915574856456, "learning_rate": 3.118375419458586e-09, "loss": 0.0231, "step": 7436 }, { "epoch": 2.970447284345048, "grad_norm": 0.9416785407994653, "learning_rate": 2.956425900383919e-09, "loss": 0.0333, "step": 7438 }, { "epoch": 2.9712460063897765, "grad_norm": 0.7967253694302693, "learning_rate": 2.798793355478502e-09, "loss": 0.03, "step": 7440 }, { "epoch": 2.972044728434505, "grad_norm": 0.9101321750296127, "learning_rate": 2.6454779209217353e-09, "loss": 0.0288, "step": 7442 }, { "epoch": 2.972843450479233, "grad_norm": 0.960448163452539, "learning_rate": 2.496479729164891e-09, "loss": 0.0307, "step": 7444 }, { "epoch": 2.9736421725239617, "grad_norm": 0.9953358583136821, "learning_rate": 2.3517989089272274e-09, "loss": 0.027, "step": 7446 }, { "epoch": 2.97444089456869, "grad_norm": 0.9262672112649574, "learning_rate": 2.2114355851993175e-09, "loss": 0.0252, "step": 7448 }, { "epoch": 2.9752396166134183, "grad_norm": 1.0161598052132237, "learning_rate": 2.0753898792424954e-09, "loss": 0.03, "step": 7450 }, { "epoch": 2.976038338658147, "grad_norm": 0.8238758679662858, "learning_rate": 1.943661908586636e-09, "loss": 0.0226, "step": 7452 }, { "epoch": 2.9768370607028753, "grad_norm": 1.1294195598874084, "learning_rate": 1.81625178703293e-09, "loss": 0.0314, "step": 7454 }, { "epoch": 2.977635782747604, "grad_norm": 0.7948759771645315, "learning_rate": 1.6931596246516636e-09, "loss": 0.0237, "step": 7456 }, { "epoch": 2.9784345047923324, "grad_norm": 0.9603638334498134, "learning_rate": 1.5743855277822185e-09, "loss": 0.0271, "step": 7458 }, { "epoch": 2.979233226837061, "grad_norm": 1.004902038270927, "learning_rate": 1.4599295990352924e-09, "loss": 0.0274, "step": 7460 }, { "epoch": 2.980031948881789, "grad_norm": 1.3727183415621365, "learning_rate": 1.3497919372890135e-09, "loss": 0.0299, "step": 7462 }, { "epoch": 2.9808306709265175, "grad_norm": 1.0113889053741916, "learning_rate": 1.24397263769227e-09, "loss": 0.0334, "step": 7464 }, { "epoch": 2.981629392971246, "grad_norm": 1.058301608314339, "learning_rate": 1.1424717916630468e-09, "loss": 0.0314, "step": 7466 }, { "epoch": 2.9824281150159746, "grad_norm": 1.0685945832080852, "learning_rate": 1.0452894868884235e-09, "loss": 0.0287, "step": 7468 }, { "epoch": 2.9832268370607027, "grad_norm": 0.9439988444316789, "learning_rate": 9.52425807324575e-10, "loss": 0.0279, "step": 7470 }, { "epoch": 2.984025559105431, "grad_norm": 1.0539650853101212, "learning_rate": 8.638808331973281e-10, "loss": 0.0306, "step": 7472 }, { "epoch": 2.9848242811501597, "grad_norm": 0.8544922347586041, "learning_rate": 7.796546410004934e-10, "loss": 0.0302, "step": 7474 }, { "epoch": 2.9856230031948883, "grad_norm": 0.9303223740142429, "learning_rate": 6.997473034986435e-10, "loss": 0.0281, "step": 7476 }, { "epoch": 2.986421725239617, "grad_norm": 0.9400919377169601, "learning_rate": 6.241588897232253e-10, "loss": 0.0276, "step": 7478 }, { "epoch": 2.987220447284345, "grad_norm": 0.9420254495627837, "learning_rate": 5.528894649758921e-10, "loss": 0.0252, "step": 7480 }, { "epoch": 2.9880191693290734, "grad_norm": 0.8844462225772538, "learning_rate": 4.859390908268369e-10, "loss": 0.0271, "step": 7482 }, { "epoch": 2.988817891373802, "grad_norm": 0.9000038358376025, "learning_rate": 4.2330782511423865e-10, "loss": 0.0291, "step": 7484 }, { "epoch": 2.9896166134185305, "grad_norm": 1.0116302989209616, "learning_rate": 3.649957219464817e-10, "loss": 0.0312, "step": 7486 }, { "epoch": 2.9904153354632586, "grad_norm": 1.0168619061466087, "learning_rate": 3.1100283169938074e-10, "loss": 0.0257, "step": 7488 }, { "epoch": 2.991214057507987, "grad_norm": 0.9317368624862982, "learning_rate": 2.613292010172908e-10, "loss": 0.0326, "step": 7490 }, { "epoch": 2.9920127795527156, "grad_norm": 1.0576362124675651, "learning_rate": 2.1597487281366236e-10, "loss": 0.0279, "step": 7492 }, { "epoch": 2.992811501597444, "grad_norm": 0.9509971386881086, "learning_rate": 1.7493988627104164e-10, "loss": 0.0318, "step": 7494 }, { "epoch": 2.9936102236421727, "grad_norm": 0.8370706486653864, "learning_rate": 1.3822427683884975e-10, "loss": 0.0273, "step": 7496 }, { "epoch": 2.994408945686901, "grad_norm": 1.0612898069636711, "learning_rate": 1.0582807623671365e-10, "loss": 0.0352, "step": 7498 }, { "epoch": 2.9952076677316293, "grad_norm": 0.805238580681461, "learning_rate": 7.775131245169044e-11, "loss": 0.0258, "step": 7500 }, { "epoch": 2.9952076677316293, "eval_loss": 0.18183551728725433, "eval_runtime": 419.4635, "eval_samples_per_second": 42.452, "eval_steps_per_second": 5.307, "step": 7500 }, { "epoch": 2.996006389776358, "grad_norm": 0.8616048488279973, "learning_rate": 5.399400973882252e-11, "loss": 0.0255, "step": 7502 }, { "epoch": 2.9968051118210863, "grad_norm": 0.8828142582638174, "learning_rate": 3.4556188622802964e-11, "loss": 0.0298, "step": 7504 }, { "epoch": 2.997603833865815, "grad_norm": 1.0530041672485366, "learning_rate": 1.9437865895755027e-11, "loss": 0.0281, "step": 7506 }, { "epoch": 2.998402555910543, "grad_norm": 0.8820484624604644, "learning_rate": 8.639054618897468e-12, "loss": 0.0258, "step": 7508 }, { "epoch": 2.9992012779552715, "grad_norm": 1.0521391566214802, "learning_rate": 2.1597641214343714e-12, "loss": 0.028, "step": 7510 }, { "epoch": 3.0, "grad_norm": 1.0637471844248951, "learning_rate": 0.0, "loss": 0.0258, "step": 7512 }, { "epoch": 3.0, "step": 7512, "total_flos": 730578500321280.0, "train_loss": 0.11755168595261023, "train_runtime": 63778.2421, "train_samples_per_second": 7.538, "train_steps_per_second": 0.118 } ], "logging_steps": 2, "max_steps": 7512, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 730578500321280.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }