diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26454 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 7512, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007987220447284345, + "grad_norm": 63.611212563977595, + "learning_rate": 2.6595744680851065e-08, + "loss": 1.2894, + "step": 2 + }, + { + "epoch": 0.001597444089456869, + "grad_norm": 105.73905369340336, + "learning_rate": 5.319148936170213e-08, + "loss": 1.1515, + "step": 4 + }, + { + "epoch": 0.0023961661341853034, + "grad_norm": 97.48653856616731, + "learning_rate": 7.978723404255319e-08, + "loss": 1.1725, + "step": 6 + }, + { + "epoch": 0.003194888178913738, + "grad_norm": 113.1691721809818, + "learning_rate": 1.0638297872340426e-07, + "loss": 1.2009, + "step": 8 + }, + { + "epoch": 0.003993610223642172, + "grad_norm": 234.6545065934056, + "learning_rate": 1.3297872340425533e-07, + "loss": 1.2124, + "step": 10 + }, + { + "epoch": 0.004792332268370607, + "grad_norm": 56.50421981052306, + "learning_rate": 1.5957446808510638e-07, + "loss": 1.2292, + "step": 12 + }, + { + "epoch": 0.005591054313099041, + "grad_norm": 67.76208027590963, + "learning_rate": 1.8617021276595745e-07, + "loss": 1.1731, + "step": 14 + }, + { + "epoch": 0.006389776357827476, + "grad_norm": 19.773545305324223, + "learning_rate": 2.1276595744680852e-07, + "loss": 1.1682, + "step": 16 + }, + { + "epoch": 0.00718849840255591, + "grad_norm": 108.48956883104074, + "learning_rate": 2.393617021276596e-07, + "loss": 1.1885, + "step": 18 + }, + { + "epoch": 0.007987220447284345, + "grad_norm": 74.594447029578, + "learning_rate": 2.6595744680851066e-07, + "loss": 1.1143, + "step": 20 + }, + { + "epoch": 0.00878594249201278, + "grad_norm": 193.76064469208063, + "learning_rate": 2.9255319148936174e-07, + "loss": 1.1366, + "step": 22 + }, + { + "epoch": 0.009584664536741214, + "grad_norm": 165.84272450497056, + "learning_rate": 3.1914893617021275e-07, + "loss": 1.1753, + "step": 24 + }, + { + "epoch": 0.010383386581469648, + "grad_norm": 150.10218395823307, + "learning_rate": 3.457446808510639e-07, + "loss": 1.1228, + "step": 26 + }, + { + "epoch": 0.011182108626198083, + "grad_norm": 25.354463533053462, + "learning_rate": 3.723404255319149e-07, + "loss": 1.138, + "step": 28 + }, + { + "epoch": 0.011980830670926517, + "grad_norm": 106.72660838009219, + "learning_rate": 3.98936170212766e-07, + "loss": 1.1287, + "step": 30 + }, + { + "epoch": 0.012779552715654952, + "grad_norm": 83.97368303722932, + "learning_rate": 4.2553191489361704e-07, + "loss": 1.0818, + "step": 32 + }, + { + "epoch": 0.013578274760383386, + "grad_norm": 72.42974677624225, + "learning_rate": 4.5212765957446816e-07, + "loss": 0.974, + "step": 34 + }, + { + "epoch": 0.01437699680511182, + "grad_norm": 98.949142462491, + "learning_rate": 4.787234042553192e-07, + "loss": 1.0153, + "step": 36 + }, + { + "epoch": 0.015175718849840255, + "grad_norm": 81.09462884756049, + "learning_rate": 5.053191489361702e-07, + "loss": 1.0437, + "step": 38 + }, + { + "epoch": 0.01597444089456869, + "grad_norm": 71.5460678669271, + "learning_rate": 5.319148936170213e-07, + "loss": 0.9027, + "step": 40 + }, + { + "epoch": 0.016773162939297124, + "grad_norm": 97.98883013825395, + "learning_rate": 5.585106382978723e-07, + "loss": 0.8252, + "step": 42 + }, + { + "epoch": 0.01757188498402556, + "grad_norm": 130.2877391749827, + "learning_rate": 5.851063829787235e-07, + "loss": 0.7532, + "step": 44 + }, + { + "epoch": 0.018370607028753993, + "grad_norm": 98.81399161595317, + "learning_rate": 6.117021276595745e-07, + "loss": 0.7013, + "step": 46 + }, + { + "epoch": 0.019169329073482427, + "grad_norm": 22.295409656345377, + "learning_rate": 6.382978723404255e-07, + "loss": 0.6467, + "step": 48 + }, + { + "epoch": 0.019968051118210862, + "grad_norm": 34.582920133699176, + "learning_rate": 6.648936170212766e-07, + "loss": 0.6351, + "step": 50 + }, + { + "epoch": 0.020766773162939296, + "grad_norm": 50.861520008988386, + "learning_rate": 6.914893617021278e-07, + "loss": 0.5912, + "step": 52 + }, + { + "epoch": 0.02156549520766773, + "grad_norm": 49.12926837947303, + "learning_rate": 7.180851063829789e-07, + "loss": 0.5413, + "step": 54 + }, + { + "epoch": 0.022364217252396165, + "grad_norm": 20.811523309346935, + "learning_rate": 7.446808510638298e-07, + "loss": 0.479, + "step": 56 + }, + { + "epoch": 0.0231629392971246, + "grad_norm": 12.332350980787394, + "learning_rate": 7.712765957446809e-07, + "loss": 0.4714, + "step": 58 + }, + { + "epoch": 0.023961661341853034, + "grad_norm": 15.193345562611738, + "learning_rate": 7.97872340425532e-07, + "loss": 0.4622, + "step": 60 + }, + { + "epoch": 0.02476038338658147, + "grad_norm": 5.637530351411472, + "learning_rate": 8.24468085106383e-07, + "loss": 0.4079, + "step": 62 + }, + { + "epoch": 0.025559105431309903, + "grad_norm": 11.657766039541796, + "learning_rate": 8.510638297872341e-07, + "loss": 0.3716, + "step": 64 + }, + { + "epoch": 0.026357827476038338, + "grad_norm": 8.003062600948699, + "learning_rate": 8.776595744680852e-07, + "loss": 0.4022, + "step": 66 + }, + { + "epoch": 0.027156549520766772, + "grad_norm": 9.863778660497967, + "learning_rate": 9.042553191489363e-07, + "loss": 0.4061, + "step": 68 + }, + { + "epoch": 0.027955271565495207, + "grad_norm": 26.781259322239684, + "learning_rate": 9.308510638297872e-07, + "loss": 0.3638, + "step": 70 + }, + { + "epoch": 0.02875399361022364, + "grad_norm": 5.445843816912823, + "learning_rate": 9.574468085106384e-07, + "loss": 0.3581, + "step": 72 + }, + { + "epoch": 0.029552715654952075, + "grad_norm": 9.213363331667637, + "learning_rate": 9.840425531914895e-07, + "loss": 0.3702, + "step": 74 + }, + { + "epoch": 0.03035143769968051, + "grad_norm": 4.727611360243214, + "learning_rate": 1.0106382978723404e-06, + "loss": 0.3487, + "step": 76 + }, + { + "epoch": 0.031150159744408944, + "grad_norm": 18.19406307616779, + "learning_rate": 1.0372340425531915e-06, + "loss": 0.3222, + "step": 78 + }, + { + "epoch": 0.03194888178913738, + "grad_norm": 9.595831277572767, + "learning_rate": 1.0638297872340427e-06, + "loss": 0.3258, + "step": 80 + }, + { + "epoch": 0.03274760383386582, + "grad_norm": 35.38120873285774, + "learning_rate": 1.0904255319148938e-06, + "loss": 0.3055, + "step": 82 + }, + { + "epoch": 0.03354632587859425, + "grad_norm": 5.084938749131369, + "learning_rate": 1.1170212765957447e-06, + "loss": 0.3174, + "step": 84 + }, + { + "epoch": 0.034345047923322686, + "grad_norm": 7.402514872898367, + "learning_rate": 1.1436170212765958e-06, + "loss": 0.3097, + "step": 86 + }, + { + "epoch": 0.03514376996805112, + "grad_norm": 3.949621968931367, + "learning_rate": 1.170212765957447e-06, + "loss": 0.3152, + "step": 88 + }, + { + "epoch": 0.035942492012779555, + "grad_norm": 10.814942895226139, + "learning_rate": 1.196808510638298e-06, + "loss": 0.3037, + "step": 90 + }, + { + "epoch": 0.036741214057507986, + "grad_norm": 14.354678286624155, + "learning_rate": 1.223404255319149e-06, + "loss": 0.3261, + "step": 92 + }, + { + "epoch": 0.037539936102236424, + "grad_norm": 6.168542351180327, + "learning_rate": 1.25e-06, + "loss": 0.31, + "step": 94 + }, + { + "epoch": 0.038338658146964855, + "grad_norm": 11.539898124756117, + "learning_rate": 1.276595744680851e-06, + "loss": 0.311, + "step": 96 + }, + { + "epoch": 0.03913738019169329, + "grad_norm": 4.013221758787171, + "learning_rate": 1.3031914893617024e-06, + "loss": 0.2996, + "step": 98 + }, + { + "epoch": 0.039936102236421724, + "grad_norm": 3.860547140813227, + "learning_rate": 1.3297872340425533e-06, + "loss": 0.2763, + "step": 100 + }, + { + "epoch": 0.04073482428115016, + "grad_norm": 17.85966416524752, + "learning_rate": 1.3563829787234042e-06, + "loss": 0.2904, + "step": 102 + }, + { + "epoch": 0.04153354632587859, + "grad_norm": 16.697164957447292, + "learning_rate": 1.3829787234042555e-06, + "loss": 0.2832, + "step": 104 + }, + { + "epoch": 0.04233226837060703, + "grad_norm": 25.51601683379472, + "learning_rate": 1.4095744680851064e-06, + "loss": 0.2861, + "step": 106 + }, + { + "epoch": 0.04313099041533546, + "grad_norm": 43.70792394022941, + "learning_rate": 1.4361702127659578e-06, + "loss": 0.2882, + "step": 108 + }, + { + "epoch": 0.0439297124600639, + "grad_norm": 21.39234312256687, + "learning_rate": 1.4627659574468087e-06, + "loss": 0.2868, + "step": 110 + }, + { + "epoch": 0.04472843450479233, + "grad_norm": 4.299149147995996, + "learning_rate": 1.4893617021276596e-06, + "loss": 0.2802, + "step": 112 + }, + { + "epoch": 0.04552715654952077, + "grad_norm": 4.853881749446152, + "learning_rate": 1.515957446808511e-06, + "loss": 0.2766, + "step": 114 + }, + { + "epoch": 0.0463258785942492, + "grad_norm": 14.252871564540007, + "learning_rate": 1.5425531914893618e-06, + "loss": 0.2696, + "step": 116 + }, + { + "epoch": 0.04712460063897764, + "grad_norm": 4.330757958493644, + "learning_rate": 1.5691489361702128e-06, + "loss": 0.2868, + "step": 118 + }, + { + "epoch": 0.04792332268370607, + "grad_norm": 5.144959414872726, + "learning_rate": 1.595744680851064e-06, + "loss": 0.2985, + "step": 120 + }, + { + "epoch": 0.048722044728434506, + "grad_norm": 13.510588438287128, + "learning_rate": 1.622340425531915e-06, + "loss": 0.2903, + "step": 122 + }, + { + "epoch": 0.04952076677316294, + "grad_norm": 9.984785828599879, + "learning_rate": 1.648936170212766e-06, + "loss": 0.3019, + "step": 124 + }, + { + "epoch": 0.050319488817891375, + "grad_norm": 11.150832562553619, + "learning_rate": 1.6755319148936172e-06, + "loss": 0.271, + "step": 126 + }, + { + "epoch": 0.051118210862619806, + "grad_norm": 14.058578339810987, + "learning_rate": 1.7021276595744682e-06, + "loss": 0.273, + "step": 128 + }, + { + "epoch": 0.051916932907348244, + "grad_norm": 5.00151816005258, + "learning_rate": 1.7287234042553195e-06, + "loss": 0.2607, + "step": 130 + }, + { + "epoch": 0.052715654952076675, + "grad_norm": 15.004005157673818, + "learning_rate": 1.7553191489361704e-06, + "loss": 0.2654, + "step": 132 + }, + { + "epoch": 0.05351437699680511, + "grad_norm": 7.002483899819058, + "learning_rate": 1.7819148936170213e-06, + "loss": 0.2623, + "step": 134 + }, + { + "epoch": 0.054313099041533544, + "grad_norm": 3.9841361364612062, + "learning_rate": 1.8085106382978727e-06, + "loss": 0.2934, + "step": 136 + }, + { + "epoch": 0.05511182108626198, + "grad_norm": 3.8494401246050054, + "learning_rate": 1.8351063829787236e-06, + "loss": 0.274, + "step": 138 + }, + { + "epoch": 0.05591054313099041, + "grad_norm": 5.497525399265219, + "learning_rate": 1.8617021276595745e-06, + "loss": 0.2647, + "step": 140 + }, + { + "epoch": 0.05670926517571885, + "grad_norm": 4.980587322498712, + "learning_rate": 1.8882978723404258e-06, + "loss": 0.273, + "step": 142 + }, + { + "epoch": 0.05750798722044728, + "grad_norm": 3.7891589019522653, + "learning_rate": 1.9148936170212767e-06, + "loss": 0.2517, + "step": 144 + }, + { + "epoch": 0.05830670926517572, + "grad_norm": 4.92828775714686, + "learning_rate": 1.941489361702128e-06, + "loss": 0.2704, + "step": 146 + }, + { + "epoch": 0.05910543130990415, + "grad_norm": 4.103755865538166, + "learning_rate": 1.968085106382979e-06, + "loss": 0.2591, + "step": 148 + }, + { + "epoch": 0.05990415335463259, + "grad_norm": 3.435413585690327, + "learning_rate": 1.99468085106383e-06, + "loss": 0.2509, + "step": 150 + }, + { + "epoch": 0.06070287539936102, + "grad_norm": 4.277721208929287, + "learning_rate": 2.021276595744681e-06, + "loss": 0.2584, + "step": 152 + }, + { + "epoch": 0.06150159744408946, + "grad_norm": 3.799249236289544, + "learning_rate": 2.047872340425532e-06, + "loss": 0.2589, + "step": 154 + }, + { + "epoch": 0.06230031948881789, + "grad_norm": 3.7296853670347825, + "learning_rate": 2.074468085106383e-06, + "loss": 0.2567, + "step": 156 + }, + { + "epoch": 0.06309904153354633, + "grad_norm": 4.512870491032634, + "learning_rate": 2.101063829787234e-06, + "loss": 0.2481, + "step": 158 + }, + { + "epoch": 0.06389776357827476, + "grad_norm": 9.719375176261458, + "learning_rate": 2.1276595744680853e-06, + "loss": 0.24, + "step": 160 + }, + { + "epoch": 0.06469648562300319, + "grad_norm": 5.994224591000894, + "learning_rate": 2.1542553191489364e-06, + "loss": 0.245, + "step": 162 + }, + { + "epoch": 0.06549520766773163, + "grad_norm": 5.4415526514767025, + "learning_rate": 2.1808510638297876e-06, + "loss": 0.2653, + "step": 164 + }, + { + "epoch": 0.06629392971246006, + "grad_norm": 5.939080958451638, + "learning_rate": 2.2074468085106387e-06, + "loss": 0.2472, + "step": 166 + }, + { + "epoch": 0.0670926517571885, + "grad_norm": 5.160493886252985, + "learning_rate": 2.2340425531914894e-06, + "loss": 0.2471, + "step": 168 + }, + { + "epoch": 0.06789137380191693, + "grad_norm": 3.652646803021308, + "learning_rate": 2.2606382978723405e-06, + "loss": 0.2324, + "step": 170 + }, + { + "epoch": 0.06869009584664537, + "grad_norm": 3.8466095536871343, + "learning_rate": 2.2872340425531916e-06, + "loss": 0.2452, + "step": 172 + }, + { + "epoch": 0.0694888178913738, + "grad_norm": 3.3542675856152537, + "learning_rate": 2.3138297872340428e-06, + "loss": 0.2419, + "step": 174 + }, + { + "epoch": 0.07028753993610223, + "grad_norm": 4.481843921006337, + "learning_rate": 2.340425531914894e-06, + "loss": 0.2451, + "step": 176 + }, + { + "epoch": 0.07108626198083066, + "grad_norm": 3.03610682279202, + "learning_rate": 2.367021276595745e-06, + "loss": 0.2183, + "step": 178 + }, + { + "epoch": 0.07188498402555911, + "grad_norm": 3.1175806494594482, + "learning_rate": 2.393617021276596e-06, + "loss": 0.2349, + "step": 180 + }, + { + "epoch": 0.07268370607028754, + "grad_norm": 3.4236470265390033, + "learning_rate": 2.420212765957447e-06, + "loss": 0.2406, + "step": 182 + }, + { + "epoch": 0.07348242811501597, + "grad_norm": 3.604740428294787, + "learning_rate": 2.446808510638298e-06, + "loss": 0.2349, + "step": 184 + }, + { + "epoch": 0.0742811501597444, + "grad_norm": 3.255303266074429, + "learning_rate": 2.473404255319149e-06, + "loss": 0.2402, + "step": 186 + }, + { + "epoch": 0.07507987220447285, + "grad_norm": 3.030059130242592, + "learning_rate": 2.5e-06, + "loss": 0.2233, + "step": 188 + }, + { + "epoch": 0.07587859424920128, + "grad_norm": 5.396885260460128, + "learning_rate": 2.5265957446808513e-06, + "loss": 0.2264, + "step": 190 + }, + { + "epoch": 0.07667731629392971, + "grad_norm": 3.4037128475892384, + "learning_rate": 2.553191489361702e-06, + "loss": 0.2266, + "step": 192 + }, + { + "epoch": 0.07747603833865814, + "grad_norm": 3.540697871576238, + "learning_rate": 2.5797872340425536e-06, + "loss": 0.2302, + "step": 194 + }, + { + "epoch": 0.07827476038338659, + "grad_norm": 2.8155061709596616, + "learning_rate": 2.6063829787234047e-06, + "loss": 0.2442, + "step": 196 + }, + { + "epoch": 0.07907348242811502, + "grad_norm": 2.9188721708463885, + "learning_rate": 2.6329787234042554e-06, + "loss": 0.2261, + "step": 198 + }, + { + "epoch": 0.07987220447284345, + "grad_norm": 2.9114746100952598, + "learning_rate": 2.6595744680851065e-06, + "loss": 0.2287, + "step": 200 + }, + { + "epoch": 0.08067092651757188, + "grad_norm": 3.2180252275052257, + "learning_rate": 2.6861702127659577e-06, + "loss": 0.2187, + "step": 202 + }, + { + "epoch": 0.08146964856230032, + "grad_norm": 2.8873639429163314, + "learning_rate": 2.7127659574468084e-06, + "loss": 0.2158, + "step": 204 + }, + { + "epoch": 0.08226837060702875, + "grad_norm": 3.489228955679207, + "learning_rate": 2.73936170212766e-06, + "loss": 0.2258, + "step": 206 + }, + { + "epoch": 0.08306709265175719, + "grad_norm": 2.981624517929962, + "learning_rate": 2.765957446808511e-06, + "loss": 0.2108, + "step": 208 + }, + { + "epoch": 0.08386581469648563, + "grad_norm": 3.21155859005065, + "learning_rate": 2.7925531914893617e-06, + "loss": 0.216, + "step": 210 + }, + { + "epoch": 0.08466453674121406, + "grad_norm": 3.0096285120757935, + "learning_rate": 2.819148936170213e-06, + "loss": 0.2248, + "step": 212 + }, + { + "epoch": 0.08546325878594249, + "grad_norm": 2.9629033272723735, + "learning_rate": 2.845744680851064e-06, + "loss": 0.2319, + "step": 214 + }, + { + "epoch": 0.08626198083067092, + "grad_norm": 3.1407766370777646, + "learning_rate": 2.8723404255319155e-06, + "loss": 0.2083, + "step": 216 + }, + { + "epoch": 0.08706070287539937, + "grad_norm": 2.987122511363789, + "learning_rate": 2.8989361702127662e-06, + "loss": 0.2169, + "step": 218 + }, + { + "epoch": 0.0878594249201278, + "grad_norm": 3.0880793923727063, + "learning_rate": 2.9255319148936174e-06, + "loss": 0.2103, + "step": 220 + }, + { + "epoch": 0.08865814696485623, + "grad_norm": 2.7280690541697075, + "learning_rate": 2.9521276595744685e-06, + "loss": 0.1997, + "step": 222 + }, + { + "epoch": 0.08945686900958466, + "grad_norm": 4.457072529565402, + "learning_rate": 2.978723404255319e-06, + "loss": 0.212, + "step": 224 + }, + { + "epoch": 0.0902555910543131, + "grad_norm": 2.6269741757814176, + "learning_rate": 3.0053191489361703e-06, + "loss": 0.2092, + "step": 226 + }, + { + "epoch": 0.09105431309904154, + "grad_norm": 6.22452274606642, + "learning_rate": 3.031914893617022e-06, + "loss": 0.2049, + "step": 228 + }, + { + "epoch": 0.09185303514376997, + "grad_norm": 2.877291306172186, + "learning_rate": 3.0585106382978726e-06, + "loss": 0.2083, + "step": 230 + }, + { + "epoch": 0.0926517571884984, + "grad_norm": 5.1194444563249935, + "learning_rate": 3.0851063829787237e-06, + "loss": 0.2054, + "step": 232 + }, + { + "epoch": 0.09345047923322684, + "grad_norm": 6.2843799815917745, + "learning_rate": 3.111702127659575e-06, + "loss": 0.2194, + "step": 234 + }, + { + "epoch": 0.09424920127795527, + "grad_norm": 5.861687233187676, + "learning_rate": 3.1382978723404255e-06, + "loss": 0.2284, + "step": 236 + }, + { + "epoch": 0.0950479233226837, + "grad_norm": 3.213879356151013, + "learning_rate": 3.164893617021277e-06, + "loss": 0.2207, + "step": 238 + }, + { + "epoch": 0.09584664536741214, + "grad_norm": 3.1630850214772255, + "learning_rate": 3.191489361702128e-06, + "loss": 0.2066, + "step": 240 + }, + { + "epoch": 0.09664536741214058, + "grad_norm": 3.148239375316538, + "learning_rate": 3.218085106382979e-06, + "loss": 0.2212, + "step": 242 + }, + { + "epoch": 0.09744408945686901, + "grad_norm": 2.508025815693479, + "learning_rate": 3.24468085106383e-06, + "loss": 0.2054, + "step": 244 + }, + { + "epoch": 0.09824281150159744, + "grad_norm": 2.909768854428482, + "learning_rate": 3.271276595744681e-06, + "loss": 0.2186, + "step": 246 + }, + { + "epoch": 0.09904153354632587, + "grad_norm": 7.665173774329105, + "learning_rate": 3.297872340425532e-06, + "loss": 0.2228, + "step": 248 + }, + { + "epoch": 0.09984025559105432, + "grad_norm": 2.253729407128243, + "learning_rate": 3.3244680851063834e-06, + "loss": 0.2049, + "step": 250 + }, + { + "epoch": 0.10063897763578275, + "grad_norm": 3.8706889996285336, + "learning_rate": 3.3510638297872345e-06, + "loss": 0.2259, + "step": 252 + }, + { + "epoch": 0.10143769968051118, + "grad_norm": 3.5646371999914357, + "learning_rate": 3.377659574468085e-06, + "loss": 0.1859, + "step": 254 + }, + { + "epoch": 0.10223642172523961, + "grad_norm": 2.7753062530577597, + "learning_rate": 3.4042553191489363e-06, + "loss": 0.2166, + "step": 256 + }, + { + "epoch": 0.10303514376996806, + "grad_norm": 3.0863522747098022, + "learning_rate": 3.4308510638297874e-06, + "loss": 0.2346, + "step": 258 + }, + { + "epoch": 0.10383386581469649, + "grad_norm": 2.7019568800833897, + "learning_rate": 3.457446808510639e-06, + "loss": 0.2031, + "step": 260 + }, + { + "epoch": 0.10463258785942492, + "grad_norm": 2.297946892414017, + "learning_rate": 3.4840425531914897e-06, + "loss": 0.1934, + "step": 262 + }, + { + "epoch": 0.10543130990415335, + "grad_norm": 2.714689444342462, + "learning_rate": 3.510638297872341e-06, + "loss": 0.2191, + "step": 264 + }, + { + "epoch": 0.1062300319488818, + "grad_norm": 2.2968600529073098, + "learning_rate": 3.537234042553192e-06, + "loss": 0.2078, + "step": 266 + }, + { + "epoch": 0.10702875399361023, + "grad_norm": 2.740039735249897, + "learning_rate": 3.5638297872340426e-06, + "loss": 0.2196, + "step": 268 + }, + { + "epoch": 0.10782747603833866, + "grad_norm": 2.4769261724935423, + "learning_rate": 3.5904255319148938e-06, + "loss": 0.2217, + "step": 270 + }, + { + "epoch": 0.10862619808306709, + "grad_norm": 2.3980460329324713, + "learning_rate": 3.6170212765957453e-06, + "loss": 0.2174, + "step": 272 + }, + { + "epoch": 0.10942492012779553, + "grad_norm": 2.350186883116406, + "learning_rate": 3.643617021276596e-06, + "loss": 0.2164, + "step": 274 + }, + { + "epoch": 0.11022364217252396, + "grad_norm": 2.3070632701092, + "learning_rate": 3.670212765957447e-06, + "loss": 0.2058, + "step": 276 + }, + { + "epoch": 0.1110223642172524, + "grad_norm": 2.5876090442770914, + "learning_rate": 3.6968085106382983e-06, + "loss": 0.2102, + "step": 278 + }, + { + "epoch": 0.11182108626198083, + "grad_norm": 2.527498495500664, + "learning_rate": 3.723404255319149e-06, + "loss": 0.1878, + "step": 280 + }, + { + "epoch": 0.11261980830670927, + "grad_norm": 3.1408931185668734, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.2028, + "step": 282 + }, + { + "epoch": 0.1134185303514377, + "grad_norm": 2.438660009024771, + "learning_rate": 3.7765957446808516e-06, + "loss": 0.198, + "step": 284 + }, + { + "epoch": 0.11421725239616613, + "grad_norm": 3.163227332011823, + "learning_rate": 3.8031914893617023e-06, + "loss": 0.2143, + "step": 286 + }, + { + "epoch": 0.11501597444089456, + "grad_norm": 2.62548659237651, + "learning_rate": 3.8297872340425535e-06, + "loss": 0.2129, + "step": 288 + }, + { + "epoch": 0.11581469648562301, + "grad_norm": 4.68249530525775, + "learning_rate": 3.856382978723405e-06, + "loss": 0.211, + "step": 290 + }, + { + "epoch": 0.11661341853035144, + "grad_norm": 2.5096724167776387, + "learning_rate": 3.882978723404256e-06, + "loss": 0.1901, + "step": 292 + }, + { + "epoch": 0.11741214057507987, + "grad_norm": 2.172894520215211, + "learning_rate": 3.909574468085106e-06, + "loss": 0.1936, + "step": 294 + }, + { + "epoch": 0.1182108626198083, + "grad_norm": 5.3196819233421495, + "learning_rate": 3.936170212765958e-06, + "loss": 0.2104, + "step": 296 + }, + { + "epoch": 0.11900958466453675, + "grad_norm": 2.4110105936218904, + "learning_rate": 3.962765957446809e-06, + "loss": 0.2143, + "step": 298 + }, + { + "epoch": 0.11980830670926518, + "grad_norm": 2.4033618282750724, + "learning_rate": 3.98936170212766e-06, + "loss": 0.2164, + "step": 300 + }, + { + "epoch": 0.12060702875399361, + "grad_norm": 3.457118213267144, + "learning_rate": 4.015957446808511e-06, + "loss": 0.2032, + "step": 302 + }, + { + "epoch": 0.12140575079872204, + "grad_norm": 2.408758592662987, + "learning_rate": 4.042553191489362e-06, + "loss": 0.2115, + "step": 304 + }, + { + "epoch": 0.12220447284345048, + "grad_norm": 2.59506801145671, + "learning_rate": 4.069148936170213e-06, + "loss": 0.2214, + "step": 306 + }, + { + "epoch": 0.12300319488817892, + "grad_norm": 3.0414661937486933, + "learning_rate": 4.095744680851064e-06, + "loss": 0.2088, + "step": 308 + }, + { + "epoch": 0.12380191693290735, + "grad_norm": 3.647758244704823, + "learning_rate": 4.122340425531915e-06, + "loss": 0.2198, + "step": 310 + }, + { + "epoch": 0.12460063897763578, + "grad_norm": 2.67698633677868, + "learning_rate": 4.148936170212766e-06, + "loss": 0.2252, + "step": 312 + }, + { + "epoch": 0.1253993610223642, + "grad_norm": 2.898397178237324, + "learning_rate": 4.175531914893618e-06, + "loss": 0.2181, + "step": 314 + }, + { + "epoch": 0.12619808306709265, + "grad_norm": 2.6840679225522526, + "learning_rate": 4.202127659574468e-06, + "loss": 0.1992, + "step": 316 + }, + { + "epoch": 0.1269968051118211, + "grad_norm": 3.1572391222603886, + "learning_rate": 4.228723404255319e-06, + "loss": 0.1952, + "step": 318 + }, + { + "epoch": 0.12779552715654952, + "grad_norm": 3.5678297945143873, + "learning_rate": 4.255319148936171e-06, + "loss": 0.2083, + "step": 320 + }, + { + "epoch": 0.12859424920127796, + "grad_norm": 2.645675379463557, + "learning_rate": 4.281914893617022e-06, + "loss": 0.2196, + "step": 322 + }, + { + "epoch": 0.12939297124600638, + "grad_norm": 3.2806373380609672, + "learning_rate": 4.308510638297873e-06, + "loss": 0.2044, + "step": 324 + }, + { + "epoch": 0.13019169329073482, + "grad_norm": 3.877339337802895, + "learning_rate": 4.3351063829787236e-06, + "loss": 0.2188, + "step": 326 + }, + { + "epoch": 0.13099041533546327, + "grad_norm": 2.855122134235383, + "learning_rate": 4.361702127659575e-06, + "loss": 0.2178, + "step": 328 + }, + { + "epoch": 0.13178913738019168, + "grad_norm": 2.2492003663786764, + "learning_rate": 4.388297872340426e-06, + "loss": 0.2113, + "step": 330 + }, + { + "epoch": 0.13258785942492013, + "grad_norm": 2.3120507711054166, + "learning_rate": 4.414893617021277e-06, + "loss": 0.2162, + "step": 332 + }, + { + "epoch": 0.13338658146964857, + "grad_norm": 2.306824843021891, + "learning_rate": 4.441489361702128e-06, + "loss": 0.1951, + "step": 334 + }, + { + "epoch": 0.134185303514377, + "grad_norm": 2.4562848392795424, + "learning_rate": 4.468085106382979e-06, + "loss": 0.2091, + "step": 336 + }, + { + "epoch": 0.13498402555910544, + "grad_norm": 2.327058395343108, + "learning_rate": 4.49468085106383e-06, + "loss": 0.1936, + "step": 338 + }, + { + "epoch": 0.13578274760383385, + "grad_norm": 3.2365595394106377, + "learning_rate": 4.521276595744681e-06, + "loss": 0.2076, + "step": 340 + }, + { + "epoch": 0.1365814696485623, + "grad_norm": 2.3773207333074797, + "learning_rate": 4.547872340425532e-06, + "loss": 0.215, + "step": 342 + }, + { + "epoch": 0.13738019169329074, + "grad_norm": 2.588667676878435, + "learning_rate": 4.574468085106383e-06, + "loss": 0.2142, + "step": 344 + }, + { + "epoch": 0.13817891373801916, + "grad_norm": 1.9922157018157862, + "learning_rate": 4.601063829787235e-06, + "loss": 0.188, + "step": 346 + }, + { + "epoch": 0.1389776357827476, + "grad_norm": 1.9986841603432328, + "learning_rate": 4.6276595744680855e-06, + "loss": 0.2123, + "step": 348 + }, + { + "epoch": 0.13977635782747605, + "grad_norm": 3.8191377619872346, + "learning_rate": 4.654255319148936e-06, + "loss": 0.1909, + "step": 350 + }, + { + "epoch": 0.14057507987220447, + "grad_norm": 2.707801265704851, + "learning_rate": 4.680851063829788e-06, + "loss": 0.1804, + "step": 352 + }, + { + "epoch": 0.1413738019169329, + "grad_norm": 2.3382091629044264, + "learning_rate": 4.707446808510639e-06, + "loss": 0.2268, + "step": 354 + }, + { + "epoch": 0.14217252396166133, + "grad_norm": 2.4107325950799963, + "learning_rate": 4.73404255319149e-06, + "loss": 0.204, + "step": 356 + }, + { + "epoch": 0.14297124600638977, + "grad_norm": 2.1678512268790944, + "learning_rate": 4.760638297872341e-06, + "loss": 0.2078, + "step": 358 + }, + { + "epoch": 0.14376996805111822, + "grad_norm": 2.028736424354992, + "learning_rate": 4.787234042553192e-06, + "loss": 0.2091, + "step": 360 + }, + { + "epoch": 0.14456869009584664, + "grad_norm": 2.396579988473481, + "learning_rate": 4.813829787234043e-06, + "loss": 0.2045, + "step": 362 + }, + { + "epoch": 0.14536741214057508, + "grad_norm": 2.3056223101352162, + "learning_rate": 4.840425531914894e-06, + "loss": 0.2012, + "step": 364 + }, + { + "epoch": 0.14616613418530353, + "grad_norm": 2.1325793827510964, + "learning_rate": 4.867021276595745e-06, + "loss": 0.1997, + "step": 366 + }, + { + "epoch": 0.14696485623003194, + "grad_norm": 2.2181283979891213, + "learning_rate": 4.893617021276596e-06, + "loss": 0.2216, + "step": 368 + }, + { + "epoch": 0.1477635782747604, + "grad_norm": 2.2520136230085988, + "learning_rate": 4.9202127659574475e-06, + "loss": 0.2093, + "step": 370 + }, + { + "epoch": 0.1485623003194888, + "grad_norm": 2.3640436529911817, + "learning_rate": 4.946808510638298e-06, + "loss": 0.2169, + "step": 372 + }, + { + "epoch": 0.14936102236421725, + "grad_norm": 2.3535553790924952, + "learning_rate": 4.973404255319149e-06, + "loss": 0.2098, + "step": 374 + }, + { + "epoch": 0.1501597444089457, + "grad_norm": 1.8756114301592517, + "learning_rate": 5e-06, + "loss": 0.1986, + "step": 376 + }, + { + "epoch": 0.1509584664536741, + "grad_norm": 2.292659355977137, + "learning_rate": 5.026595744680851e-06, + "loss": 0.2046, + "step": 378 + }, + { + "epoch": 0.15175718849840256, + "grad_norm": 2.6775913216468994, + "learning_rate": 5.053191489361703e-06, + "loss": 0.2376, + "step": 380 + }, + { + "epoch": 0.152555910543131, + "grad_norm": 2.140742604355663, + "learning_rate": 5.079787234042553e-06, + "loss": 0.2076, + "step": 382 + }, + { + "epoch": 0.15335463258785942, + "grad_norm": 2.2577220617248983, + "learning_rate": 5.106382978723404e-06, + "loss": 0.1857, + "step": 384 + }, + { + "epoch": 0.15415335463258786, + "grad_norm": 2.322529114683112, + "learning_rate": 5.1329787234042565e-06, + "loss": 0.2252, + "step": 386 + }, + { + "epoch": 0.15495207667731628, + "grad_norm": 2.174339578110931, + "learning_rate": 5.159574468085107e-06, + "loss": 0.1859, + "step": 388 + }, + { + "epoch": 0.15575079872204473, + "grad_norm": 2.6754457483199996, + "learning_rate": 5.186170212765958e-06, + "loss": 0.2134, + "step": 390 + }, + { + "epoch": 0.15654952076677317, + "grad_norm": 2.5869207840676287, + "learning_rate": 5.212765957446809e-06, + "loss": 0.2313, + "step": 392 + }, + { + "epoch": 0.1573482428115016, + "grad_norm": 2.2594133874239555, + "learning_rate": 5.23936170212766e-06, + "loss": 0.2063, + "step": 394 + }, + { + "epoch": 0.15814696485623003, + "grad_norm": 2.3216671883603532, + "learning_rate": 5.265957446808511e-06, + "loss": 0.2068, + "step": 396 + }, + { + "epoch": 0.15894568690095848, + "grad_norm": 2.2695053056973014, + "learning_rate": 5.292553191489362e-06, + "loss": 0.1911, + "step": 398 + }, + { + "epoch": 0.1597444089456869, + "grad_norm": 2.2393947266924474, + "learning_rate": 5.319148936170213e-06, + "loss": 0.2134, + "step": 400 + }, + { + "epoch": 0.16054313099041534, + "grad_norm": 2.3780064899990223, + "learning_rate": 5.345744680851064e-06, + "loss": 0.1861, + "step": 402 + }, + { + "epoch": 0.16134185303514376, + "grad_norm": 2.0639877092575456, + "learning_rate": 5.372340425531915e-06, + "loss": 0.1934, + "step": 404 + }, + { + "epoch": 0.1621405750798722, + "grad_norm": 2.3834910258697883, + "learning_rate": 5.398936170212766e-06, + "loss": 0.2268, + "step": 406 + }, + { + "epoch": 0.16293929712460065, + "grad_norm": 2.1443375139668253, + "learning_rate": 5.425531914893617e-06, + "loss": 0.1974, + "step": 408 + }, + { + "epoch": 0.16373801916932906, + "grad_norm": 2.1691183471396323, + "learning_rate": 5.452127659574469e-06, + "loss": 0.1985, + "step": 410 + }, + { + "epoch": 0.1645367412140575, + "grad_norm": 2.4848574597065904, + "learning_rate": 5.47872340425532e-06, + "loss": 0.2127, + "step": 412 + }, + { + "epoch": 0.16533546325878595, + "grad_norm": 2.3579622949897723, + "learning_rate": 5.5053191489361705e-06, + "loss": 0.2174, + "step": 414 + }, + { + "epoch": 0.16613418530351437, + "grad_norm": 2.070124617271312, + "learning_rate": 5.531914893617022e-06, + "loss": 0.1862, + "step": 416 + }, + { + "epoch": 0.16693290734824281, + "grad_norm": 2.1267517696375213, + "learning_rate": 5.558510638297873e-06, + "loss": 0.2201, + "step": 418 + }, + { + "epoch": 0.16773162939297126, + "grad_norm": 2.7521587042179134, + "learning_rate": 5.5851063829787235e-06, + "loss": 0.1829, + "step": 420 + }, + { + "epoch": 0.16853035143769968, + "grad_norm": 2.4493518149473106, + "learning_rate": 5.611702127659575e-06, + "loss": 0.2229, + "step": 422 + }, + { + "epoch": 0.16932907348242812, + "grad_norm": 2.248345922944704, + "learning_rate": 5.638297872340426e-06, + "loss": 0.1946, + "step": 424 + }, + { + "epoch": 0.17012779552715654, + "grad_norm": 2.0487939416257763, + "learning_rate": 5.664893617021277e-06, + "loss": 0.2045, + "step": 426 + }, + { + "epoch": 0.17092651757188498, + "grad_norm": 2.2646018633860696, + "learning_rate": 5.691489361702128e-06, + "loss": 0.2146, + "step": 428 + }, + { + "epoch": 0.17172523961661343, + "grad_norm": 1.8960573449253009, + "learning_rate": 5.718085106382979e-06, + "loss": 0.2079, + "step": 430 + }, + { + "epoch": 0.17252396166134185, + "grad_norm": 2.176740909319673, + "learning_rate": 5.744680851063831e-06, + "loss": 0.21, + "step": 432 + }, + { + "epoch": 0.1733226837060703, + "grad_norm": 2.1430161168845987, + "learning_rate": 5.771276595744682e-06, + "loss": 0.2119, + "step": 434 + }, + { + "epoch": 0.17412140575079874, + "grad_norm": 1.9672794318914277, + "learning_rate": 5.7978723404255325e-06, + "loss": 0.2103, + "step": 436 + }, + { + "epoch": 0.17492012779552715, + "grad_norm": 2.1725129238295904, + "learning_rate": 5.824468085106384e-06, + "loss": 0.2097, + "step": 438 + }, + { + "epoch": 0.1757188498402556, + "grad_norm": 1.9320076776263606, + "learning_rate": 5.851063829787235e-06, + "loss": 0.2141, + "step": 440 + }, + { + "epoch": 0.17651757188498401, + "grad_norm": 2.0107708483969544, + "learning_rate": 5.877659574468085e-06, + "loss": 0.2138, + "step": 442 + }, + { + "epoch": 0.17731629392971246, + "grad_norm": 1.9692652691440204, + "learning_rate": 5.904255319148937e-06, + "loss": 0.1874, + "step": 444 + }, + { + "epoch": 0.1781150159744409, + "grad_norm": 2.1222862804558305, + "learning_rate": 5.930851063829788e-06, + "loss": 0.1875, + "step": 446 + }, + { + "epoch": 0.17891373801916932, + "grad_norm": 2.0073449512509414, + "learning_rate": 5.957446808510638e-06, + "loss": 0.2073, + "step": 448 + }, + { + "epoch": 0.17971246006389777, + "grad_norm": 2.0189150793812027, + "learning_rate": 5.98404255319149e-06, + "loss": 0.1886, + "step": 450 + }, + { + "epoch": 0.1805111821086262, + "grad_norm": 2.327500057066851, + "learning_rate": 6.010638297872341e-06, + "loss": 0.2026, + "step": 452 + }, + { + "epoch": 0.18130990415335463, + "grad_norm": 2.2501232755144183, + "learning_rate": 6.037234042553191e-06, + "loss": 0.2132, + "step": 454 + }, + { + "epoch": 0.18210862619808307, + "grad_norm": 2.317281078409067, + "learning_rate": 6.063829787234044e-06, + "loss": 0.2112, + "step": 456 + }, + { + "epoch": 0.1829073482428115, + "grad_norm": 1.9924372480322023, + "learning_rate": 6.090425531914894e-06, + "loss": 0.2092, + "step": 458 + }, + { + "epoch": 0.18370607028753994, + "grad_norm": 2.2346624173451963, + "learning_rate": 6.117021276595745e-06, + "loss": 0.2144, + "step": 460 + }, + { + "epoch": 0.18450479233226838, + "grad_norm": 2.175934261679526, + "learning_rate": 6.143617021276597e-06, + "loss": 0.1849, + "step": 462 + }, + { + "epoch": 0.1853035143769968, + "grad_norm": 2.266384026702202, + "learning_rate": 6.170212765957447e-06, + "loss": 0.2113, + "step": 464 + }, + { + "epoch": 0.18610223642172524, + "grad_norm": 1.9034790451109018, + "learning_rate": 6.196808510638298e-06, + "loss": 0.2115, + "step": 466 + }, + { + "epoch": 0.1869009584664537, + "grad_norm": 2.2484524626124087, + "learning_rate": 6.22340425531915e-06, + "loss": 0.2069, + "step": 468 + }, + { + "epoch": 0.1876996805111821, + "grad_norm": 2.0092685848128102, + "learning_rate": 6.25e-06, + "loss": 0.2031, + "step": 470 + }, + { + "epoch": 0.18849840255591055, + "grad_norm": 1.870835955521368, + "learning_rate": 6.276595744680851e-06, + "loss": 0.1903, + "step": 472 + }, + { + "epoch": 0.18929712460063897, + "grad_norm": 1.9836945986391599, + "learning_rate": 6.303191489361703e-06, + "loss": 0.2256, + "step": 474 + }, + { + "epoch": 0.1900958466453674, + "grad_norm": 1.8732225695197424, + "learning_rate": 6.329787234042554e-06, + "loss": 0.2, + "step": 476 + }, + { + "epoch": 0.19089456869009586, + "grad_norm": 2.0073497811760856, + "learning_rate": 6.356382978723404e-06, + "loss": 0.1996, + "step": 478 + }, + { + "epoch": 0.19169329073482427, + "grad_norm": 2.1089028896634785, + "learning_rate": 6.382978723404256e-06, + "loss": 0.1705, + "step": 480 + }, + { + "epoch": 0.19249201277955272, + "grad_norm": 2.3296857896927032, + "learning_rate": 6.409574468085107e-06, + "loss": 0.2127, + "step": 482 + }, + { + "epoch": 0.19329073482428116, + "grad_norm": 1.9240064591019737, + "learning_rate": 6.436170212765958e-06, + "loss": 0.2044, + "step": 484 + }, + { + "epoch": 0.19408945686900958, + "grad_norm": 2.0124517600701757, + "learning_rate": 6.462765957446809e-06, + "loss": 0.2149, + "step": 486 + }, + { + "epoch": 0.19488817891373802, + "grad_norm": 2.1954071243853, + "learning_rate": 6.48936170212766e-06, + "loss": 0.2202, + "step": 488 + }, + { + "epoch": 0.19568690095846644, + "grad_norm": 1.9656201249731728, + "learning_rate": 6.515957446808511e-06, + "loss": 0.1946, + "step": 490 + }, + { + "epoch": 0.1964856230031949, + "grad_norm": 1.989066037021515, + "learning_rate": 6.542553191489362e-06, + "loss": 0.2004, + "step": 492 + }, + { + "epoch": 0.19728434504792333, + "grad_norm": 1.872072162374712, + "learning_rate": 6.569148936170213e-06, + "loss": 0.2077, + "step": 494 + }, + { + "epoch": 0.19808306709265175, + "grad_norm": 1.9921049486714641, + "learning_rate": 6.595744680851064e-06, + "loss": 0.1874, + "step": 496 + }, + { + "epoch": 0.1988817891373802, + "grad_norm": 1.7219513015277055, + "learning_rate": 6.622340425531916e-06, + "loss": 0.1821, + "step": 498 + }, + { + "epoch": 0.19968051118210864, + "grad_norm": 2.0749672908831616, + "learning_rate": 6.648936170212767e-06, + "loss": 0.2053, + "step": 500 + }, + { + "epoch": 0.19968051118210864, + "eval_loss": 0.19043219089508057, + "eval_runtime": 420.1871, + "eval_samples_per_second": 42.379, + "eval_steps_per_second": 5.298, + "step": 500 + }, + { + "epoch": 0.20047923322683706, + "grad_norm": 2.037306801330803, + "learning_rate": 6.6755319148936174e-06, + "loss": 0.2176, + "step": 502 + }, + { + "epoch": 0.2012779552715655, + "grad_norm": 1.9203371381017114, + "learning_rate": 6.702127659574469e-06, + "loss": 0.2125, + "step": 504 + }, + { + "epoch": 0.20207667731629392, + "grad_norm": 1.985111227444487, + "learning_rate": 6.72872340425532e-06, + "loss": 0.1973, + "step": 506 + }, + { + "epoch": 0.20287539936102236, + "grad_norm": 1.8951174304875817, + "learning_rate": 6.75531914893617e-06, + "loss": 0.2149, + "step": 508 + }, + { + "epoch": 0.2036741214057508, + "grad_norm": 2.0497514964752748, + "learning_rate": 6.781914893617022e-06, + "loss": 0.1978, + "step": 510 + }, + { + "epoch": 0.20447284345047922, + "grad_norm": 2.0766753850394175, + "learning_rate": 6.808510638297873e-06, + "loss": 0.2375, + "step": 512 + }, + { + "epoch": 0.20527156549520767, + "grad_norm": 2.346429408581163, + "learning_rate": 6.835106382978723e-06, + "loss": 0.2194, + "step": 514 + }, + { + "epoch": 0.20607028753993611, + "grad_norm": 2.0826174062149647, + "learning_rate": 6.861702127659575e-06, + "loss": 0.2056, + "step": 516 + }, + { + "epoch": 0.20686900958466453, + "grad_norm": 1.99670732186987, + "learning_rate": 6.888297872340426e-06, + "loss": 0.221, + "step": 518 + }, + { + "epoch": 0.20766773162939298, + "grad_norm": 2.036868735781395, + "learning_rate": 6.914893617021278e-06, + "loss": 0.2165, + "step": 520 + }, + { + "epoch": 0.2084664536741214, + "grad_norm": 1.8657555178307903, + "learning_rate": 6.941489361702129e-06, + "loss": 0.2146, + "step": 522 + }, + { + "epoch": 0.20926517571884984, + "grad_norm": 1.9998386354651805, + "learning_rate": 6.968085106382979e-06, + "loss": 0.2165, + "step": 524 + }, + { + "epoch": 0.21006389776357828, + "grad_norm": 1.890648331838507, + "learning_rate": 6.994680851063831e-06, + "loss": 0.2215, + "step": 526 + }, + { + "epoch": 0.2108626198083067, + "grad_norm": 1.8453249030357137, + "learning_rate": 7.021276595744682e-06, + "loss": 0.227, + "step": 528 + }, + { + "epoch": 0.21166134185303515, + "grad_norm": 1.8696171981456626, + "learning_rate": 7.047872340425532e-06, + "loss": 0.202, + "step": 530 + }, + { + "epoch": 0.2124600638977636, + "grad_norm": 1.9343681296268649, + "learning_rate": 7.074468085106384e-06, + "loss": 0.2207, + "step": 532 + }, + { + "epoch": 0.213258785942492, + "grad_norm": 2.004530964431762, + "learning_rate": 7.101063829787235e-06, + "loss": 0.2024, + "step": 534 + }, + { + "epoch": 0.21405750798722045, + "grad_norm": 1.9649699331241137, + "learning_rate": 7.127659574468085e-06, + "loss": 0.201, + "step": 536 + }, + { + "epoch": 0.21485623003194887, + "grad_norm": 1.998147156306178, + "learning_rate": 7.154255319148937e-06, + "loss": 0.1925, + "step": 538 + }, + { + "epoch": 0.21565495207667731, + "grad_norm": 1.872474740663219, + "learning_rate": 7.1808510638297875e-06, + "loss": 0.1979, + "step": 540 + }, + { + "epoch": 0.21645367412140576, + "grad_norm": 2.0711681755729634, + "learning_rate": 7.207446808510638e-06, + "loss": 0.2468, + "step": 542 + }, + { + "epoch": 0.21725239616613418, + "grad_norm": 1.9701565676595827, + "learning_rate": 7.234042553191491e-06, + "loss": 0.2092, + "step": 544 + }, + { + "epoch": 0.21805111821086262, + "grad_norm": 2.0764550423720927, + "learning_rate": 7.260638297872341e-06, + "loss": 0.2148, + "step": 546 + }, + { + "epoch": 0.21884984025559107, + "grad_norm": 3.213288011368614, + "learning_rate": 7.287234042553192e-06, + "loss": 0.2287, + "step": 548 + }, + { + "epoch": 0.21964856230031948, + "grad_norm": 1.8177331310953289, + "learning_rate": 7.313829787234044e-06, + "loss": 0.207, + "step": 550 + }, + { + "epoch": 0.22044728434504793, + "grad_norm": 1.7655366549601414, + "learning_rate": 7.340425531914894e-06, + "loss": 0.2064, + "step": 552 + }, + { + "epoch": 0.22124600638977635, + "grad_norm": 1.9594454321675279, + "learning_rate": 7.367021276595745e-06, + "loss": 0.1922, + "step": 554 + }, + { + "epoch": 0.2220447284345048, + "grad_norm": 2.1733754411582407, + "learning_rate": 7.3936170212765965e-06, + "loss": 0.2266, + "step": 556 + }, + { + "epoch": 0.22284345047923323, + "grad_norm": 1.8248091216623237, + "learning_rate": 7.420212765957447e-06, + "loss": 0.2221, + "step": 558 + }, + { + "epoch": 0.22364217252396165, + "grad_norm": 2.058872918004827, + "learning_rate": 7.446808510638298e-06, + "loss": 0.2225, + "step": 560 + }, + { + "epoch": 0.2244408945686901, + "grad_norm": 1.909102118780366, + "learning_rate": 7.47340425531915e-06, + "loss": 0.1983, + "step": 562 + }, + { + "epoch": 0.22523961661341854, + "grad_norm": 2.0278745632782584, + "learning_rate": 7.500000000000001e-06, + "loss": 0.2076, + "step": 564 + }, + { + "epoch": 0.22603833865814696, + "grad_norm": 1.8175943014340494, + "learning_rate": 7.526595744680851e-06, + "loss": 0.1953, + "step": 566 + }, + { + "epoch": 0.2268370607028754, + "grad_norm": 2.025268083189413, + "learning_rate": 7.553191489361703e-06, + "loss": 0.2338, + "step": 568 + }, + { + "epoch": 0.22763578274760382, + "grad_norm": 1.8856154811157113, + "learning_rate": 7.579787234042554e-06, + "loss": 0.2077, + "step": 570 + }, + { + "epoch": 0.22843450479233227, + "grad_norm": 2.59965148545968, + "learning_rate": 7.606382978723405e-06, + "loss": 0.2057, + "step": 572 + }, + { + "epoch": 0.2292332268370607, + "grad_norm": 1.977278774789199, + "learning_rate": 7.632978723404256e-06, + "loss": 0.2021, + "step": 574 + }, + { + "epoch": 0.23003194888178913, + "grad_norm": 2.1109133002264735, + "learning_rate": 7.659574468085107e-06, + "loss": 0.2156, + "step": 576 + }, + { + "epoch": 0.23083067092651757, + "grad_norm": 2.017717675854075, + "learning_rate": 7.686170212765958e-06, + "loss": 0.2235, + "step": 578 + }, + { + "epoch": 0.23162939297124602, + "grad_norm": 1.8309604789907734, + "learning_rate": 7.71276595744681e-06, + "loss": 0.1801, + "step": 580 + }, + { + "epoch": 0.23242811501597443, + "grad_norm": 1.7699335955775852, + "learning_rate": 7.73936170212766e-06, + "loss": 0.1943, + "step": 582 + }, + { + "epoch": 0.23322683706070288, + "grad_norm": 1.9194267663124567, + "learning_rate": 7.765957446808511e-06, + "loss": 0.2153, + "step": 584 + }, + { + "epoch": 0.2340255591054313, + "grad_norm": 2.097345382192139, + "learning_rate": 7.792553191489362e-06, + "loss": 0.2338, + "step": 586 + }, + { + "epoch": 0.23482428115015974, + "grad_norm": 1.8940063065960397, + "learning_rate": 7.819148936170213e-06, + "loss": 0.2031, + "step": 588 + }, + { + "epoch": 0.2356230031948882, + "grad_norm": 1.9690164598346551, + "learning_rate": 7.845744680851064e-06, + "loss": 0.2118, + "step": 590 + }, + { + "epoch": 0.2364217252396166, + "grad_norm": 3.228651225314339, + "learning_rate": 7.872340425531916e-06, + "loss": 0.2039, + "step": 592 + }, + { + "epoch": 0.23722044728434505, + "grad_norm": 1.9156226006051282, + "learning_rate": 7.898936170212767e-06, + "loss": 0.2252, + "step": 594 + }, + { + "epoch": 0.2380191693290735, + "grad_norm": 1.7768150211461646, + "learning_rate": 7.925531914893617e-06, + "loss": 0.225, + "step": 596 + }, + { + "epoch": 0.2388178913738019, + "grad_norm": 1.9652050222591573, + "learning_rate": 7.95212765957447e-06, + "loss": 0.1965, + "step": 598 + }, + { + "epoch": 0.23961661341853036, + "grad_norm": 2.0572778537541323, + "learning_rate": 7.97872340425532e-06, + "loss": 0.203, + "step": 600 + }, + { + "epoch": 0.24041533546325877, + "grad_norm": 1.9770278124409975, + "learning_rate": 8.005319148936171e-06, + "loss": 0.2106, + "step": 602 + }, + { + "epoch": 0.24121405750798722, + "grad_norm": 1.8557771165334138, + "learning_rate": 8.031914893617022e-06, + "loss": 0.2075, + "step": 604 + }, + { + "epoch": 0.24201277955271566, + "grad_norm": 1.770990487744795, + "learning_rate": 8.058510638297873e-06, + "loss": 0.2138, + "step": 606 + }, + { + "epoch": 0.24281150159744408, + "grad_norm": 1.6450172966793275, + "learning_rate": 8.085106382978723e-06, + "loss": 0.1858, + "step": 608 + }, + { + "epoch": 0.24361022364217252, + "grad_norm": 1.6531257153455905, + "learning_rate": 8.111702127659576e-06, + "loss": 0.2181, + "step": 610 + }, + { + "epoch": 0.24440894568690097, + "grad_norm": 1.7953441983194227, + "learning_rate": 8.138297872340426e-06, + "loss": 0.2168, + "step": 612 + }, + { + "epoch": 0.2452076677316294, + "grad_norm": 1.9624853486169551, + "learning_rate": 8.164893617021277e-06, + "loss": 0.2025, + "step": 614 + }, + { + "epoch": 0.24600638977635783, + "grad_norm": 1.9202333243211456, + "learning_rate": 8.191489361702128e-06, + "loss": 0.2094, + "step": 616 + }, + { + "epoch": 0.24680511182108625, + "grad_norm": 1.9054794785418179, + "learning_rate": 8.218085106382978e-06, + "loss": 0.2287, + "step": 618 + }, + { + "epoch": 0.2476038338658147, + "grad_norm": 1.6738673715101495, + "learning_rate": 8.24468085106383e-06, + "loss": 0.2113, + "step": 620 + }, + { + "epoch": 0.24840255591054314, + "grad_norm": 1.6070602027792467, + "learning_rate": 8.271276595744682e-06, + "loss": 0.1973, + "step": 622 + }, + { + "epoch": 0.24920127795527156, + "grad_norm": 2.0785351155835134, + "learning_rate": 8.297872340425532e-06, + "loss": 0.2166, + "step": 624 + }, + { + "epoch": 0.25, + "grad_norm": 1.7980372006314775, + "learning_rate": 8.324468085106385e-06, + "loss": 0.2113, + "step": 626 + }, + { + "epoch": 0.2507987220447284, + "grad_norm": 2.0745759191840096, + "learning_rate": 8.351063829787235e-06, + "loss": 0.2359, + "step": 628 + }, + { + "epoch": 0.2515974440894569, + "grad_norm": 1.9864922897804826, + "learning_rate": 8.377659574468086e-06, + "loss": 0.2062, + "step": 630 + }, + { + "epoch": 0.2523961661341853, + "grad_norm": 2.030788677329014, + "learning_rate": 8.404255319148937e-06, + "loss": 0.2217, + "step": 632 + }, + { + "epoch": 0.2531948881789137, + "grad_norm": 1.7547740269944636, + "learning_rate": 8.430851063829787e-06, + "loss": 0.2203, + "step": 634 + }, + { + "epoch": 0.2539936102236422, + "grad_norm": 1.7889762015226203, + "learning_rate": 8.457446808510638e-06, + "loss": 0.2165, + "step": 636 + }, + { + "epoch": 0.2547923322683706, + "grad_norm": 1.6288012322878784, + "learning_rate": 8.48404255319149e-06, + "loss": 0.1979, + "step": 638 + }, + { + "epoch": 0.25559105431309903, + "grad_norm": 1.7348141744007637, + "learning_rate": 8.510638297872341e-06, + "loss": 0.1996, + "step": 640 + }, + { + "epoch": 0.2563897763578275, + "grad_norm": 1.7262432044292573, + "learning_rate": 8.537234042553192e-06, + "loss": 0.1901, + "step": 642 + }, + { + "epoch": 0.2571884984025559, + "grad_norm": 1.9327837811354087, + "learning_rate": 8.563829787234044e-06, + "loss": 0.2001, + "step": 644 + }, + { + "epoch": 0.25798722044728434, + "grad_norm": 2.211386389180962, + "learning_rate": 8.590425531914895e-06, + "loss": 0.2213, + "step": 646 + }, + { + "epoch": 0.25878594249201275, + "grad_norm": 1.8243601975607768, + "learning_rate": 8.617021276595746e-06, + "loss": 0.224, + "step": 648 + }, + { + "epoch": 0.2595846645367412, + "grad_norm": 1.932439055129622, + "learning_rate": 8.643617021276596e-06, + "loss": 0.1924, + "step": 650 + }, + { + "epoch": 0.26038338658146964, + "grad_norm": 1.746238022604233, + "learning_rate": 8.670212765957447e-06, + "loss": 0.2149, + "step": 652 + }, + { + "epoch": 0.26118210862619806, + "grad_norm": 1.7613718087296124, + "learning_rate": 8.696808510638298e-06, + "loss": 0.2085, + "step": 654 + }, + { + "epoch": 0.26198083067092653, + "grad_norm": 1.7993776636313457, + "learning_rate": 8.72340425531915e-06, + "loss": 0.2287, + "step": 656 + }, + { + "epoch": 0.26277955271565495, + "grad_norm": 2.6631359329733972, + "learning_rate": 8.750000000000001e-06, + "loss": 0.1972, + "step": 658 + }, + { + "epoch": 0.26357827476038337, + "grad_norm": 2.0244414755737146, + "learning_rate": 8.776595744680852e-06, + "loss": 0.2113, + "step": 660 + }, + { + "epoch": 0.26437699680511184, + "grad_norm": 2.635052210402534, + "learning_rate": 8.803191489361704e-06, + "loss": 0.1862, + "step": 662 + }, + { + "epoch": 0.26517571884984026, + "grad_norm": 5.482936732062825, + "learning_rate": 8.829787234042555e-06, + "loss": 0.2109, + "step": 664 + }, + { + "epoch": 0.2659744408945687, + "grad_norm": 1.8743456888792152, + "learning_rate": 8.856382978723404e-06, + "loss": 0.2045, + "step": 666 + }, + { + "epoch": 0.26677316293929715, + "grad_norm": 1.4845513209949641, + "learning_rate": 8.882978723404256e-06, + "loss": 0.1779, + "step": 668 + }, + { + "epoch": 0.26757188498402557, + "grad_norm": 1.7833087776873404, + "learning_rate": 8.909574468085107e-06, + "loss": 0.1949, + "step": 670 + }, + { + "epoch": 0.268370607028754, + "grad_norm": 1.7237851857753832, + "learning_rate": 8.936170212765958e-06, + "loss": 0.2309, + "step": 672 + }, + { + "epoch": 0.26916932907348246, + "grad_norm": 1.7943976011736698, + "learning_rate": 8.96276595744681e-06, + "loss": 0.2246, + "step": 674 + }, + { + "epoch": 0.26996805111821087, + "grad_norm": 1.6645946104715852, + "learning_rate": 8.98936170212766e-06, + "loss": 0.213, + "step": 676 + }, + { + "epoch": 0.2707667731629393, + "grad_norm": 1.6878179409944127, + "learning_rate": 9.015957446808511e-06, + "loss": 0.2177, + "step": 678 + }, + { + "epoch": 0.2715654952076677, + "grad_norm": 1.6507645045148651, + "learning_rate": 9.042553191489362e-06, + "loss": 0.1924, + "step": 680 + }, + { + "epoch": 0.2723642172523962, + "grad_norm": 1.9095635971119895, + "learning_rate": 9.069148936170213e-06, + "loss": 0.2165, + "step": 682 + }, + { + "epoch": 0.2731629392971246, + "grad_norm": 1.630380756355513, + "learning_rate": 9.095744680851063e-06, + "loss": 0.2158, + "step": 684 + }, + { + "epoch": 0.273961661341853, + "grad_norm": 1.7077429176964645, + "learning_rate": 9.122340425531916e-06, + "loss": 0.2023, + "step": 686 + }, + { + "epoch": 0.2747603833865815, + "grad_norm": 1.7776755211491289, + "learning_rate": 9.148936170212767e-06, + "loss": 0.2179, + "step": 688 + }, + { + "epoch": 0.2755591054313099, + "grad_norm": 1.6954241240154826, + "learning_rate": 9.175531914893617e-06, + "loss": 0.2137, + "step": 690 + }, + { + "epoch": 0.2763578274760383, + "grad_norm": 1.7891304751555133, + "learning_rate": 9.20212765957447e-06, + "loss": 0.2117, + "step": 692 + }, + { + "epoch": 0.2771565495207668, + "grad_norm": 1.9639021184922467, + "learning_rate": 9.22872340425532e-06, + "loss": 0.2372, + "step": 694 + }, + { + "epoch": 0.2779552715654952, + "grad_norm": 1.5797174817631339, + "learning_rate": 9.255319148936171e-06, + "loss": 0.2063, + "step": 696 + }, + { + "epoch": 0.2787539936102236, + "grad_norm": 1.8249112869928803, + "learning_rate": 9.281914893617022e-06, + "loss": 0.2095, + "step": 698 + }, + { + "epoch": 0.2795527156549521, + "grad_norm": 1.7706586811506164, + "learning_rate": 9.308510638297872e-06, + "loss": 0.2073, + "step": 700 + }, + { + "epoch": 0.2803514376996805, + "grad_norm": 1.7271382999203275, + "learning_rate": 9.335106382978723e-06, + "loss": 0.2099, + "step": 702 + }, + { + "epoch": 0.28115015974440893, + "grad_norm": 1.6484304946170987, + "learning_rate": 9.361702127659576e-06, + "loss": 0.2133, + "step": 704 + }, + { + "epoch": 0.2819488817891374, + "grad_norm": 1.724151909486214, + "learning_rate": 9.388297872340426e-06, + "loss": 0.212, + "step": 706 + }, + { + "epoch": 0.2827476038338658, + "grad_norm": 1.5897126511168378, + "learning_rate": 9.414893617021279e-06, + "loss": 0.2, + "step": 708 + }, + { + "epoch": 0.28354632587859424, + "grad_norm": 1.7835429984815048, + "learning_rate": 9.44148936170213e-06, + "loss": 0.2318, + "step": 710 + }, + { + "epoch": 0.28434504792332266, + "grad_norm": 1.7751330791980955, + "learning_rate": 9.46808510638298e-06, + "loss": 0.2316, + "step": 712 + }, + { + "epoch": 0.28514376996805113, + "grad_norm": 1.8933530962247043, + "learning_rate": 9.49468085106383e-06, + "loss": 0.2265, + "step": 714 + }, + { + "epoch": 0.28594249201277955, + "grad_norm": 1.561209719510769, + "learning_rate": 9.521276595744681e-06, + "loss": 0.199, + "step": 716 + }, + { + "epoch": 0.28674121405750796, + "grad_norm": 1.8127384648554525, + "learning_rate": 9.547872340425532e-06, + "loss": 0.213, + "step": 718 + }, + { + "epoch": 0.28753993610223644, + "grad_norm": 3.645955534528657, + "learning_rate": 9.574468085106385e-06, + "loss": 0.2213, + "step": 720 + }, + { + "epoch": 0.28833865814696485, + "grad_norm": 2.2672006586352476, + "learning_rate": 9.601063829787235e-06, + "loss": 0.2139, + "step": 722 + }, + { + "epoch": 0.28913738019169327, + "grad_norm": 1.8519024796984893, + "learning_rate": 9.627659574468086e-06, + "loss": 0.2153, + "step": 724 + }, + { + "epoch": 0.28993610223642174, + "grad_norm": 1.560446824326613, + "learning_rate": 9.654255319148937e-06, + "loss": 0.2023, + "step": 726 + }, + { + "epoch": 0.29073482428115016, + "grad_norm": 1.7699204671345776, + "learning_rate": 9.680851063829787e-06, + "loss": 0.2238, + "step": 728 + }, + { + "epoch": 0.2915335463258786, + "grad_norm": 1.6865554967271952, + "learning_rate": 9.707446808510638e-06, + "loss": 0.2029, + "step": 730 + }, + { + "epoch": 0.29233226837060705, + "grad_norm": 1.6482995869085524, + "learning_rate": 9.73404255319149e-06, + "loss": 0.189, + "step": 732 + }, + { + "epoch": 0.29313099041533547, + "grad_norm": 1.7007866856749894, + "learning_rate": 9.760638297872341e-06, + "loss": 0.2203, + "step": 734 + }, + { + "epoch": 0.2939297124600639, + "grad_norm": 2.0612744928368585, + "learning_rate": 9.787234042553192e-06, + "loss": 0.2011, + "step": 736 + }, + { + "epoch": 0.29472843450479236, + "grad_norm": 1.6349743417964269, + "learning_rate": 9.813829787234044e-06, + "loss": 0.217, + "step": 738 + }, + { + "epoch": 0.2955271565495208, + "grad_norm": 1.6647666932854406, + "learning_rate": 9.840425531914895e-06, + "loss": 0.2083, + "step": 740 + }, + { + "epoch": 0.2963258785942492, + "grad_norm": 1.6704980371536418, + "learning_rate": 9.867021276595746e-06, + "loss": 0.1969, + "step": 742 + }, + { + "epoch": 0.2971246006389776, + "grad_norm": 1.6928260652626874, + "learning_rate": 9.893617021276596e-06, + "loss": 0.1927, + "step": 744 + }, + { + "epoch": 0.2979233226837061, + "grad_norm": 1.6889079212558915, + "learning_rate": 9.920212765957447e-06, + "loss": 0.2108, + "step": 746 + }, + { + "epoch": 0.2987220447284345, + "grad_norm": 1.6008195623226404, + "learning_rate": 9.946808510638298e-06, + "loss": 0.2079, + "step": 748 + }, + { + "epoch": 0.2995207667731629, + "grad_norm": 1.6391899308564928, + "learning_rate": 9.97340425531915e-06, + "loss": 0.224, + "step": 750 + }, + { + "epoch": 0.3003194888178914, + "grad_norm": 1.6579897287838672, + "learning_rate": 1e-05, + "loss": 0.2007, + "step": 752 + }, + { + "epoch": 0.3011182108626198, + "grad_norm": 1.6060266288421388, + "learning_rate": 9.99999784023588e-06, + "loss": 0.1831, + "step": 754 + }, + { + "epoch": 0.3019169329073482, + "grad_norm": 1.5755540572254816, + "learning_rate": 9.999991360945382e-06, + "loss": 0.2366, + "step": 756 + }, + { + "epoch": 0.3027156549520767, + "grad_norm": 1.7099166011335876, + "learning_rate": 9.999980562134104e-06, + "loss": 0.2231, + "step": 758 + }, + { + "epoch": 0.3035143769968051, + "grad_norm": 1.5689663504063338, + "learning_rate": 9.999965443811378e-06, + "loss": 0.1979, + "step": 760 + }, + { + "epoch": 0.30431309904153353, + "grad_norm": 2.1289064244731657, + "learning_rate": 9.999946005990262e-06, + "loss": 0.2284, + "step": 762 + }, + { + "epoch": 0.305111821086262, + "grad_norm": 1.530947535684416, + "learning_rate": 9.99992224868755e-06, + "loss": 0.2067, + "step": 764 + }, + { + "epoch": 0.3059105431309904, + "grad_norm": 1.666888953841592, + "learning_rate": 9.999894171923764e-06, + "loss": 0.2061, + "step": 766 + }, + { + "epoch": 0.30670926517571884, + "grad_norm": 1.5905369733593537, + "learning_rate": 9.999861775723162e-06, + "loss": 0.2047, + "step": 768 + }, + { + "epoch": 0.3075079872204473, + "grad_norm": 1.9351094836921137, + "learning_rate": 9.99982506011373e-06, + "loss": 0.2111, + "step": 770 + }, + { + "epoch": 0.3083067092651757, + "grad_norm": 1.6775809786308205, + "learning_rate": 9.999784025127187e-06, + "loss": 0.2009, + "step": 772 + }, + { + "epoch": 0.30910543130990414, + "grad_norm": 1.6337987957663225, + "learning_rate": 9.999738670798983e-06, + "loss": 0.1965, + "step": 774 + }, + { + "epoch": 0.30990415335463256, + "grad_norm": 1.6853294588250392, + "learning_rate": 9.999688997168301e-06, + "loss": 0.2094, + "step": 776 + }, + { + "epoch": 0.31070287539936103, + "grad_norm": 2.380951431702259, + "learning_rate": 9.999635004278054e-06, + "loss": 0.2131, + "step": 778 + }, + { + "epoch": 0.31150159744408945, + "grad_norm": 1.6821758607705601, + "learning_rate": 9.999576692174887e-06, + "loss": 0.2148, + "step": 780 + }, + { + "epoch": 0.31230031948881787, + "grad_norm": 1.5362646502228101, + "learning_rate": 9.999514060909175e-06, + "loss": 0.2348, + "step": 782 + }, + { + "epoch": 0.31309904153354634, + "grad_norm": 1.6226459616330424, + "learning_rate": 9.999447110535026e-06, + "loss": 0.2031, + "step": 784 + }, + { + "epoch": 0.31389776357827476, + "grad_norm": 1.494547748123698, + "learning_rate": 9.999375841110277e-06, + "loss": 0.2004, + "step": 786 + }, + { + "epoch": 0.3146964856230032, + "grad_norm": 1.800819253841243, + "learning_rate": 9.999300252696502e-06, + "loss": 0.2446, + "step": 788 + }, + { + "epoch": 0.31549520766773165, + "grad_norm": 1.60492245047213, + "learning_rate": 9.999220345359e-06, + "loss": 0.2044, + "step": 790 + }, + { + "epoch": 0.31629392971246006, + "grad_norm": 1.8934326067851825, + "learning_rate": 9.999136119166803e-06, + "loss": 0.2043, + "step": 792 + }, + { + "epoch": 0.3170926517571885, + "grad_norm": 1.5648068073113917, + "learning_rate": 9.999047574192677e-06, + "loss": 0.2089, + "step": 794 + }, + { + "epoch": 0.31789137380191695, + "grad_norm": 1.6312196131131216, + "learning_rate": 9.998954710513113e-06, + "loss": 0.1983, + "step": 796 + }, + { + "epoch": 0.31869009584664537, + "grad_norm": 1.4933485598542207, + "learning_rate": 9.998857528208337e-06, + "loss": 0.2149, + "step": 798 + }, + { + "epoch": 0.3194888178913738, + "grad_norm": 1.7284519491739558, + "learning_rate": 9.998756027362308e-06, + "loss": 0.2049, + "step": 800 + }, + { + "epoch": 0.32028753993610226, + "grad_norm": 1.6062264896600003, + "learning_rate": 9.998650208062713e-06, + "loss": 0.2302, + "step": 802 + }, + { + "epoch": 0.3210862619808307, + "grad_norm": 1.541668593359599, + "learning_rate": 9.998540070400966e-06, + "loss": 0.214, + "step": 804 + }, + { + "epoch": 0.3218849840255591, + "grad_norm": 1.5744953156084605, + "learning_rate": 9.998425614472217e-06, + "loss": 0.2209, + "step": 806 + }, + { + "epoch": 0.3226837060702875, + "grad_norm": 1.5946944416094957, + "learning_rate": 9.99830684037535e-06, + "loss": 0.2209, + "step": 808 + }, + { + "epoch": 0.323482428115016, + "grad_norm": 1.6547243881770919, + "learning_rate": 9.998183748212968e-06, + "loss": 0.2291, + "step": 810 + }, + { + "epoch": 0.3242811501597444, + "grad_norm": 1.5020701296905943, + "learning_rate": 9.998056338091415e-06, + "loss": 0.1947, + "step": 812 + }, + { + "epoch": 0.3250798722044728, + "grad_norm": 1.775646858056913, + "learning_rate": 9.997924610120758e-06, + "loss": 0.2122, + "step": 814 + }, + { + "epoch": 0.3258785942492013, + "grad_norm": 1.9643173710094557, + "learning_rate": 9.9977885644148e-06, + "loss": 0.2304, + "step": 816 + }, + { + "epoch": 0.3266773162939297, + "grad_norm": 2.2694487421961305, + "learning_rate": 9.997648201091073e-06, + "loss": 0.1922, + "step": 818 + }, + { + "epoch": 0.3274760383386581, + "grad_norm": 1.6657139292549983, + "learning_rate": 9.997503520270837e-06, + "loss": 0.2166, + "step": 820 + }, + { + "epoch": 0.3282747603833866, + "grad_norm": 1.419957033481622, + "learning_rate": 9.997354522079078e-06, + "loss": 0.2025, + "step": 822 + }, + { + "epoch": 0.329073482428115, + "grad_norm": 1.499108476748773, + "learning_rate": 9.997201206644522e-06, + "loss": 0.2159, + "step": 824 + }, + { + "epoch": 0.32987220447284343, + "grad_norm": 1.624603961236652, + "learning_rate": 9.997043574099616e-06, + "loss": 0.2147, + "step": 826 + }, + { + "epoch": 0.3306709265175719, + "grad_norm": 1.5313961974893107, + "learning_rate": 9.996881624580542e-06, + "loss": 0.2063, + "step": 828 + }, + { + "epoch": 0.3314696485623003, + "grad_norm": 1.5963165181678973, + "learning_rate": 9.996715358227208e-06, + "loss": 0.2163, + "step": 830 + }, + { + "epoch": 0.33226837060702874, + "grad_norm": 2.5507679267995793, + "learning_rate": 9.99654477518325e-06, + "loss": 0.1961, + "step": 832 + }, + { + "epoch": 0.3330670926517572, + "grad_norm": 1.729152523099962, + "learning_rate": 9.99636987559604e-06, + "loss": 0.2024, + "step": 834 + }, + { + "epoch": 0.33386581469648563, + "grad_norm": 1.7210992603515631, + "learning_rate": 9.99619065961667e-06, + "loss": 0.2252, + "step": 836 + }, + { + "epoch": 0.33466453674121405, + "grad_norm": 1.560277624031283, + "learning_rate": 9.99600712739997e-06, + "loss": 0.2267, + "step": 838 + }, + { + "epoch": 0.3354632587859425, + "grad_norm": 1.4591188372864774, + "learning_rate": 9.995819279104494e-06, + "loss": 0.2011, + "step": 840 + }, + { + "epoch": 0.33626198083067094, + "grad_norm": 1.5041613555668587, + "learning_rate": 9.995627114892522e-06, + "loss": 0.2274, + "step": 842 + }, + { + "epoch": 0.33706070287539935, + "grad_norm": 1.6583699889454577, + "learning_rate": 9.995430634930068e-06, + "loss": 0.1911, + "step": 844 + }, + { + "epoch": 0.33785942492012777, + "grad_norm": 1.4587897948416573, + "learning_rate": 9.99522983938687e-06, + "loss": 0.2237, + "step": 846 + }, + { + "epoch": 0.33865814696485624, + "grad_norm": 1.503331285695136, + "learning_rate": 9.995024728436402e-06, + "loss": 0.2094, + "step": 848 + }, + { + "epoch": 0.33945686900958466, + "grad_norm": 1.535943980164657, + "learning_rate": 9.994815302255854e-06, + "loss": 0.2166, + "step": 850 + }, + { + "epoch": 0.3402555910543131, + "grad_norm": 1.6504775369785527, + "learning_rate": 9.994601561026156e-06, + "loss": 0.2002, + "step": 852 + }, + { + "epoch": 0.34105431309904155, + "grad_norm": 1.4302740240626306, + "learning_rate": 9.994383504931955e-06, + "loss": 0.2082, + "step": 854 + }, + { + "epoch": 0.34185303514376997, + "grad_norm": 1.593656408563002, + "learning_rate": 9.994161134161635e-06, + "loss": 0.2094, + "step": 856 + }, + { + "epoch": 0.3426517571884984, + "grad_norm": 1.3695207582393358, + "learning_rate": 9.9939344489073e-06, + "loss": 0.1919, + "step": 858 + }, + { + "epoch": 0.34345047923322686, + "grad_norm": 1.6060661840602912, + "learning_rate": 9.993703449364787e-06, + "loss": 0.206, + "step": 860 + }, + { + "epoch": 0.3442492012779553, + "grad_norm": 1.4773210324662711, + "learning_rate": 9.993468135733658e-06, + "loss": 0.2192, + "step": 862 + }, + { + "epoch": 0.3450479233226837, + "grad_norm": 1.406665980225023, + "learning_rate": 9.993228508217201e-06, + "loss": 0.1907, + "step": 864 + }, + { + "epoch": 0.34584664536741216, + "grad_norm": 1.5457381119375846, + "learning_rate": 9.99298456702243e-06, + "loss": 0.188, + "step": 866 + }, + { + "epoch": 0.3466453674121406, + "grad_norm": 1.392632813313601, + "learning_rate": 9.99273631236009e-06, + "loss": 0.1814, + "step": 868 + }, + { + "epoch": 0.347444089456869, + "grad_norm": 1.541510486431838, + "learning_rate": 9.992483744444648e-06, + "loss": 0.202, + "step": 870 + }, + { + "epoch": 0.34824281150159747, + "grad_norm": 1.416164840329261, + "learning_rate": 9.9922268634943e-06, + "loss": 0.2034, + "step": 872 + }, + { + "epoch": 0.3490415335463259, + "grad_norm": 1.52233433092975, + "learning_rate": 9.991965669730965e-06, + "loss": 0.2186, + "step": 874 + }, + { + "epoch": 0.3498402555910543, + "grad_norm": 1.59873194009716, + "learning_rate": 9.991700163380292e-06, + "loss": 0.2353, + "step": 876 + }, + { + "epoch": 0.3506389776357827, + "grad_norm": 1.7195541732829867, + "learning_rate": 9.991430344671653e-06, + "loss": 0.2121, + "step": 878 + }, + { + "epoch": 0.3514376996805112, + "grad_norm": 1.6835655527767803, + "learning_rate": 9.991156213838143e-06, + "loss": 0.2048, + "step": 880 + }, + { + "epoch": 0.3522364217252396, + "grad_norm": 1.4417934095264713, + "learning_rate": 9.990877771116588e-06, + "loss": 0.1967, + "step": 882 + }, + { + "epoch": 0.35303514376996803, + "grad_norm": 1.6176474246414694, + "learning_rate": 9.990595016747536e-06, + "loss": 0.203, + "step": 884 + }, + { + "epoch": 0.3538338658146965, + "grad_norm": 1.5007198389997798, + "learning_rate": 9.99030795097526e-06, + "loss": 0.2201, + "step": 886 + }, + { + "epoch": 0.3546325878594249, + "grad_norm": 1.628516285198201, + "learning_rate": 9.990016574047757e-06, + "loss": 0.208, + "step": 888 + }, + { + "epoch": 0.35543130990415334, + "grad_norm": 1.5813428579068682, + "learning_rate": 9.989720886216749e-06, + "loss": 0.2196, + "step": 890 + }, + { + "epoch": 0.3562300319488818, + "grad_norm": 1.5508584713469626, + "learning_rate": 9.989420887737684e-06, + "loss": 0.2145, + "step": 892 + }, + { + "epoch": 0.3570287539936102, + "grad_norm": 1.7163590462789873, + "learning_rate": 9.989116578869732e-06, + "loss": 0.2393, + "step": 894 + }, + { + "epoch": 0.35782747603833864, + "grad_norm": 1.361569590542389, + "learning_rate": 9.988807959875785e-06, + "loss": 0.2064, + "step": 896 + }, + { + "epoch": 0.3586261980830671, + "grad_norm": 1.451447508537209, + "learning_rate": 9.988495031022465e-06, + "loss": 0.204, + "step": 898 + }, + { + "epoch": 0.35942492012779553, + "grad_norm": 1.4487071973493906, + "learning_rate": 9.988177792580107e-06, + "loss": 0.2009, + "step": 900 + }, + { + "epoch": 0.36022364217252395, + "grad_norm": 1.419806864642736, + "learning_rate": 9.98785624482278e-06, + "loss": 0.2024, + "step": 902 + }, + { + "epoch": 0.3610223642172524, + "grad_norm": 1.578737310761318, + "learning_rate": 9.987530388028269e-06, + "loss": 0.2092, + "step": 904 + }, + { + "epoch": 0.36182108626198084, + "grad_norm": 1.6200056976952275, + "learning_rate": 9.987200222478084e-06, + "loss": 0.209, + "step": 906 + }, + { + "epoch": 0.36261980830670926, + "grad_norm": 1.6251607361656684, + "learning_rate": 9.986865748457457e-06, + "loss": 0.2102, + "step": 908 + }, + { + "epoch": 0.3634185303514377, + "grad_norm": 1.60473197937151, + "learning_rate": 9.986526966255341e-06, + "loss": 0.2188, + "step": 910 + }, + { + "epoch": 0.36421725239616615, + "grad_norm": 1.4451319185877254, + "learning_rate": 9.986183876164412e-06, + "loss": 0.211, + "step": 912 + }, + { + "epoch": 0.36501597444089456, + "grad_norm": 1.7222368755921156, + "learning_rate": 9.985836478481069e-06, + "loss": 0.2093, + "step": 914 + }, + { + "epoch": 0.365814696485623, + "grad_norm": 1.5858620117161173, + "learning_rate": 9.98548477350543e-06, + "loss": 0.2128, + "step": 916 + }, + { + "epoch": 0.36661341853035145, + "grad_norm": 1.5923894300594037, + "learning_rate": 9.985128761541334e-06, + "loss": 0.2185, + "step": 918 + }, + { + "epoch": 0.36741214057507987, + "grad_norm": 1.5815229483720405, + "learning_rate": 9.984768442896342e-06, + "loss": 0.232, + "step": 920 + }, + { + "epoch": 0.3682108626198083, + "grad_norm": 1.3902654600389392, + "learning_rate": 9.984403817881736e-06, + "loss": 0.1875, + "step": 922 + }, + { + "epoch": 0.36900958466453676, + "grad_norm": 1.434425802911243, + "learning_rate": 9.984034886812519e-06, + "loss": 0.1891, + "step": 924 + }, + { + "epoch": 0.3698083067092652, + "grad_norm": 1.4965650327837223, + "learning_rate": 9.98366165000741e-06, + "loss": 0.212, + "step": 926 + }, + { + "epoch": 0.3706070287539936, + "grad_norm": 1.5934149301565348, + "learning_rate": 9.983284107788852e-06, + "loss": 0.2081, + "step": 928 + }, + { + "epoch": 0.37140575079872207, + "grad_norm": 1.5649065798161081, + "learning_rate": 9.982902260483003e-06, + "loss": 0.2148, + "step": 930 + }, + { + "epoch": 0.3722044728434505, + "grad_norm": 1.5919056980501831, + "learning_rate": 9.982516108419746e-06, + "loss": 0.2088, + "step": 932 + }, + { + "epoch": 0.3730031948881789, + "grad_norm": 1.3802136089446853, + "learning_rate": 9.982125651932681e-06, + "loss": 0.1826, + "step": 934 + }, + { + "epoch": 0.3738019169329074, + "grad_norm": 1.4882986051122178, + "learning_rate": 9.981730891359123e-06, + "loss": 0.2278, + "step": 936 + }, + { + "epoch": 0.3746006389776358, + "grad_norm": 1.377902572686802, + "learning_rate": 9.981331827040109e-06, + "loss": 0.2011, + "step": 938 + }, + { + "epoch": 0.3753993610223642, + "grad_norm": 1.574021407706636, + "learning_rate": 9.980928459320393e-06, + "loss": 0.2101, + "step": 940 + }, + { + "epoch": 0.3761980830670926, + "grad_norm": 1.4461584644500969, + "learning_rate": 9.980520788548445e-06, + "loss": 0.1986, + "step": 942 + }, + { + "epoch": 0.3769968051118211, + "grad_norm": 1.5726540162240885, + "learning_rate": 9.980108815076456e-06, + "loss": 0.2079, + "step": 944 + }, + { + "epoch": 0.3777955271565495, + "grad_norm": 1.631975663438154, + "learning_rate": 9.979692539260331e-06, + "loss": 0.2071, + "step": 946 + }, + { + "epoch": 0.37859424920127793, + "grad_norm": 1.3678329735675085, + "learning_rate": 9.979271961459696e-06, + "loss": 0.2109, + "step": 948 + }, + { + "epoch": 0.3793929712460064, + "grad_norm": 1.641531450859251, + "learning_rate": 9.978847082037886e-06, + "loss": 0.1935, + "step": 950 + }, + { + "epoch": 0.3801916932907348, + "grad_norm": 1.5492312533670054, + "learning_rate": 9.978417901361958e-06, + "loss": 0.2046, + "step": 952 + }, + { + "epoch": 0.38099041533546324, + "grad_norm": 1.7625259362995305, + "learning_rate": 9.977984419802686e-06, + "loss": 0.2085, + "step": 954 + }, + { + "epoch": 0.3817891373801917, + "grad_norm": 1.3629858192513753, + "learning_rate": 9.977546637734557e-06, + "loss": 0.2026, + "step": 956 + }, + { + "epoch": 0.38258785942492013, + "grad_norm": 1.6553817872981966, + "learning_rate": 9.97710455553577e-06, + "loss": 0.1986, + "step": 958 + }, + { + "epoch": 0.38338658146964855, + "grad_norm": 1.6709973514366674, + "learning_rate": 9.976658173588244e-06, + "loss": 0.1885, + "step": 960 + }, + { + "epoch": 0.384185303514377, + "grad_norm": 1.5781155572362893, + "learning_rate": 9.976207492277612e-06, + "loss": 0.1978, + "step": 962 + }, + { + "epoch": 0.38498402555910544, + "grad_norm": 1.5471644306914103, + "learning_rate": 9.97575251199322e-06, + "loss": 0.2062, + "step": 964 + }, + { + "epoch": 0.38578274760383385, + "grad_norm": 1.6210596494320104, + "learning_rate": 9.975293233128128e-06, + "loss": 0.211, + "step": 966 + }, + { + "epoch": 0.3865814696485623, + "grad_norm": 1.4763420559956302, + "learning_rate": 9.974829656079106e-06, + "loss": 0.2018, + "step": 968 + }, + { + "epoch": 0.38738019169329074, + "grad_norm": 1.6383352001216414, + "learning_rate": 9.974361781246647e-06, + "loss": 0.2094, + "step": 970 + }, + { + "epoch": 0.38817891373801916, + "grad_norm": 1.7933627710025113, + "learning_rate": 9.973889609034945e-06, + "loss": 0.2163, + "step": 972 + }, + { + "epoch": 0.3889776357827476, + "grad_norm": 1.5400540527111912, + "learning_rate": 9.973413139851918e-06, + "loss": 0.1863, + "step": 974 + }, + { + "epoch": 0.38977635782747605, + "grad_norm": 1.3560103646758506, + "learning_rate": 9.972932374109184e-06, + "loss": 0.1959, + "step": 976 + }, + { + "epoch": 0.39057507987220447, + "grad_norm": 1.9026289683010842, + "learning_rate": 9.972447312222084e-06, + "loss": 0.1825, + "step": 978 + }, + { + "epoch": 0.3913738019169329, + "grad_norm": 1.5573821099296685, + "learning_rate": 9.971957954609663e-06, + "loss": 0.206, + "step": 980 + }, + { + "epoch": 0.39217252396166136, + "grad_norm": 1.463153557338603, + "learning_rate": 9.971464301694683e-06, + "loss": 0.2139, + "step": 982 + }, + { + "epoch": 0.3929712460063898, + "grad_norm": 1.4361640616504048, + "learning_rate": 9.97096635390361e-06, + "loss": 0.2039, + "step": 984 + }, + { + "epoch": 0.3937699680511182, + "grad_norm": 1.3944298003112567, + "learning_rate": 9.970464111666627e-06, + "loss": 0.2078, + "step": 986 + }, + { + "epoch": 0.39456869009584666, + "grad_norm": 1.427105562837558, + "learning_rate": 9.969957575417621e-06, + "loss": 0.1837, + "step": 988 + }, + { + "epoch": 0.3953674121405751, + "grad_norm": 1.520588415638903, + "learning_rate": 9.969446745594193e-06, + "loss": 0.2239, + "step": 990 + }, + { + "epoch": 0.3961661341853035, + "grad_norm": 2.079367024655261, + "learning_rate": 9.968931622637652e-06, + "loss": 0.1997, + "step": 992 + }, + { + "epoch": 0.39696485623003197, + "grad_norm": 1.5129459484138021, + "learning_rate": 9.968412206993015e-06, + "loss": 0.1953, + "step": 994 + }, + { + "epoch": 0.3977635782747604, + "grad_norm": 1.478216675491551, + "learning_rate": 9.967888499109008e-06, + "loss": 0.2081, + "step": 996 + }, + { + "epoch": 0.3985623003194888, + "grad_norm": 1.5236800584109207, + "learning_rate": 9.967360499438067e-06, + "loss": 0.2043, + "step": 998 + }, + { + "epoch": 0.3993610223642173, + "grad_norm": 1.4639393772316662, + "learning_rate": 9.966828208436332e-06, + "loss": 0.2035, + "step": 1000 + }, + { + "epoch": 0.3993610223642173, + "eval_loss": 0.1855674833059311, + "eval_runtime": 418.9403, + "eval_samples_per_second": 42.505, + "eval_steps_per_second": 5.313, + "step": 1000 + }, + { + "epoch": 0.4001597444089457, + "grad_norm": 1.925447602175885, + "learning_rate": 9.966291626563651e-06, + "loss": 0.2141, + "step": 1002 + }, + { + "epoch": 0.4009584664536741, + "grad_norm": 1.5342974238522853, + "learning_rate": 9.965750754283583e-06, + "loss": 0.1967, + "step": 1004 + }, + { + "epoch": 0.40175718849840253, + "grad_norm": 1.365762174319038, + "learning_rate": 9.96520559206339e-06, + "loss": 0.1787, + "step": 1006 + }, + { + "epoch": 0.402555910543131, + "grad_norm": 1.6415643245446048, + "learning_rate": 9.96465614037404e-06, + "loss": 0.2094, + "step": 1008 + }, + { + "epoch": 0.4033546325878594, + "grad_norm": 1.487881148105119, + "learning_rate": 9.964102399690206e-06, + "loss": 0.2296, + "step": 1010 + }, + { + "epoch": 0.40415335463258784, + "grad_norm": 1.3885469945101532, + "learning_rate": 9.96354437049027e-06, + "loss": 0.1953, + "step": 1012 + }, + { + "epoch": 0.4049520766773163, + "grad_norm": 1.5376182595234218, + "learning_rate": 9.962982053256317e-06, + "loss": 0.2067, + "step": 1014 + }, + { + "epoch": 0.4057507987220447, + "grad_norm": 1.5925253053184176, + "learning_rate": 9.962415448474134e-06, + "loss": 0.2034, + "step": 1016 + }, + { + "epoch": 0.40654952076677314, + "grad_norm": 1.5707593570512013, + "learning_rate": 9.961844556633216e-06, + "loss": 0.2107, + "step": 1018 + }, + { + "epoch": 0.4073482428115016, + "grad_norm": 1.538254717520436, + "learning_rate": 9.961269378226756e-06, + "loss": 0.2014, + "step": 1020 + }, + { + "epoch": 0.40814696485623003, + "grad_norm": 1.5245585221099645, + "learning_rate": 9.960689913751658e-06, + "loss": 0.2002, + "step": 1022 + }, + { + "epoch": 0.40894568690095845, + "grad_norm": 1.4248623479875928, + "learning_rate": 9.960106163708522e-06, + "loss": 0.1883, + "step": 1024 + }, + { + "epoch": 0.4097444089456869, + "grad_norm": 1.7412362384789994, + "learning_rate": 9.959518128601657e-06, + "loss": 0.218, + "step": 1026 + }, + { + "epoch": 0.41054313099041534, + "grad_norm": 1.4925072692951855, + "learning_rate": 9.958925808939066e-06, + "loss": 0.2111, + "step": 1028 + }, + { + "epoch": 0.41134185303514376, + "grad_norm": 1.4973115296085746, + "learning_rate": 9.958329205232456e-06, + "loss": 0.2059, + "step": 1030 + }, + { + "epoch": 0.41214057507987223, + "grad_norm": 1.468856647822638, + "learning_rate": 9.95772831799724e-06, + "loss": 0.2093, + "step": 1032 + }, + { + "epoch": 0.41293929712460065, + "grad_norm": 1.4085028520516685, + "learning_rate": 9.957123147752527e-06, + "loss": 0.2116, + "step": 1034 + }, + { + "epoch": 0.41373801916932906, + "grad_norm": 1.3478745438364825, + "learning_rate": 9.956513695021126e-06, + "loss": 0.2144, + "step": 1036 + }, + { + "epoch": 0.4145367412140575, + "grad_norm": 1.414787678483826, + "learning_rate": 9.955899960329546e-06, + "loss": 0.2114, + "step": 1038 + }, + { + "epoch": 0.41533546325878595, + "grad_norm": 1.4758641440286802, + "learning_rate": 9.955281944207998e-06, + "loss": 0.2114, + "step": 1040 + }, + { + "epoch": 0.41613418530351437, + "grad_norm": 1.3808751418519412, + "learning_rate": 9.95465964719039e-06, + "loss": 0.1996, + "step": 1042 + }, + { + "epoch": 0.4169329073482428, + "grad_norm": 1.3552203407425432, + "learning_rate": 9.954033069814324e-06, + "loss": 0.2197, + "step": 1044 + }, + { + "epoch": 0.41773162939297126, + "grad_norm": 1.7431939971291783, + "learning_rate": 9.953402212621107e-06, + "loss": 0.1766, + "step": 1046 + }, + { + "epoch": 0.4185303514376997, + "grad_norm": 1.2999138749816357, + "learning_rate": 9.95276707615574e-06, + "loss": 0.1742, + "step": 1048 + }, + { + "epoch": 0.4193290734824281, + "grad_norm": 1.4945842460673373, + "learning_rate": 9.952127660966919e-06, + "loss": 0.2246, + "step": 1050 + }, + { + "epoch": 0.42012779552715657, + "grad_norm": 1.5152322746280356, + "learning_rate": 9.95148396760704e-06, + "loss": 0.2192, + "step": 1052 + }, + { + "epoch": 0.420926517571885, + "grad_norm": 1.4230040236709558, + "learning_rate": 9.950835996632193e-06, + "loss": 0.1982, + "step": 1054 + }, + { + "epoch": 0.4217252396166134, + "grad_norm": 1.3823627265224154, + "learning_rate": 9.950183748602164e-06, + "loss": 0.2143, + "step": 1056 + }, + { + "epoch": 0.4225239616613419, + "grad_norm": 1.6379038373994437, + "learning_rate": 9.949527224080434e-06, + "loss": 0.2006, + "step": 1058 + }, + { + "epoch": 0.4233226837060703, + "grad_norm": 1.4280283780098375, + "learning_rate": 9.948866423634178e-06, + "loss": 0.1841, + "step": 1060 + }, + { + "epoch": 0.4241214057507987, + "grad_norm": 1.417933520198746, + "learning_rate": 9.948201347834265e-06, + "loss": 0.2032, + "step": 1062 + }, + { + "epoch": 0.4249201277955272, + "grad_norm": 1.3378469760405771, + "learning_rate": 9.947531997255256e-06, + "loss": 0.1926, + "step": 1064 + }, + { + "epoch": 0.4257188498402556, + "grad_norm": 1.5459222264184573, + "learning_rate": 9.94685837247541e-06, + "loss": 0.2052, + "step": 1066 + }, + { + "epoch": 0.426517571884984, + "grad_norm": 1.501440385232651, + "learning_rate": 9.946180474076675e-06, + "loss": 0.2052, + "step": 1068 + }, + { + "epoch": 0.4273162939297125, + "grad_norm": 1.5358561082091893, + "learning_rate": 9.945498302644687e-06, + "loss": 0.2034, + "step": 1070 + }, + { + "epoch": 0.4281150159744409, + "grad_norm": 1.6969353624664514, + "learning_rate": 9.944811858768782e-06, + "loss": 0.2141, + "step": 1072 + }, + { + "epoch": 0.4289137380191693, + "grad_norm": 1.4361329940984802, + "learning_rate": 9.944121143041982e-06, + "loss": 0.1955, + "step": 1074 + }, + { + "epoch": 0.42971246006389774, + "grad_norm": 1.4944093870984845, + "learning_rate": 9.943426156061e-06, + "loss": 0.1968, + "step": 1076 + }, + { + "epoch": 0.4305111821086262, + "grad_norm": 1.265035184804792, + "learning_rate": 9.942726898426238e-06, + "loss": 0.2113, + "step": 1078 + }, + { + "epoch": 0.43130990415335463, + "grad_norm": 1.4364726173362738, + "learning_rate": 9.94202337074179e-06, + "loss": 0.2034, + "step": 1080 + }, + { + "epoch": 0.43210862619808305, + "grad_norm": 1.4034504271470043, + "learning_rate": 9.941315573615437e-06, + "loss": 0.1783, + "step": 1082 + }, + { + "epoch": 0.4329073482428115, + "grad_norm": 1.4068580877161003, + "learning_rate": 9.940603507658649e-06, + "loss": 0.2071, + "step": 1084 + }, + { + "epoch": 0.43370607028753994, + "grad_norm": 1.4617297171517643, + "learning_rate": 9.939887173486583e-06, + "loss": 0.2086, + "step": 1086 + }, + { + "epoch": 0.43450479233226835, + "grad_norm": 1.4870587568723221, + "learning_rate": 9.939166571718086e-06, + "loss": 0.2191, + "step": 1088 + }, + { + "epoch": 0.4353035143769968, + "grad_norm": 1.3330964975081798, + "learning_rate": 9.938441702975689e-06, + "loss": 0.2105, + "step": 1090 + }, + { + "epoch": 0.43610223642172524, + "grad_norm": 1.4071934617523967, + "learning_rate": 9.93771256788561e-06, + "loss": 0.1986, + "step": 1092 + }, + { + "epoch": 0.43690095846645366, + "grad_norm": 1.4956330891164953, + "learning_rate": 9.936979167077754e-06, + "loss": 0.2242, + "step": 1094 + }, + { + "epoch": 0.43769968051118213, + "grad_norm": 1.500694287898403, + "learning_rate": 9.936241501185706e-06, + "loss": 0.213, + "step": 1096 + }, + { + "epoch": 0.43849840255591055, + "grad_norm": 1.431648588389163, + "learning_rate": 9.935499570846746e-06, + "loss": 0.1994, + "step": 1098 + }, + { + "epoch": 0.43929712460063897, + "grad_norm": 1.3419204436535992, + "learning_rate": 9.934753376701827e-06, + "loss": 0.1917, + "step": 1100 + }, + { + "epoch": 0.44009584664536744, + "grad_norm": 1.2901653329852891, + "learning_rate": 9.934002919395593e-06, + "loss": 0.1923, + "step": 1102 + }, + { + "epoch": 0.44089456869009586, + "grad_norm": 1.6677348530796035, + "learning_rate": 9.933248199576366e-06, + "loss": 0.2037, + "step": 1104 + }, + { + "epoch": 0.4416932907348243, + "grad_norm": 1.380349679623337, + "learning_rate": 9.932489217896154e-06, + "loss": 0.1909, + "step": 1106 + }, + { + "epoch": 0.4424920127795527, + "grad_norm": 1.4203485606081072, + "learning_rate": 9.931725975010647e-06, + "loss": 0.1934, + "step": 1108 + }, + { + "epoch": 0.44329073482428116, + "grad_norm": 1.4039574653298539, + "learning_rate": 9.930958471579212e-06, + "loss": 0.2137, + "step": 1110 + }, + { + "epoch": 0.4440894568690096, + "grad_norm": 1.4554296915268317, + "learning_rate": 9.930186708264902e-06, + "loss": 0.2081, + "step": 1112 + }, + { + "epoch": 0.444888178913738, + "grad_norm": 1.5862456773052678, + "learning_rate": 9.929410685734446e-06, + "loss": 0.2074, + "step": 1114 + }, + { + "epoch": 0.44568690095846647, + "grad_norm": 1.371290557948402, + "learning_rate": 9.928630404658255e-06, + "loss": 0.1934, + "step": 1116 + }, + { + "epoch": 0.4464856230031949, + "grad_norm": 1.5523688713967079, + "learning_rate": 9.92784586571042e-06, + "loss": 0.2114, + "step": 1118 + }, + { + "epoch": 0.4472843450479233, + "grad_norm": 1.5913885443768627, + "learning_rate": 9.927057069568704e-06, + "loss": 0.197, + "step": 1120 + }, + { + "epoch": 0.4480830670926518, + "grad_norm": 1.376586499393768, + "learning_rate": 9.926264016914555e-06, + "loss": 0.2114, + "step": 1122 + }, + { + "epoch": 0.4488817891373802, + "grad_norm": 1.3604215439899947, + "learning_rate": 9.925466708433097e-06, + "loss": 0.1949, + "step": 1124 + }, + { + "epoch": 0.4496805111821086, + "grad_norm": 1.4064368128907085, + "learning_rate": 9.924665144813128e-06, + "loss": 0.1876, + "step": 1126 + }, + { + "epoch": 0.4504792332268371, + "grad_norm": 1.4493555350796723, + "learning_rate": 9.923859326747125e-06, + "loss": 0.166, + "step": 1128 + }, + { + "epoch": 0.4512779552715655, + "grad_norm": 1.3005030779658915, + "learning_rate": 9.923049254931235e-06, + "loss": 0.179, + "step": 1130 + }, + { + "epoch": 0.4520766773162939, + "grad_norm": 1.301603097338869, + "learning_rate": 9.922234930065286e-06, + "loss": 0.1828, + "step": 1132 + }, + { + "epoch": 0.4528753993610224, + "grad_norm": 1.424132465075419, + "learning_rate": 9.921416352852779e-06, + "loss": 0.2027, + "step": 1134 + }, + { + "epoch": 0.4536741214057508, + "grad_norm": 1.5184245441350142, + "learning_rate": 9.920593524000887e-06, + "loss": 0.2163, + "step": 1136 + }, + { + "epoch": 0.4544728434504792, + "grad_norm": 1.2649380282890057, + "learning_rate": 9.919766444220454e-06, + "loss": 0.1952, + "step": 1138 + }, + { + "epoch": 0.45527156549520764, + "grad_norm": 1.3192846529035582, + "learning_rate": 9.918935114226001e-06, + "loss": 0.2147, + "step": 1140 + }, + { + "epoch": 0.4560702875399361, + "grad_norm": 1.6014388375199928, + "learning_rate": 9.91809953473572e-06, + "loss": 0.209, + "step": 1142 + }, + { + "epoch": 0.45686900958466453, + "grad_norm": 1.2763233267206437, + "learning_rate": 9.917259706471469e-06, + "loss": 0.1894, + "step": 1144 + }, + { + "epoch": 0.45766773162939295, + "grad_norm": 1.355011355722016, + "learning_rate": 9.916415630158782e-06, + "loss": 0.2019, + "step": 1146 + }, + { + "epoch": 0.4584664536741214, + "grad_norm": 1.420940628734508, + "learning_rate": 9.915567306526863e-06, + "loss": 0.2176, + "step": 1148 + }, + { + "epoch": 0.45926517571884984, + "grad_norm": 1.3488193076357031, + "learning_rate": 9.914714736308582e-06, + "loss": 0.2032, + "step": 1150 + }, + { + "epoch": 0.46006389776357826, + "grad_norm": 1.288799255340392, + "learning_rate": 9.913857920240481e-06, + "loss": 0.2077, + "step": 1152 + }, + { + "epoch": 0.46086261980830673, + "grad_norm": 1.3786756620906035, + "learning_rate": 9.912996859062764e-06, + "loss": 0.2113, + "step": 1154 + }, + { + "epoch": 0.46166134185303515, + "grad_norm": 1.50469580343903, + "learning_rate": 9.912131553519313e-06, + "loss": 0.2073, + "step": 1156 + }, + { + "epoch": 0.46246006389776356, + "grad_norm": 1.4700159822947163, + "learning_rate": 9.911262004357665e-06, + "loss": 0.2053, + "step": 1158 + }, + { + "epoch": 0.46325878594249204, + "grad_norm": 1.519205743263198, + "learning_rate": 9.91038821232903e-06, + "loss": 0.2217, + "step": 1160 + }, + { + "epoch": 0.46405750798722045, + "grad_norm": 1.3708099691931215, + "learning_rate": 9.909510178188281e-06, + "loss": 0.1891, + "step": 1162 + }, + { + "epoch": 0.46485623003194887, + "grad_norm": 1.2711919877902211, + "learning_rate": 9.90862790269396e-06, + "loss": 0.2003, + "step": 1164 + }, + { + "epoch": 0.46565495207667734, + "grad_norm": 1.296959216697176, + "learning_rate": 9.907741386608267e-06, + "loss": 0.1895, + "step": 1166 + }, + { + "epoch": 0.46645367412140576, + "grad_norm": 1.4988072467804332, + "learning_rate": 9.906850630697068e-06, + "loss": 0.229, + "step": 1168 + }, + { + "epoch": 0.4672523961661342, + "grad_norm": 1.357985179422065, + "learning_rate": 9.905955635729894e-06, + "loss": 0.208, + "step": 1170 + }, + { + "epoch": 0.4680511182108626, + "grad_norm": 1.3438998637885513, + "learning_rate": 9.905056402479933e-06, + "loss": 0.1809, + "step": 1172 + }, + { + "epoch": 0.46884984025559107, + "grad_norm": 1.2383109161845713, + "learning_rate": 9.904152931724043e-06, + "loss": 0.1971, + "step": 1174 + }, + { + "epoch": 0.4696485623003195, + "grad_norm": 1.5560174102928908, + "learning_rate": 9.903245224242732e-06, + "loss": 0.2032, + "step": 1176 + }, + { + "epoch": 0.4704472843450479, + "grad_norm": 1.38347824637013, + "learning_rate": 9.902333280820176e-06, + "loss": 0.198, + "step": 1178 + }, + { + "epoch": 0.4712460063897764, + "grad_norm": 1.4051889306019656, + "learning_rate": 9.901417102244208e-06, + "loss": 0.1987, + "step": 1180 + }, + { + "epoch": 0.4720447284345048, + "grad_norm": 1.6159750486401823, + "learning_rate": 9.90049668930632e-06, + "loss": 0.2029, + "step": 1182 + }, + { + "epoch": 0.4728434504792332, + "grad_norm": 1.5547809497795395, + "learning_rate": 9.899572042801662e-06, + "loss": 0.1937, + "step": 1184 + }, + { + "epoch": 0.4736421725239617, + "grad_norm": 1.392059461645592, + "learning_rate": 9.898643163529041e-06, + "loss": 0.1783, + "step": 1186 + }, + { + "epoch": 0.4744408945686901, + "grad_norm": 1.4227792657414569, + "learning_rate": 9.89771005229092e-06, + "loss": 0.1942, + "step": 1188 + }, + { + "epoch": 0.4752396166134185, + "grad_norm": 1.525020829545986, + "learning_rate": 9.89677270989342e-06, + "loss": 0.2135, + "step": 1190 + }, + { + "epoch": 0.476038338658147, + "grad_norm": 1.7374484802274244, + "learning_rate": 9.895831137146319e-06, + "loss": 0.2081, + "step": 1192 + }, + { + "epoch": 0.4768370607028754, + "grad_norm": 1.3661453654617517, + "learning_rate": 9.894885334863044e-06, + "loss": 0.2055, + "step": 1194 + }, + { + "epoch": 0.4776357827476038, + "grad_norm": 1.5602720293300305, + "learning_rate": 9.893935303860677e-06, + "loss": 0.2171, + "step": 1196 + }, + { + "epoch": 0.4784345047923323, + "grad_norm": 3.1025231784672216, + "learning_rate": 9.892981044959961e-06, + "loss": 0.193, + "step": 1198 + }, + { + "epoch": 0.4792332268370607, + "grad_norm": 1.262885402820455, + "learning_rate": 9.89202255898528e-06, + "loss": 0.1916, + "step": 1200 + }, + { + "epoch": 0.48003194888178913, + "grad_norm": 1.381018034591307, + "learning_rate": 9.891059846764679e-06, + "loss": 0.1878, + "step": 1202 + }, + { + "epoch": 0.48083067092651754, + "grad_norm": 1.5768051172665962, + "learning_rate": 9.89009290912985e-06, + "loss": 0.1967, + "step": 1204 + }, + { + "epoch": 0.481629392971246, + "grad_norm": 1.463151710058678, + "learning_rate": 9.889121746916132e-06, + "loss": 0.2269, + "step": 1206 + }, + { + "epoch": 0.48242811501597443, + "grad_norm": 1.34065053745843, + "learning_rate": 9.888146360962523e-06, + "loss": 0.1865, + "step": 1208 + }, + { + "epoch": 0.48322683706070285, + "grad_norm": 1.6723543690494453, + "learning_rate": 9.887166752111663e-06, + "loss": 0.2129, + "step": 1210 + }, + { + "epoch": 0.4840255591054313, + "grad_norm": 1.5794398784564423, + "learning_rate": 9.88618292120984e-06, + "loss": 0.2224, + "step": 1212 + }, + { + "epoch": 0.48482428115015974, + "grad_norm": 1.450388456580997, + "learning_rate": 9.88519486910699e-06, + "loss": 0.1966, + "step": 1214 + }, + { + "epoch": 0.48562300319488816, + "grad_norm": 1.7814239798658509, + "learning_rate": 9.8842025966567e-06, + "loss": 0.1904, + "step": 1216 + }, + { + "epoch": 0.48642172523961663, + "grad_norm": 1.7391567788714228, + "learning_rate": 9.883206104716198e-06, + "loss": 0.2133, + "step": 1218 + }, + { + "epoch": 0.48722044728434505, + "grad_norm": 1.442571651767735, + "learning_rate": 9.882205394146362e-06, + "loss": 0.1989, + "step": 1220 + }, + { + "epoch": 0.48801916932907347, + "grad_norm": 1.4088522689829923, + "learning_rate": 9.881200465811706e-06, + "loss": 0.2175, + "step": 1222 + }, + { + "epoch": 0.48881789137380194, + "grad_norm": 1.2413815231676442, + "learning_rate": 9.880191320580396e-06, + "loss": 0.1705, + "step": 1224 + }, + { + "epoch": 0.48961661341853036, + "grad_norm": 1.4154117503900958, + "learning_rate": 9.87917795932424e-06, + "loss": 0.1817, + "step": 1226 + }, + { + "epoch": 0.4904153354632588, + "grad_norm": 1.3393213535614317, + "learning_rate": 9.878160382918685e-06, + "loss": 0.2009, + "step": 1228 + }, + { + "epoch": 0.49121405750798725, + "grad_norm": 1.5366573602302467, + "learning_rate": 9.87713859224282e-06, + "loss": 0.2339, + "step": 1230 + }, + { + "epoch": 0.49201277955271566, + "grad_norm": 1.2826491297833846, + "learning_rate": 9.876112588179378e-06, + "loss": 0.1837, + "step": 1232 + }, + { + "epoch": 0.4928115015974441, + "grad_norm": 1.4232335556523423, + "learning_rate": 9.875082371614728e-06, + "loss": 0.1985, + "step": 1234 + }, + { + "epoch": 0.4936102236421725, + "grad_norm": 1.4944078372691827, + "learning_rate": 9.874047943438879e-06, + "loss": 0.1777, + "step": 1236 + }, + { + "epoch": 0.49440894568690097, + "grad_norm": 1.3263567228958557, + "learning_rate": 9.873009304545482e-06, + "loss": 0.2043, + "step": 1238 + }, + { + "epoch": 0.4952076677316294, + "grad_norm": 1.491665028254892, + "learning_rate": 9.87196645583182e-06, + "loss": 0.1995, + "step": 1240 + }, + { + "epoch": 0.4960063897763578, + "grad_norm": 1.4711866133685396, + "learning_rate": 9.870919398198819e-06, + "loss": 0.2151, + "step": 1242 + }, + { + "epoch": 0.4968051118210863, + "grad_norm": 1.3471791986545025, + "learning_rate": 9.869868132551037e-06, + "loss": 0.2301, + "step": 1244 + }, + { + "epoch": 0.4976038338658147, + "grad_norm": 1.30305885026678, + "learning_rate": 9.868812659796669e-06, + "loss": 0.1955, + "step": 1246 + }, + { + "epoch": 0.4984025559105431, + "grad_norm": 1.3754860324491927, + "learning_rate": 9.86775298084754e-06, + "loss": 0.2245, + "step": 1248 + }, + { + "epoch": 0.4992012779552716, + "grad_norm": 1.2464730265182025, + "learning_rate": 9.866689096619117e-06, + "loss": 0.1811, + "step": 1250 + }, + { + "epoch": 0.5, + "grad_norm": 1.3277366908866204, + "learning_rate": 9.865621008030492e-06, + "loss": 0.1916, + "step": 1252 + }, + { + "epoch": 0.5007987220447284, + "grad_norm": 1.4195649241005022, + "learning_rate": 9.864548716004399e-06, + "loss": 0.1932, + "step": 1254 + }, + { + "epoch": 0.5015974440894568, + "grad_norm": 1.4925132735569981, + "learning_rate": 9.863472221467189e-06, + "loss": 0.1938, + "step": 1256 + }, + { + "epoch": 0.5023961661341853, + "grad_norm": 1.2979288287029658, + "learning_rate": 9.862391525348856e-06, + "loss": 0.1877, + "step": 1258 + }, + { + "epoch": 0.5031948881789138, + "grad_norm": 1.3864119857396864, + "learning_rate": 9.861306628583021e-06, + "loss": 0.1958, + "step": 1260 + }, + { + "epoch": 0.5039936102236422, + "grad_norm": 1.5118691478458204, + "learning_rate": 9.86021753210693e-06, + "loss": 0.2145, + "step": 1262 + }, + { + "epoch": 0.5047923322683706, + "grad_norm": 1.3753508468091324, + "learning_rate": 9.85912423686146e-06, + "loss": 0.202, + "step": 1264 + }, + { + "epoch": 0.505591054313099, + "grad_norm": 1.4184249679671, + "learning_rate": 9.858026743791114e-06, + "loss": 0.2002, + "step": 1266 + }, + { + "epoch": 0.5063897763578274, + "grad_norm": 1.423875732058238, + "learning_rate": 9.856925053844025e-06, + "loss": 0.2118, + "step": 1268 + }, + { + "epoch": 0.5071884984025559, + "grad_norm": 1.4222433141825608, + "learning_rate": 9.855819167971946e-06, + "loss": 0.1782, + "step": 1270 + }, + { + "epoch": 0.5079872204472844, + "grad_norm": 1.3607705696064714, + "learning_rate": 9.854709087130261e-06, + "loss": 0.2206, + "step": 1272 + }, + { + "epoch": 0.5087859424920128, + "grad_norm": 1.4114769617291796, + "learning_rate": 9.853594812277973e-06, + "loss": 0.214, + "step": 1274 + }, + { + "epoch": 0.5095846645367412, + "grad_norm": 1.5913052917009076, + "learning_rate": 9.852476344377713e-06, + "loss": 0.2109, + "step": 1276 + }, + { + "epoch": 0.5103833865814696, + "grad_norm": 1.3774331396577875, + "learning_rate": 9.851353684395728e-06, + "loss": 0.2027, + "step": 1278 + }, + { + "epoch": 0.5111821086261981, + "grad_norm": 1.2773430420906078, + "learning_rate": 9.850226833301893e-06, + "loss": 0.1985, + "step": 1280 + }, + { + "epoch": 0.5119808306709265, + "grad_norm": 1.6292303107969242, + "learning_rate": 9.849095792069701e-06, + "loss": 0.2125, + "step": 1282 + }, + { + "epoch": 0.512779552715655, + "grad_norm": 1.4215039291593559, + "learning_rate": 9.847960561676263e-06, + "loss": 0.2037, + "step": 1284 + }, + { + "epoch": 0.5135782747603834, + "grad_norm": 1.3067667391370512, + "learning_rate": 9.846821143102313e-06, + "loss": 0.1677, + "step": 1286 + }, + { + "epoch": 0.5143769968051118, + "grad_norm": 1.2923555649063807, + "learning_rate": 9.8456775373322e-06, + "loss": 0.1866, + "step": 1288 + }, + { + "epoch": 0.5151757188498403, + "grad_norm": 1.4943462008704147, + "learning_rate": 9.844529745353892e-06, + "loss": 0.2126, + "step": 1290 + }, + { + "epoch": 0.5159744408945687, + "grad_norm": 1.5684625006318347, + "learning_rate": 9.843377768158972e-06, + "loss": 0.2084, + "step": 1292 + }, + { + "epoch": 0.5167731629392971, + "grad_norm": 1.4914103834569956, + "learning_rate": 9.84222160674264e-06, + "loss": 0.2153, + "step": 1294 + }, + { + "epoch": 0.5175718849840255, + "grad_norm": 1.328585368359251, + "learning_rate": 9.841061262103713e-06, + "loss": 0.1919, + "step": 1296 + }, + { + "epoch": 0.518370607028754, + "grad_norm": 1.3701342649025663, + "learning_rate": 9.839896735244615e-06, + "loss": 0.1981, + "step": 1298 + }, + { + "epoch": 0.5191693290734825, + "grad_norm": 1.5701845518180864, + "learning_rate": 9.83872802717139e-06, + "loss": 0.1849, + "step": 1300 + }, + { + "epoch": 0.5199680511182109, + "grad_norm": 1.3075421535239862, + "learning_rate": 9.83755513889369e-06, + "loss": 0.1918, + "step": 1302 + }, + { + "epoch": 0.5207667731629393, + "grad_norm": 1.5057769635536875, + "learning_rate": 9.836378071424782e-06, + "loss": 0.2218, + "step": 1304 + }, + { + "epoch": 0.5215654952076677, + "grad_norm": 1.3361043392598404, + "learning_rate": 9.835196825781539e-06, + "loss": 0.1998, + "step": 1306 + }, + { + "epoch": 0.5223642172523961, + "grad_norm": 1.3294074227646568, + "learning_rate": 9.834011402984447e-06, + "loss": 0.1885, + "step": 1308 + }, + { + "epoch": 0.5231629392971247, + "grad_norm": 1.5233983111587897, + "learning_rate": 9.8328218040576e-06, + "loss": 0.2051, + "step": 1310 + }, + { + "epoch": 0.5239616613418531, + "grad_norm": 1.263265485856387, + "learning_rate": 9.831628030028698e-06, + "loss": 0.1898, + "step": 1312 + }, + { + "epoch": 0.5247603833865815, + "grad_norm": 1.3267088643171874, + "learning_rate": 9.830430081929047e-06, + "loss": 0.1816, + "step": 1314 + }, + { + "epoch": 0.5255591054313099, + "grad_norm": 1.4118242503209621, + "learning_rate": 9.829227960793566e-06, + "loss": 0.2206, + "step": 1316 + }, + { + "epoch": 0.5263578274760383, + "grad_norm": 1.2848755780312735, + "learning_rate": 9.82802166766077e-06, + "loss": 0.191, + "step": 1318 + }, + { + "epoch": 0.5271565495207667, + "grad_norm": 1.6025918801894552, + "learning_rate": 9.826811203572785e-06, + "loss": 0.2231, + "step": 1320 + }, + { + "epoch": 0.5279552715654952, + "grad_norm": 1.3694781938576936, + "learning_rate": 9.82559656957534e-06, + "loss": 0.2133, + "step": 1322 + }, + { + "epoch": 0.5287539936102237, + "grad_norm": 1.5702346185266045, + "learning_rate": 9.824377766717758e-06, + "loss": 0.204, + "step": 1324 + }, + { + "epoch": 0.5295527156549521, + "grad_norm": 1.4119518722483408, + "learning_rate": 9.823154796052974e-06, + "loss": 0.1944, + "step": 1326 + }, + { + "epoch": 0.5303514376996805, + "grad_norm": 1.2510934475746587, + "learning_rate": 9.821927658637518e-06, + "loss": 0.1901, + "step": 1328 + }, + { + "epoch": 0.5311501597444089, + "grad_norm": 1.23475379515121, + "learning_rate": 9.82069635553152e-06, + "loss": 0.179, + "step": 1330 + }, + { + "epoch": 0.5319488817891374, + "grad_norm": 1.3831688689107673, + "learning_rate": 9.819460887798714e-06, + "loss": 0.1854, + "step": 1332 + }, + { + "epoch": 0.5327476038338658, + "grad_norm": 1.495164379191231, + "learning_rate": 9.818221256506421e-06, + "loss": 0.2067, + "step": 1334 + }, + { + "epoch": 0.5335463258785943, + "grad_norm": 1.4669136010120607, + "learning_rate": 9.81697746272557e-06, + "loss": 0.2182, + "step": 1336 + }, + { + "epoch": 0.5343450479233227, + "grad_norm": 1.3093260674635532, + "learning_rate": 9.81572950753068e-06, + "loss": 0.2077, + "step": 1338 + }, + { + "epoch": 0.5351437699680511, + "grad_norm": 1.4203515470016688, + "learning_rate": 9.814477391999868e-06, + "loss": 0.1832, + "step": 1340 + }, + { + "epoch": 0.5359424920127795, + "grad_norm": 1.2863505009713285, + "learning_rate": 9.813221117214842e-06, + "loss": 0.2022, + "step": 1342 + }, + { + "epoch": 0.536741214057508, + "grad_norm": 1.2084177020870481, + "learning_rate": 9.811960684260907e-06, + "loss": 0.1963, + "step": 1344 + }, + { + "epoch": 0.5375399361022364, + "grad_norm": 1.3072664824588964, + "learning_rate": 9.810696094226952e-06, + "loss": 0.2205, + "step": 1346 + }, + { + "epoch": 0.5383386581469649, + "grad_norm": 1.1614780192043668, + "learning_rate": 9.809427348205472e-06, + "loss": 0.1875, + "step": 1348 + }, + { + "epoch": 0.5391373801916933, + "grad_norm": 1.3175352444594506, + "learning_rate": 9.808154447292539e-06, + "loss": 0.185, + "step": 1350 + }, + { + "epoch": 0.5399361022364217, + "grad_norm": 1.2834953987740787, + "learning_rate": 9.80687739258782e-06, + "loss": 0.1924, + "step": 1352 + }, + { + "epoch": 0.5407348242811502, + "grad_norm": 1.3409325351626424, + "learning_rate": 9.805596185194571e-06, + "loss": 0.206, + "step": 1354 + }, + { + "epoch": 0.5415335463258786, + "grad_norm": 1.367209463101786, + "learning_rate": 9.804310826219633e-06, + "loss": 0.2099, + "step": 1356 + }, + { + "epoch": 0.542332268370607, + "grad_norm": 1.2819870163156417, + "learning_rate": 9.803021316773434e-06, + "loss": 0.1881, + "step": 1358 + }, + { + "epoch": 0.5431309904153354, + "grad_norm": 1.3214326903766342, + "learning_rate": 9.801727657969988e-06, + "loss": 0.1964, + "step": 1360 + }, + { + "epoch": 0.5439297124600639, + "grad_norm": 1.1833472018926297, + "learning_rate": 9.800429850926898e-06, + "loss": 0.1836, + "step": 1362 + }, + { + "epoch": 0.5447284345047924, + "grad_norm": 1.4748511681523404, + "learning_rate": 9.799127896765346e-06, + "loss": 0.1772, + "step": 1364 + }, + { + "epoch": 0.5455271565495208, + "grad_norm": 1.2243926123531605, + "learning_rate": 9.797821796610094e-06, + "loss": 0.2206, + "step": 1366 + }, + { + "epoch": 0.5463258785942492, + "grad_norm": 1.292732092241605, + "learning_rate": 9.796511551589492e-06, + "loss": 0.1888, + "step": 1368 + }, + { + "epoch": 0.5471246006389776, + "grad_norm": 1.348610309652153, + "learning_rate": 9.795197162835468e-06, + "loss": 0.2045, + "step": 1370 + }, + { + "epoch": 0.547923322683706, + "grad_norm": 1.2828128744633993, + "learning_rate": 9.79387863148353e-06, + "loss": 0.1785, + "step": 1372 + }, + { + "epoch": 0.5487220447284346, + "grad_norm": 1.369286359401732, + "learning_rate": 9.792555958672762e-06, + "loss": 0.1839, + "step": 1374 + }, + { + "epoch": 0.549520766773163, + "grad_norm": 1.4014364965242936, + "learning_rate": 9.791229145545832e-06, + "loss": 0.1972, + "step": 1376 + }, + { + "epoch": 0.5503194888178914, + "grad_norm": 1.465996725744811, + "learning_rate": 9.789898193248978e-06, + "loss": 0.1957, + "step": 1378 + }, + { + "epoch": 0.5511182108626198, + "grad_norm": 1.5171558530092926, + "learning_rate": 9.788563102932023e-06, + "loss": 0.1966, + "step": 1380 + }, + { + "epoch": 0.5519169329073482, + "grad_norm": 1.3311267145287, + "learning_rate": 9.787223875748353e-06, + "loss": 0.2064, + "step": 1382 + }, + { + "epoch": 0.5527156549520766, + "grad_norm": 1.3066784419896205, + "learning_rate": 9.785880512854937e-06, + "loss": 0.1853, + "step": 1384 + }, + { + "epoch": 0.5535143769968051, + "grad_norm": 1.2390174296827288, + "learning_rate": 9.784533015412311e-06, + "loss": 0.1991, + "step": 1386 + }, + { + "epoch": 0.5543130990415336, + "grad_norm": 1.306300904073364, + "learning_rate": 9.78318138458459e-06, + "loss": 0.1946, + "step": 1388 + }, + { + "epoch": 0.555111821086262, + "grad_norm": 1.2680623158657511, + "learning_rate": 9.781825621539451e-06, + "loss": 0.1961, + "step": 1390 + }, + { + "epoch": 0.5559105431309904, + "grad_norm": 1.2795990600690346, + "learning_rate": 9.78046572744815e-06, + "loss": 0.1852, + "step": 1392 + }, + { + "epoch": 0.5567092651757188, + "grad_norm": 1.4140574609807046, + "learning_rate": 9.779101703485503e-06, + "loss": 0.1915, + "step": 1394 + }, + { + "epoch": 0.5575079872204473, + "grad_norm": 1.1983648174489345, + "learning_rate": 9.7777335508299e-06, + "loss": 0.1812, + "step": 1396 + }, + { + "epoch": 0.5583067092651757, + "grad_norm": 1.2309594478518777, + "learning_rate": 9.776361270663295e-06, + "loss": 0.1861, + "step": 1398 + }, + { + "epoch": 0.5591054313099042, + "grad_norm": 1.1982821972463766, + "learning_rate": 9.77498486417121e-06, + "loss": 0.1713, + "step": 1400 + }, + { + "epoch": 0.5599041533546326, + "grad_norm": 1.4024839662731965, + "learning_rate": 9.77360433254273e-06, + "loss": 0.2106, + "step": 1402 + }, + { + "epoch": 0.560702875399361, + "grad_norm": 1.2213412954218998, + "learning_rate": 9.772219676970502e-06, + "loss": 0.1693, + "step": 1404 + }, + { + "epoch": 0.5615015974440895, + "grad_norm": 1.4583671258013307, + "learning_rate": 9.770830898650739e-06, + "loss": 0.2006, + "step": 1406 + }, + { + "epoch": 0.5623003194888179, + "grad_norm": 1.2563510565936002, + "learning_rate": 9.769437998783216e-06, + "loss": 0.1862, + "step": 1408 + }, + { + "epoch": 0.5630990415335463, + "grad_norm": 1.314486055157245, + "learning_rate": 9.768040978571265e-06, + "loss": 0.1755, + "step": 1410 + }, + { + "epoch": 0.5638977635782748, + "grad_norm": 1.3853554706786548, + "learning_rate": 9.76663983922178e-06, + "loss": 0.2031, + "step": 1412 + }, + { + "epoch": 0.5646964856230032, + "grad_norm": 1.2878686521894571, + "learning_rate": 9.765234581945215e-06, + "loss": 0.2123, + "step": 1414 + }, + { + "epoch": 0.5654952076677316, + "grad_norm": 1.252378097297008, + "learning_rate": 9.763825207955577e-06, + "loss": 0.1935, + "step": 1416 + }, + { + "epoch": 0.5662939297124601, + "grad_norm": 1.213815586126145, + "learning_rate": 9.762411718470434e-06, + "loss": 0.1658, + "step": 1418 + }, + { + "epoch": 0.5670926517571885, + "grad_norm": 1.3949685159556597, + "learning_rate": 9.760994114710906e-06, + "loss": 0.2114, + "step": 1420 + }, + { + "epoch": 0.5678913738019169, + "grad_norm": 1.2817914821838783, + "learning_rate": 9.759572397901671e-06, + "loss": 0.1767, + "step": 1422 + }, + { + "epoch": 0.5686900958466453, + "grad_norm": 1.4145245969611204, + "learning_rate": 9.758146569270957e-06, + "loss": 0.1986, + "step": 1424 + }, + { + "epoch": 0.5694888178913738, + "grad_norm": 1.2826653603852296, + "learning_rate": 9.756716630050546e-06, + "loss": 0.1854, + "step": 1426 + }, + { + "epoch": 0.5702875399361023, + "grad_norm": 1.2441705422611444, + "learning_rate": 9.755282581475769e-06, + "loss": 0.1923, + "step": 1428 + }, + { + "epoch": 0.5710862619808307, + "grad_norm": 1.4151560859787753, + "learning_rate": 9.75384442478551e-06, + "loss": 0.2145, + "step": 1430 + }, + { + "epoch": 0.5718849840255591, + "grad_norm": 1.2387933950813024, + "learning_rate": 9.7524021612222e-06, + "loss": 0.1804, + "step": 1432 + }, + { + "epoch": 0.5726837060702875, + "grad_norm": 1.395550176337113, + "learning_rate": 9.75095579203182e-06, + "loss": 0.203, + "step": 1434 + }, + { + "epoch": 0.5734824281150159, + "grad_norm": 1.4116044575628213, + "learning_rate": 9.749505318463894e-06, + "loss": 0.2111, + "step": 1436 + }, + { + "epoch": 0.5742811501597445, + "grad_norm": 1.2825403647392892, + "learning_rate": 9.748050741771498e-06, + "loss": 0.1891, + "step": 1438 + }, + { + "epoch": 0.5750798722044729, + "grad_norm": 1.3389504388761955, + "learning_rate": 9.746592063211247e-06, + "loss": 0.1989, + "step": 1440 + }, + { + "epoch": 0.5758785942492013, + "grad_norm": 1.2944657988503898, + "learning_rate": 9.7451292840433e-06, + "loss": 0.1986, + "step": 1442 + }, + { + "epoch": 0.5766773162939297, + "grad_norm": 1.373583553210306, + "learning_rate": 9.743662405531361e-06, + "loss": 0.1896, + "step": 1444 + }, + { + "epoch": 0.5774760383386581, + "grad_norm": 1.3216412289151287, + "learning_rate": 9.742191428942677e-06, + "loss": 0.1841, + "step": 1446 + }, + { + "epoch": 0.5782747603833865, + "grad_norm": 1.4646317495074423, + "learning_rate": 9.74071635554803e-06, + "loss": 0.1987, + "step": 1448 + }, + { + "epoch": 0.579073482428115, + "grad_norm": 1.3041923646858569, + "learning_rate": 9.739237186621747e-06, + "loss": 0.1825, + "step": 1450 + }, + { + "epoch": 0.5798722044728435, + "grad_norm": 1.3593275898591164, + "learning_rate": 9.737753923441689e-06, + "loss": 0.1859, + "step": 1452 + }, + { + "epoch": 0.5806709265175719, + "grad_norm": 1.2580533184275235, + "learning_rate": 9.736266567289255e-06, + "loss": 0.1969, + "step": 1454 + }, + { + "epoch": 0.5814696485623003, + "grad_norm": 1.325474938146526, + "learning_rate": 9.73477511944938e-06, + "loss": 0.2043, + "step": 1456 + }, + { + "epoch": 0.5822683706070287, + "grad_norm": 1.3507712058043326, + "learning_rate": 9.733279581210535e-06, + "loss": 0.1979, + "step": 1458 + }, + { + "epoch": 0.5830670926517572, + "grad_norm": 1.2160048011358895, + "learning_rate": 9.731779953864725e-06, + "loss": 0.197, + "step": 1460 + }, + { + "epoch": 0.5838658146964856, + "grad_norm": 1.3331470604472404, + "learning_rate": 9.730276238707486e-06, + "loss": 0.1808, + "step": 1462 + }, + { + "epoch": 0.5846645367412141, + "grad_norm": 1.2875278440764244, + "learning_rate": 9.728768437037882e-06, + "loss": 0.1728, + "step": 1464 + }, + { + "epoch": 0.5854632587859425, + "grad_norm": 1.3527080702439602, + "learning_rate": 9.72725655015852e-06, + "loss": 0.1979, + "step": 1466 + }, + { + "epoch": 0.5862619808306709, + "grad_norm": 1.2097235904416872, + "learning_rate": 9.725740579375518e-06, + "loss": 0.1863, + "step": 1468 + }, + { + "epoch": 0.5870607028753994, + "grad_norm": 1.3817835609057318, + "learning_rate": 9.724220525998538e-06, + "loss": 0.1966, + "step": 1470 + }, + { + "epoch": 0.5878594249201278, + "grad_norm": 1.374598787526082, + "learning_rate": 9.722696391340762e-06, + "loss": 0.2084, + "step": 1472 + }, + { + "epoch": 0.5886581469648562, + "grad_norm": 1.1682542939498242, + "learning_rate": 9.721168176718896e-06, + "loss": 0.1819, + "step": 1474 + }, + { + "epoch": 0.5894568690095847, + "grad_norm": 1.3405797462449764, + "learning_rate": 9.719635883453175e-06, + "loss": 0.2133, + "step": 1476 + }, + { + "epoch": 0.5902555910543131, + "grad_norm": 1.3482283589788477, + "learning_rate": 9.718099512867355e-06, + "loss": 0.1737, + "step": 1478 + }, + { + "epoch": 0.5910543130990416, + "grad_norm": 1.3206877696300103, + "learning_rate": 9.716559066288716e-06, + "loss": 0.1913, + "step": 1480 + }, + { + "epoch": 0.59185303514377, + "grad_norm": 1.3089066804848661, + "learning_rate": 9.715014545048059e-06, + "loss": 0.189, + "step": 1482 + }, + { + "epoch": 0.5926517571884984, + "grad_norm": 1.1772602990825523, + "learning_rate": 9.713465950479704e-06, + "loss": 0.1634, + "step": 1484 + }, + { + "epoch": 0.5934504792332268, + "grad_norm": 1.6415957840833397, + "learning_rate": 9.711913283921488e-06, + "loss": 0.1986, + "step": 1486 + }, + { + "epoch": 0.5942492012779552, + "grad_norm": 1.3340911692430844, + "learning_rate": 9.710356546714774e-06, + "loss": 0.1762, + "step": 1488 + }, + { + "epoch": 0.5950479233226837, + "grad_norm": 1.19131607954315, + "learning_rate": 9.708795740204431e-06, + "loss": 0.1807, + "step": 1490 + }, + { + "epoch": 0.5958466453674122, + "grad_norm": 1.377988455402913, + "learning_rate": 9.70723086573885e-06, + "loss": 0.1939, + "step": 1492 + }, + { + "epoch": 0.5966453674121406, + "grad_norm": 1.25358131516604, + "learning_rate": 9.705661924669937e-06, + "loss": 0.1792, + "step": 1494 + }, + { + "epoch": 0.597444089456869, + "grad_norm": 1.2404137677794904, + "learning_rate": 9.704088918353108e-06, + "loss": 0.1687, + "step": 1496 + }, + { + "epoch": 0.5982428115015974, + "grad_norm": 1.2604619975409577, + "learning_rate": 9.70251184814729e-06, + "loss": 0.1939, + "step": 1498 + }, + { + "epoch": 0.5990415335463258, + "grad_norm": 1.363988707372948, + "learning_rate": 9.700930715414923e-06, + "loss": 0.1931, + "step": 1500 + }, + { + "epoch": 0.5990415335463258, + "eval_loss": 0.1750628650188446, + "eval_runtime": 416.5473, + "eval_samples_per_second": 42.749, + "eval_steps_per_second": 5.344, + "step": 1500 + }, + { + "epoch": 0.5998402555910544, + "grad_norm": 1.5877690610897348, + "learning_rate": 9.69934552152196e-06, + "loss": 0.1771, + "step": 1502 + }, + { + "epoch": 0.6006389776357828, + "grad_norm": 1.3591700844899557, + "learning_rate": 9.697756267837856e-06, + "loss": 0.2061, + "step": 1504 + }, + { + "epoch": 0.6014376996805112, + "grad_norm": 1.7943521151008524, + "learning_rate": 9.696162955735577e-06, + "loss": 0.1918, + "step": 1506 + }, + { + "epoch": 0.6022364217252396, + "grad_norm": 1.3757909830144823, + "learning_rate": 9.694565586591595e-06, + "loss": 0.1972, + "step": 1508 + }, + { + "epoch": 0.603035143769968, + "grad_norm": 1.3887040239577928, + "learning_rate": 9.692964161785885e-06, + "loss": 0.1861, + "step": 1510 + }, + { + "epoch": 0.6038338658146964, + "grad_norm": 1.1809219692093975, + "learning_rate": 9.691358682701927e-06, + "loss": 0.1718, + "step": 1512 + }, + { + "epoch": 0.604632587859425, + "grad_norm": 1.3982270520799365, + "learning_rate": 9.689749150726705e-06, + "loss": 0.1747, + "step": 1514 + }, + { + "epoch": 0.6054313099041534, + "grad_norm": 1.3379960842286045, + "learning_rate": 9.688135567250701e-06, + "loss": 0.1977, + "step": 1516 + }, + { + "epoch": 0.6062300319488818, + "grad_norm": 1.3650657458967188, + "learning_rate": 9.6865179336679e-06, + "loss": 0.234, + "step": 1518 + }, + { + "epoch": 0.6070287539936102, + "grad_norm": 1.220639149821231, + "learning_rate": 9.684896251375784e-06, + "loss": 0.1832, + "step": 1520 + }, + { + "epoch": 0.6078274760383386, + "grad_norm": 1.3205908355697185, + "learning_rate": 9.683270521775334e-06, + "loss": 0.1877, + "step": 1522 + }, + { + "epoch": 0.6086261980830671, + "grad_norm": 1.1383918655715137, + "learning_rate": 9.681640746271026e-06, + "loss": 0.1849, + "step": 1524 + }, + { + "epoch": 0.6094249201277955, + "grad_norm": 1.2899856170845478, + "learning_rate": 9.680006926270833e-06, + "loss": 0.1879, + "step": 1526 + }, + { + "epoch": 0.610223642172524, + "grad_norm": 1.3318559141238482, + "learning_rate": 9.678369063186224e-06, + "loss": 0.197, + "step": 1528 + }, + { + "epoch": 0.6110223642172524, + "grad_norm": 1.1812228890278815, + "learning_rate": 9.676727158432153e-06, + "loss": 0.1913, + "step": 1530 + }, + { + "epoch": 0.6118210862619808, + "grad_norm": 1.4859051313262472, + "learning_rate": 9.675081213427076e-06, + "loss": 0.2113, + "step": 1532 + }, + { + "epoch": 0.6126198083067093, + "grad_norm": 1.297766937816727, + "learning_rate": 9.673431229592928e-06, + "loss": 0.1929, + "step": 1534 + }, + { + "epoch": 0.6134185303514377, + "grad_norm": 1.2317306535256243, + "learning_rate": 9.671777208355146e-06, + "loss": 0.2108, + "step": 1536 + }, + { + "epoch": 0.6142172523961661, + "grad_norm": 1.2840356619037692, + "learning_rate": 9.670119151142644e-06, + "loss": 0.1853, + "step": 1538 + }, + { + "epoch": 0.6150159744408946, + "grad_norm": 1.2450276467812322, + "learning_rate": 9.668457059387828e-06, + "loss": 0.2088, + "step": 1540 + }, + { + "epoch": 0.615814696485623, + "grad_norm": 1.3269269430845416, + "learning_rate": 9.66679093452659e-06, + "loss": 0.2082, + "step": 1542 + }, + { + "epoch": 0.6166134185303515, + "grad_norm": 1.1360832482053789, + "learning_rate": 9.665120777998303e-06, + "loss": 0.1941, + "step": 1544 + }, + { + "epoch": 0.6174121405750799, + "grad_norm": 1.2899083518509165, + "learning_rate": 9.663446591245825e-06, + "loss": 0.1761, + "step": 1546 + }, + { + "epoch": 0.6182108626198083, + "grad_norm": 1.352813774180764, + "learning_rate": 9.661768375715493e-06, + "loss": 0.1922, + "step": 1548 + }, + { + "epoch": 0.6190095846645367, + "grad_norm": 1.2861565795443108, + "learning_rate": 9.660086132857132e-06, + "loss": 0.2052, + "step": 1550 + }, + { + "epoch": 0.6198083067092651, + "grad_norm": 1.5903917159408412, + "learning_rate": 9.658399864124037e-06, + "loss": 0.2057, + "step": 1552 + }, + { + "epoch": 0.6206070287539937, + "grad_norm": 1.30967345968437, + "learning_rate": 9.656709570972987e-06, + "loss": 0.1925, + "step": 1554 + }, + { + "epoch": 0.6214057507987221, + "grad_norm": 1.328489492989945, + "learning_rate": 9.655015254864236e-06, + "loss": 0.204, + "step": 1556 + }, + { + "epoch": 0.6222044728434505, + "grad_norm": 1.6209499481494345, + "learning_rate": 9.653316917261511e-06, + "loss": 0.2084, + "step": 1558 + }, + { + "epoch": 0.6230031948881789, + "grad_norm": 1.40907634938526, + "learning_rate": 9.65161455963202e-06, + "loss": 0.2028, + "step": 1560 + }, + { + "epoch": 0.6238019169329073, + "grad_norm": 1.1340460294386678, + "learning_rate": 9.649908183446432e-06, + "loss": 0.1703, + "step": 1562 + }, + { + "epoch": 0.6246006389776357, + "grad_norm": 1.2652621838230012, + "learning_rate": 9.648197790178902e-06, + "loss": 0.1775, + "step": 1564 + }, + { + "epoch": 0.6253993610223643, + "grad_norm": 1.1637259399366224, + "learning_rate": 9.646483381307047e-06, + "loss": 0.1786, + "step": 1566 + }, + { + "epoch": 0.6261980830670927, + "grad_norm": 1.3590258772922685, + "learning_rate": 9.64476495831195e-06, + "loss": 0.1957, + "step": 1568 + }, + { + "epoch": 0.6269968051118211, + "grad_norm": 1.5133394068843713, + "learning_rate": 9.643042522678172e-06, + "loss": 0.2086, + "step": 1570 + }, + { + "epoch": 0.6277955271565495, + "grad_norm": 1.5150111988589272, + "learning_rate": 9.641316075893731e-06, + "loss": 0.2025, + "step": 1572 + }, + { + "epoch": 0.6285942492012779, + "grad_norm": 1.2465479067467933, + "learning_rate": 9.639585619450116e-06, + "loss": 0.1887, + "step": 1574 + }, + { + "epoch": 0.6293929712460063, + "grad_norm": 1.2608367845246782, + "learning_rate": 9.637851154842279e-06, + "loss": 0.1908, + "step": 1576 + }, + { + "epoch": 0.6301916932907349, + "grad_norm": 1.228578420129268, + "learning_rate": 9.636112683568633e-06, + "loss": 0.182, + "step": 1578 + }, + { + "epoch": 0.6309904153354633, + "grad_norm": 1.2794906854021242, + "learning_rate": 9.63437020713105e-06, + "loss": 0.1891, + "step": 1580 + }, + { + "epoch": 0.6317891373801917, + "grad_norm": 1.2079726286437402, + "learning_rate": 9.632623727034868e-06, + "loss": 0.1918, + "step": 1582 + }, + { + "epoch": 0.6325878594249201, + "grad_norm": 1.4507684848320332, + "learning_rate": 9.630873244788884e-06, + "loss": 0.1928, + "step": 1584 + }, + { + "epoch": 0.6333865814696485, + "grad_norm": 1.3354253708604187, + "learning_rate": 9.629118761905343e-06, + "loss": 0.2098, + "step": 1586 + }, + { + "epoch": 0.634185303514377, + "grad_norm": 1.2516888814265885, + "learning_rate": 9.627360279899958e-06, + "loss": 0.1941, + "step": 1588 + }, + { + "epoch": 0.6349840255591054, + "grad_norm": 1.193887849956848, + "learning_rate": 9.62559780029189e-06, + "loss": 0.1811, + "step": 1590 + }, + { + "epoch": 0.6357827476038339, + "grad_norm": 1.2324593768790522, + "learning_rate": 9.623831324603755e-06, + "loss": 0.1896, + "step": 1592 + }, + { + "epoch": 0.6365814696485623, + "grad_norm": 1.252327759159766, + "learning_rate": 9.62206085436162e-06, + "loss": 0.2004, + "step": 1594 + }, + { + "epoch": 0.6373801916932907, + "grad_norm": 1.375448278876851, + "learning_rate": 9.620286391095004e-06, + "loss": 0.2213, + "step": 1596 + }, + { + "epoch": 0.6381789137380192, + "grad_norm": 1.2928127323852008, + "learning_rate": 9.618507936336878e-06, + "loss": 0.184, + "step": 1598 + }, + { + "epoch": 0.6389776357827476, + "grad_norm": 1.292316256258317, + "learning_rate": 9.61672549162366e-06, + "loss": 0.1974, + "step": 1600 + }, + { + "epoch": 0.639776357827476, + "grad_norm": 1.2009672868034973, + "learning_rate": 9.61493905849521e-06, + "loss": 0.1926, + "step": 1602 + }, + { + "epoch": 0.6405750798722045, + "grad_norm": 1.1528776664831937, + "learning_rate": 9.61314863849484e-06, + "loss": 0.1781, + "step": 1604 + }, + { + "epoch": 0.6413738019169329, + "grad_norm": 1.228036340284955, + "learning_rate": 9.611354233169305e-06, + "loss": 0.1712, + "step": 1606 + }, + { + "epoch": 0.6421725239616614, + "grad_norm": 1.2207441407251327, + "learning_rate": 9.6095558440688e-06, + "loss": 0.187, + "step": 1608 + }, + { + "epoch": 0.6429712460063898, + "grad_norm": 1.2828242186591854, + "learning_rate": 9.607753472746967e-06, + "loss": 0.1847, + "step": 1610 + }, + { + "epoch": 0.6437699680511182, + "grad_norm": 1.147124947259627, + "learning_rate": 9.605947120760878e-06, + "loss": 0.1555, + "step": 1612 + }, + { + "epoch": 0.6445686900958466, + "grad_norm": 1.1939260367535258, + "learning_rate": 9.604136789671056e-06, + "loss": 0.2005, + "step": 1614 + }, + { + "epoch": 0.645367412140575, + "grad_norm": 1.20685907752979, + "learning_rate": 9.602322481041457e-06, + "loss": 0.1927, + "step": 1616 + }, + { + "epoch": 0.6461661341853036, + "grad_norm": 1.2690850828036355, + "learning_rate": 9.600504196439468e-06, + "loss": 0.1789, + "step": 1618 + }, + { + "epoch": 0.646964856230032, + "grad_norm": 1.3543205528313849, + "learning_rate": 9.59868193743592e-06, + "loss": 0.22, + "step": 1620 + }, + { + "epoch": 0.6477635782747604, + "grad_norm": 1.3669603475853995, + "learning_rate": 9.596855705605069e-06, + "loss": 0.1948, + "step": 1622 + }, + { + "epoch": 0.6485623003194888, + "grad_norm": 1.3838109830343859, + "learning_rate": 9.595025502524609e-06, + "loss": 0.1992, + "step": 1624 + }, + { + "epoch": 0.6493610223642172, + "grad_norm": 1.1502765370043713, + "learning_rate": 9.593191329775663e-06, + "loss": 0.1778, + "step": 1626 + }, + { + "epoch": 0.6501597444089456, + "grad_norm": 1.2318516996914193, + "learning_rate": 9.591353188942782e-06, + "loss": 0.2073, + "step": 1628 + }, + { + "epoch": 0.6509584664536742, + "grad_norm": 1.6179688640460086, + "learning_rate": 9.589511081613947e-06, + "loss": 0.2033, + "step": 1630 + }, + { + "epoch": 0.6517571884984026, + "grad_norm": 1.1413033654758473, + "learning_rate": 9.587665009380565e-06, + "loss": 0.1859, + "step": 1632 + }, + { + "epoch": 0.652555910543131, + "grad_norm": 1.2785775485264639, + "learning_rate": 9.585814973837468e-06, + "loss": 0.1959, + "step": 1634 + }, + { + "epoch": 0.6533546325878594, + "grad_norm": 1.2901165952115163, + "learning_rate": 9.583960976582914e-06, + "loss": 0.1962, + "step": 1636 + }, + { + "epoch": 0.6541533546325878, + "grad_norm": 1.2679330444702808, + "learning_rate": 9.582103019218577e-06, + "loss": 0.1907, + "step": 1638 + }, + { + "epoch": 0.6549520766773163, + "grad_norm": 1.246571134661727, + "learning_rate": 9.580241103349562e-06, + "loss": 0.1712, + "step": 1640 + }, + { + "epoch": 0.6557507987220448, + "grad_norm": 1.342636088102212, + "learning_rate": 9.578375230584384e-06, + "loss": 0.1789, + "step": 1642 + }, + { + "epoch": 0.6565495207667732, + "grad_norm": 1.173038253799244, + "learning_rate": 9.576505402534984e-06, + "loss": 0.1717, + "step": 1644 + }, + { + "epoch": 0.6573482428115016, + "grad_norm": 1.2394488250110955, + "learning_rate": 9.574631620816718e-06, + "loss": 0.186, + "step": 1646 + }, + { + "epoch": 0.65814696485623, + "grad_norm": 1.4829342645126746, + "learning_rate": 9.572753887048353e-06, + "loss": 0.1965, + "step": 1648 + }, + { + "epoch": 0.6589456869009584, + "grad_norm": 1.2666969584636023, + "learning_rate": 9.570872202852077e-06, + "loss": 0.181, + "step": 1650 + }, + { + "epoch": 0.6597444089456869, + "grad_norm": 1.2265218244462477, + "learning_rate": 9.568986569853487e-06, + "loss": 0.1811, + "step": 1652 + }, + { + "epoch": 0.6605431309904153, + "grad_norm": 1.391040746962543, + "learning_rate": 9.56709698968159e-06, + "loss": 0.1955, + "step": 1654 + }, + { + "epoch": 0.6613418530351438, + "grad_norm": 1.2540103131383575, + "learning_rate": 9.565203463968808e-06, + "loss": 0.2158, + "step": 1656 + }, + { + "epoch": 0.6621405750798722, + "grad_norm": 1.218580930627416, + "learning_rate": 9.563305994350966e-06, + "loss": 0.2075, + "step": 1658 + }, + { + "epoch": 0.6629392971246006, + "grad_norm": 1.2969445843723761, + "learning_rate": 9.5614045824673e-06, + "loss": 0.174, + "step": 1660 + }, + { + "epoch": 0.6637380191693291, + "grad_norm": 1.2080482907459895, + "learning_rate": 9.55949922996045e-06, + "loss": 0.1718, + "step": 1662 + }, + { + "epoch": 0.6645367412140575, + "grad_norm": 1.3088014021283991, + "learning_rate": 9.557589938476462e-06, + "loss": 0.1958, + "step": 1664 + }, + { + "epoch": 0.6653354632587859, + "grad_norm": 1.2483532241185216, + "learning_rate": 9.555676709664783e-06, + "loss": 0.1667, + "step": 1666 + }, + { + "epoch": 0.6661341853035144, + "grad_norm": 1.4228507269979516, + "learning_rate": 9.55375954517826e-06, + "loss": 0.1887, + "step": 1668 + }, + { + "epoch": 0.6669329073482428, + "grad_norm": 1.426968884001837, + "learning_rate": 9.551838446673144e-06, + "loss": 0.1733, + "step": 1670 + }, + { + "epoch": 0.6677316293929713, + "grad_norm": 1.238164625762186, + "learning_rate": 9.549913415809084e-06, + "loss": 0.1909, + "step": 1672 + }, + { + "epoch": 0.6685303514376997, + "grad_norm": 1.1947296500798315, + "learning_rate": 9.547984454249125e-06, + "loss": 0.19, + "step": 1674 + }, + { + "epoch": 0.6693290734824281, + "grad_norm": 1.3096866424518478, + "learning_rate": 9.546051563659704e-06, + "loss": 0.1838, + "step": 1676 + }, + { + "epoch": 0.6701277955271565, + "grad_norm": 1.2325979346576457, + "learning_rate": 9.54411474571066e-06, + "loss": 0.213, + "step": 1678 + }, + { + "epoch": 0.670926517571885, + "grad_norm": 1.1989676471273152, + "learning_rate": 9.542174002075221e-06, + "loss": 0.1815, + "step": 1680 + }, + { + "epoch": 0.6717252396166135, + "grad_norm": 1.3573812014117548, + "learning_rate": 9.540229334430005e-06, + "loss": 0.1897, + "step": 1682 + }, + { + "epoch": 0.6725239616613419, + "grad_norm": 1.2474833071465166, + "learning_rate": 9.53828074445502e-06, + "loss": 0.1846, + "step": 1684 + }, + { + "epoch": 0.6733226837060703, + "grad_norm": 1.2370853532394706, + "learning_rate": 9.536328233833668e-06, + "loss": 0.1926, + "step": 1686 + }, + { + "epoch": 0.6741214057507987, + "grad_norm": 1.2326249171384345, + "learning_rate": 9.534371804252727e-06, + "loss": 0.1971, + "step": 1688 + }, + { + "epoch": 0.6749201277955271, + "grad_norm": 1.2299339970845358, + "learning_rate": 9.532411457402374e-06, + "loss": 0.1806, + "step": 1690 + }, + { + "epoch": 0.6757188498402555, + "grad_norm": 1.4027305493123479, + "learning_rate": 9.530447194976164e-06, + "loss": 0.1939, + "step": 1692 + }, + { + "epoch": 0.6765175718849841, + "grad_norm": 1.7551361158569634, + "learning_rate": 9.52847901867103e-06, + "loss": 0.2109, + "step": 1694 + }, + { + "epoch": 0.6773162939297125, + "grad_norm": 1.2968839868364699, + "learning_rate": 9.526506930187294e-06, + "loss": 0.1987, + "step": 1696 + }, + { + "epoch": 0.6781150159744409, + "grad_norm": 1.4078086349831407, + "learning_rate": 9.524530931228653e-06, + "loss": 0.1989, + "step": 1698 + }, + { + "epoch": 0.6789137380191693, + "grad_norm": 1.231887491149701, + "learning_rate": 9.522551023502183e-06, + "loss": 0.1718, + "step": 1700 + }, + { + "epoch": 0.6797124600638977, + "grad_norm": 1.4088570515681005, + "learning_rate": 9.520567208718337e-06, + "loss": 0.1791, + "step": 1702 + }, + { + "epoch": 0.6805111821086262, + "grad_norm": 1.4104721082557625, + "learning_rate": 9.518579488590947e-06, + "loss": 0.2051, + "step": 1704 + }, + { + "epoch": 0.6813099041533547, + "grad_norm": 1.4204595182317563, + "learning_rate": 9.516587864837213e-06, + "loss": 0.2222, + "step": 1706 + }, + { + "epoch": 0.6821086261980831, + "grad_norm": 1.3446005177751752, + "learning_rate": 9.51459233917771e-06, + "loss": 0.1923, + "step": 1708 + }, + { + "epoch": 0.6829073482428115, + "grad_norm": 1.3177179723391306, + "learning_rate": 9.512592913336385e-06, + "loss": 0.1911, + "step": 1710 + }, + { + "epoch": 0.6837060702875399, + "grad_norm": 1.3007833533147979, + "learning_rate": 9.510589589040554e-06, + "loss": 0.2003, + "step": 1712 + }, + { + "epoch": 0.6845047923322684, + "grad_norm": 1.3084903604314289, + "learning_rate": 9.508582368020897e-06, + "loss": 0.2068, + "step": 1714 + }, + { + "epoch": 0.6853035143769968, + "grad_norm": 1.2332676487219392, + "learning_rate": 9.506571252011467e-06, + "loss": 0.1809, + "step": 1716 + }, + { + "epoch": 0.6861022364217252, + "grad_norm": 1.4206295803871527, + "learning_rate": 9.504556242749677e-06, + "loss": 0.1879, + "step": 1718 + }, + { + "epoch": 0.6869009584664537, + "grad_norm": 1.4363115148587187, + "learning_rate": 9.502537341976305e-06, + "loss": 0.1955, + "step": 1720 + }, + { + "epoch": 0.6876996805111821, + "grad_norm": 1.3048263552086317, + "learning_rate": 9.500514551435491e-06, + "loss": 0.2062, + "step": 1722 + }, + { + "epoch": 0.6884984025559105, + "grad_norm": 1.4533257081032092, + "learning_rate": 9.498487872874735e-06, + "loss": 0.2065, + "step": 1724 + }, + { + "epoch": 0.689297124600639, + "grad_norm": 1.2029574969239287, + "learning_rate": 9.496457308044895e-06, + "loss": 0.1848, + "step": 1726 + }, + { + "epoch": 0.6900958466453674, + "grad_norm": 1.2074365689053979, + "learning_rate": 9.494422858700188e-06, + "loss": 0.1966, + "step": 1728 + }, + { + "epoch": 0.6908945686900958, + "grad_norm": 1.2900526030749346, + "learning_rate": 9.492384526598188e-06, + "loss": 0.1982, + "step": 1730 + }, + { + "epoch": 0.6916932907348243, + "grad_norm": 1.2246323016242, + "learning_rate": 9.49034231349982e-06, + "loss": 0.1981, + "step": 1732 + }, + { + "epoch": 0.6924920127795527, + "grad_norm": 1.2646452794419145, + "learning_rate": 9.488296221169363e-06, + "loss": 0.1748, + "step": 1734 + }, + { + "epoch": 0.6932907348242812, + "grad_norm": 1.2901816109550333, + "learning_rate": 9.48624625137445e-06, + "loss": 0.2003, + "step": 1736 + }, + { + "epoch": 0.6940894568690096, + "grad_norm": 1.236520057555359, + "learning_rate": 9.484192405886058e-06, + "loss": 0.1943, + "step": 1738 + }, + { + "epoch": 0.694888178913738, + "grad_norm": 1.3841072722578425, + "learning_rate": 9.48213468647852e-06, + "loss": 0.2006, + "step": 1740 + }, + { + "epoch": 0.6956869009584664, + "grad_norm": 1.2637297258797011, + "learning_rate": 9.480073094929507e-06, + "loss": 0.1508, + "step": 1742 + }, + { + "epoch": 0.6964856230031949, + "grad_norm": 1.3190153343742, + "learning_rate": 9.478007633020043e-06, + "loss": 0.1791, + "step": 1744 + }, + { + "epoch": 0.6972843450479234, + "grad_norm": 1.2377929664861391, + "learning_rate": 9.47593830253449e-06, + "loss": 0.1919, + "step": 1746 + }, + { + "epoch": 0.6980830670926518, + "grad_norm": 1.2951377520887053, + "learning_rate": 9.473865105260556e-06, + "loss": 0.1931, + "step": 1748 + }, + { + "epoch": 0.6988817891373802, + "grad_norm": 1.203360302375175, + "learning_rate": 9.471788042989285e-06, + "loss": 0.1628, + "step": 1750 + }, + { + "epoch": 0.6996805111821086, + "grad_norm": 1.3053684785835005, + "learning_rate": 9.469707117515068e-06, + "loss": 0.197, + "step": 1752 + }, + { + "epoch": 0.700479233226837, + "grad_norm": 1.1639750404721112, + "learning_rate": 9.467622330635622e-06, + "loss": 0.1811, + "step": 1754 + }, + { + "epoch": 0.7012779552715654, + "grad_norm": 1.3446647639072014, + "learning_rate": 9.465533684152011e-06, + "loss": 0.1931, + "step": 1756 + }, + { + "epoch": 0.702076677316294, + "grad_norm": 1.336079168507431, + "learning_rate": 9.463441179868626e-06, + "loss": 0.1804, + "step": 1758 + }, + { + "epoch": 0.7028753993610224, + "grad_norm": 1.210643436064521, + "learning_rate": 9.461344819593194e-06, + "loss": 0.1948, + "step": 1760 + }, + { + "epoch": 0.7036741214057508, + "grad_norm": 1.2291459699844245, + "learning_rate": 9.459244605136775e-06, + "loss": 0.1854, + "step": 1762 + }, + { + "epoch": 0.7044728434504792, + "grad_norm": 1.5941109492311214, + "learning_rate": 9.45714053831375e-06, + "loss": 0.2128, + "step": 1764 + }, + { + "epoch": 0.7052715654952076, + "grad_norm": 1.1893124518881097, + "learning_rate": 9.45503262094184e-06, + "loss": 0.1735, + "step": 1766 + }, + { + "epoch": 0.7060702875399361, + "grad_norm": 1.244259188061574, + "learning_rate": 9.452920854842085e-06, + "loss": 0.203, + "step": 1768 + }, + { + "epoch": 0.7068690095846646, + "grad_norm": 1.1591101687705918, + "learning_rate": 9.45080524183885e-06, + "loss": 0.1931, + "step": 1770 + }, + { + "epoch": 0.707667731629393, + "grad_norm": 1.185305488469298, + "learning_rate": 9.448685783759825e-06, + "loss": 0.1826, + "step": 1772 + }, + { + "epoch": 0.7084664536741214, + "grad_norm": 1.3935670621941678, + "learning_rate": 9.446562482436026e-06, + "loss": 0.1882, + "step": 1774 + }, + { + "epoch": 0.7092651757188498, + "grad_norm": 1.3143460322695941, + "learning_rate": 9.44443533970178e-06, + "loss": 0.1815, + "step": 1776 + }, + { + "epoch": 0.7100638977635783, + "grad_norm": 1.3400068262690805, + "learning_rate": 9.442304357394741e-06, + "loss": 0.1968, + "step": 1778 + }, + { + "epoch": 0.7108626198083067, + "grad_norm": 1.2627252602067203, + "learning_rate": 9.440169537355874e-06, + "loss": 0.1837, + "step": 1780 + }, + { + "epoch": 0.7116613418530351, + "grad_norm": 1.1054955641627056, + "learning_rate": 9.438030881429465e-06, + "loss": 0.1603, + "step": 1782 + }, + { + "epoch": 0.7124600638977636, + "grad_norm": 1.3680070238884574, + "learning_rate": 9.435888391463108e-06, + "loss": 0.1996, + "step": 1784 + }, + { + "epoch": 0.713258785942492, + "grad_norm": 1.297397045088257, + "learning_rate": 9.433742069307714e-06, + "loss": 0.1987, + "step": 1786 + }, + { + "epoch": 0.7140575079872205, + "grad_norm": 1.2477413014257834, + "learning_rate": 9.431591916817503e-06, + "loss": 0.1821, + "step": 1788 + }, + { + "epoch": 0.7148562300319489, + "grad_norm": 1.3162258234912159, + "learning_rate": 9.429437935850003e-06, + "loss": 0.2119, + "step": 1790 + }, + { + "epoch": 0.7156549520766773, + "grad_norm": 1.215098930469111, + "learning_rate": 9.427280128266049e-06, + "loss": 0.1842, + "step": 1792 + }, + { + "epoch": 0.7164536741214057, + "grad_norm": 1.2140728281775635, + "learning_rate": 9.425118495929788e-06, + "loss": 0.1813, + "step": 1794 + }, + { + "epoch": 0.7172523961661342, + "grad_norm": 1.0941868316938037, + "learning_rate": 9.422953040708662e-06, + "loss": 0.1836, + "step": 1796 + }, + { + "epoch": 0.7180511182108626, + "grad_norm": 1.2213745174193513, + "learning_rate": 9.420783764473418e-06, + "loss": 0.1902, + "step": 1798 + }, + { + "epoch": 0.7188498402555911, + "grad_norm": 1.1225238944421074, + "learning_rate": 9.418610669098114e-06, + "loss": 0.1924, + "step": 1800 + }, + { + "epoch": 0.7196485623003195, + "grad_norm": 1.194219981117358, + "learning_rate": 9.41643375646009e-06, + "loss": 0.1901, + "step": 1802 + }, + { + "epoch": 0.7204472843450479, + "grad_norm": 1.405501209107951, + "learning_rate": 9.41425302844e-06, + "loss": 0.1962, + "step": 1804 + }, + { + "epoch": 0.7212460063897763, + "grad_norm": 1.2754008408470467, + "learning_rate": 9.412068486921786e-06, + "loss": 0.1981, + "step": 1806 + }, + { + "epoch": 0.7220447284345048, + "grad_norm": 1.153012769564869, + "learning_rate": 9.409880133792684e-06, + "loss": 0.1601, + "step": 1808 + }, + { + "epoch": 0.7228434504792333, + "grad_norm": 1.412330054040255, + "learning_rate": 9.407687970943223e-06, + "loss": 0.2015, + "step": 1810 + }, + { + "epoch": 0.7236421725239617, + "grad_norm": 1.2512273615084581, + "learning_rate": 9.405492000267228e-06, + "loss": 0.162, + "step": 1812 + }, + { + "epoch": 0.7244408945686901, + "grad_norm": 1.3404549541893849, + "learning_rate": 9.403292223661811e-06, + "loss": 0.1858, + "step": 1814 + }, + { + "epoch": 0.7252396166134185, + "grad_norm": 1.2882725353355315, + "learning_rate": 9.40108864302737e-06, + "loss": 0.1918, + "step": 1816 + }, + { + "epoch": 0.7260383386581469, + "grad_norm": 1.2513310169482343, + "learning_rate": 9.398881260267589e-06, + "loss": 0.1998, + "step": 1818 + }, + { + "epoch": 0.7268370607028753, + "grad_norm": 1.337658331922668, + "learning_rate": 9.396670077289443e-06, + "loss": 0.1932, + "step": 1820 + }, + { + "epoch": 0.7276357827476039, + "grad_norm": 1.2367717186372091, + "learning_rate": 9.394455096003182e-06, + "loss": 0.2068, + "step": 1822 + }, + { + "epoch": 0.7284345047923323, + "grad_norm": 1.1564610765311962, + "learning_rate": 9.392236318322339e-06, + "loss": 0.1801, + "step": 1824 + }, + { + "epoch": 0.7292332268370607, + "grad_norm": 1.2112589486368208, + "learning_rate": 9.390013746163733e-06, + "loss": 0.1837, + "step": 1826 + }, + { + "epoch": 0.7300319488817891, + "grad_norm": 1.254583211491624, + "learning_rate": 9.387787381447455e-06, + "loss": 0.2105, + "step": 1828 + }, + { + "epoch": 0.7308306709265175, + "grad_norm": 1.1680394605252384, + "learning_rate": 9.385557226096873e-06, + "loss": 0.1933, + "step": 1830 + }, + { + "epoch": 0.731629392971246, + "grad_norm": 1.1210837547747836, + "learning_rate": 9.383323282038632e-06, + "loss": 0.1847, + "step": 1832 + }, + { + "epoch": 0.7324281150159745, + "grad_norm": 1.2069663644377342, + "learning_rate": 9.381085551202648e-06, + "loss": 0.1968, + "step": 1834 + }, + { + "epoch": 0.7332268370607029, + "grad_norm": 1.2372918873896177, + "learning_rate": 9.378844035522112e-06, + "loss": 0.1969, + "step": 1836 + }, + { + "epoch": 0.7340255591054313, + "grad_norm": 1.1846330324034746, + "learning_rate": 9.376598736933478e-06, + "loss": 0.191, + "step": 1838 + }, + { + "epoch": 0.7348242811501597, + "grad_norm": 1.1755331433603171, + "learning_rate": 9.374349657376473e-06, + "loss": 0.1647, + "step": 1840 + }, + { + "epoch": 0.7356230031948882, + "grad_norm": 1.2996956651624354, + "learning_rate": 9.372096798794093e-06, + "loss": 0.182, + "step": 1842 + }, + { + "epoch": 0.7364217252396166, + "grad_norm": 1.2224015710817062, + "learning_rate": 9.36984016313259e-06, + "loss": 0.1732, + "step": 1844 + }, + { + "epoch": 0.737220447284345, + "grad_norm": 1.196928762655955, + "learning_rate": 9.367579752341488e-06, + "loss": 0.1863, + "step": 1846 + }, + { + "epoch": 0.7380191693290735, + "grad_norm": 1.2553608491551194, + "learning_rate": 9.365315568373569e-06, + "loss": 0.1825, + "step": 1848 + }, + { + "epoch": 0.7388178913738019, + "grad_norm": 1.1728882445017346, + "learning_rate": 9.363047613184872e-06, + "loss": 0.1767, + "step": 1850 + }, + { + "epoch": 0.7396166134185304, + "grad_norm": 1.179558407608193, + "learning_rate": 9.360775888734699e-06, + "loss": 0.1862, + "step": 1852 + }, + { + "epoch": 0.7404153354632588, + "grad_norm": 1.1401251188622028, + "learning_rate": 9.358500396985603e-06, + "loss": 0.1836, + "step": 1854 + }, + { + "epoch": 0.7412140575079872, + "grad_norm": 1.1844349399914762, + "learning_rate": 9.356221139903395e-06, + "loss": 0.1885, + "step": 1856 + }, + { + "epoch": 0.7420127795527156, + "grad_norm": 1.2044582641333985, + "learning_rate": 9.353938119457137e-06, + "loss": 0.1865, + "step": 1858 + }, + { + "epoch": 0.7428115015974441, + "grad_norm": 1.2685669411269742, + "learning_rate": 9.351651337619145e-06, + "loss": 0.1754, + "step": 1860 + }, + { + "epoch": 0.7436102236421726, + "grad_norm": 1.4085265142769048, + "learning_rate": 9.349360796364984e-06, + "loss": 0.1972, + "step": 1862 + }, + { + "epoch": 0.744408945686901, + "grad_norm": 1.3358324906099626, + "learning_rate": 9.347066497673462e-06, + "loss": 0.1934, + "step": 1864 + }, + { + "epoch": 0.7452076677316294, + "grad_norm": 1.1473591638995988, + "learning_rate": 9.34476844352664e-06, + "loss": 0.1756, + "step": 1866 + }, + { + "epoch": 0.7460063897763578, + "grad_norm": 1.1732315132551707, + "learning_rate": 9.342466635909815e-06, + "loss": 0.1937, + "step": 1868 + }, + { + "epoch": 0.7468051118210862, + "grad_norm": 1.0907285192894114, + "learning_rate": 9.340161076811539e-06, + "loss": 0.1769, + "step": 1870 + }, + { + "epoch": 0.7476038338658147, + "grad_norm": 1.3140788804501988, + "learning_rate": 9.337851768223589e-06, + "loss": 0.2101, + "step": 1872 + }, + { + "epoch": 0.7484025559105432, + "grad_norm": 1.2720702400836585, + "learning_rate": 9.335538712140997e-06, + "loss": 0.1755, + "step": 1874 + }, + { + "epoch": 0.7492012779552716, + "grad_norm": 1.1203884095157366, + "learning_rate": 9.333221910562022e-06, + "loss": 0.1645, + "step": 1876 + }, + { + "epoch": 0.75, + "grad_norm": 1.3590498055976206, + "learning_rate": 9.330901365488163e-06, + "loss": 0.1898, + "step": 1878 + }, + { + "epoch": 0.7507987220447284, + "grad_norm": 1.2561659399741483, + "learning_rate": 9.328577078924151e-06, + "loss": 0.2081, + "step": 1880 + }, + { + "epoch": 0.7515974440894568, + "grad_norm": 1.5567261987394814, + "learning_rate": 9.326249052877949e-06, + "loss": 0.1899, + "step": 1882 + }, + { + "epoch": 0.7523961661341853, + "grad_norm": 1.10938481898858, + "learning_rate": 9.323917289360755e-06, + "loss": 0.1822, + "step": 1884 + }, + { + "epoch": 0.7531948881789138, + "grad_norm": 1.199525742926509, + "learning_rate": 9.321581790386989e-06, + "loss": 0.1838, + "step": 1886 + }, + { + "epoch": 0.7539936102236422, + "grad_norm": 1.2183809794152496, + "learning_rate": 9.319242557974306e-06, + "loss": 0.1813, + "step": 1888 + }, + { + "epoch": 0.7547923322683706, + "grad_norm": 1.3361557842251508, + "learning_rate": 9.316899594143581e-06, + "loss": 0.1923, + "step": 1890 + }, + { + "epoch": 0.755591054313099, + "grad_norm": 1.147280166061991, + "learning_rate": 9.31455290091891e-06, + "loss": 0.1681, + "step": 1892 + }, + { + "epoch": 0.7563897763578274, + "grad_norm": 1.2035996373756572, + "learning_rate": 9.31220248032762e-06, + "loss": 0.1752, + "step": 1894 + }, + { + "epoch": 0.7571884984025559, + "grad_norm": 1.2112026467710946, + "learning_rate": 9.309848334400247e-06, + "loss": 0.1862, + "step": 1896 + }, + { + "epoch": 0.7579872204472844, + "grad_norm": 1.345396574129663, + "learning_rate": 9.307490465170555e-06, + "loss": 0.2152, + "step": 1898 + }, + { + "epoch": 0.7587859424920128, + "grad_norm": 1.1731055253334703, + "learning_rate": 9.30512887467552e-06, + "loss": 0.1845, + "step": 1900 + }, + { + "epoch": 0.7595846645367412, + "grad_norm": 1.2248021789089425, + "learning_rate": 9.302763564955332e-06, + "loss": 0.1896, + "step": 1902 + }, + { + "epoch": 0.7603833865814696, + "grad_norm": 1.2159843127230634, + "learning_rate": 9.300394538053395e-06, + "loss": 0.2073, + "step": 1904 + }, + { + "epoch": 0.7611821086261981, + "grad_norm": 1.0782210847536804, + "learning_rate": 9.298021796016328e-06, + "loss": 0.1965, + "step": 1906 + }, + { + "epoch": 0.7619808306709265, + "grad_norm": 1.0674705155900905, + "learning_rate": 9.295645340893954e-06, + "loss": 0.1828, + "step": 1908 + }, + { + "epoch": 0.762779552715655, + "grad_norm": 1.242030763933389, + "learning_rate": 9.293265174739304e-06, + "loss": 0.185, + "step": 1910 + }, + { + "epoch": 0.7635782747603834, + "grad_norm": 1.2869300880171743, + "learning_rate": 9.29088129960862e-06, + "loss": 0.2072, + "step": 1912 + }, + { + "epoch": 0.7643769968051118, + "grad_norm": 1.2987640167915715, + "learning_rate": 9.288493717561346e-06, + "loss": 0.1797, + "step": 1914 + }, + { + "epoch": 0.7651757188498403, + "grad_norm": 1.360583270135005, + "learning_rate": 9.286102430660124e-06, + "loss": 0.1915, + "step": 1916 + }, + { + "epoch": 0.7659744408945687, + "grad_norm": 1.1308395049372801, + "learning_rate": 9.283707440970804e-06, + "loss": 0.1732, + "step": 1918 + }, + { + "epoch": 0.7667731629392971, + "grad_norm": 1.1740116097009, + "learning_rate": 9.281308750562426e-06, + "loss": 0.1954, + "step": 1920 + }, + { + "epoch": 0.7675718849840255, + "grad_norm": 1.2021810735396423, + "learning_rate": 9.278906361507238e-06, + "loss": 0.1941, + "step": 1922 + }, + { + "epoch": 0.768370607028754, + "grad_norm": 1.3054645933982225, + "learning_rate": 9.276500275880676e-06, + "loss": 0.2025, + "step": 1924 + }, + { + "epoch": 0.7691693290734825, + "grad_norm": 1.2735690354152704, + "learning_rate": 9.274090495761368e-06, + "loss": 0.1977, + "step": 1926 + }, + { + "epoch": 0.7699680511182109, + "grad_norm": 1.0912094717996301, + "learning_rate": 9.271677023231137e-06, + "loss": 0.1918, + "step": 1928 + }, + { + "epoch": 0.7707667731629393, + "grad_norm": 1.2385601982012775, + "learning_rate": 9.269259860375001e-06, + "loss": 0.1853, + "step": 1930 + }, + { + "epoch": 0.7715654952076677, + "grad_norm": 1.2615706597096068, + "learning_rate": 9.266839009281154e-06, + "loss": 0.1775, + "step": 1932 + }, + { + "epoch": 0.7723642172523961, + "grad_norm": 1.2657440936466628, + "learning_rate": 9.264414472040986e-06, + "loss": 0.2122, + "step": 1934 + }, + { + "epoch": 0.7731629392971247, + "grad_norm": 1.2434750494740934, + "learning_rate": 9.261986250749068e-06, + "loss": 0.1642, + "step": 1936 + }, + { + "epoch": 0.7739616613418531, + "grad_norm": 1.2150467984561792, + "learning_rate": 9.259554347503157e-06, + "loss": 0.179, + "step": 1938 + }, + { + "epoch": 0.7747603833865815, + "grad_norm": 1.0460909251749075, + "learning_rate": 9.257118764404183e-06, + "loss": 0.1766, + "step": 1940 + }, + { + "epoch": 0.7755591054313099, + "grad_norm": 1.2648684414877398, + "learning_rate": 9.254679503556261e-06, + "loss": 0.1873, + "step": 1942 + }, + { + "epoch": 0.7763578274760383, + "grad_norm": 1.0629676893025566, + "learning_rate": 9.252236567066686e-06, + "loss": 0.1765, + "step": 1944 + }, + { + "epoch": 0.7771565495207667, + "grad_norm": 1.1476890935760726, + "learning_rate": 9.249789957045921e-06, + "loss": 0.1828, + "step": 1946 + }, + { + "epoch": 0.7779552715654952, + "grad_norm": 1.255044114071295, + "learning_rate": 9.247339675607606e-06, + "loss": 0.1962, + "step": 1948 + }, + { + "epoch": 0.7787539936102237, + "grad_norm": 1.1772586864322332, + "learning_rate": 9.244885724868556e-06, + "loss": 0.1948, + "step": 1950 + }, + { + "epoch": 0.7795527156549521, + "grad_norm": 1.134211924482269, + "learning_rate": 9.242428106948748e-06, + "loss": 0.1797, + "step": 1952 + }, + { + "epoch": 0.7803514376996805, + "grad_norm": 1.1580694174114012, + "learning_rate": 9.239966823971339e-06, + "loss": 0.1818, + "step": 1954 + }, + { + "epoch": 0.7811501597444089, + "grad_norm": 1.3621884578061603, + "learning_rate": 9.23750187806264e-06, + "loss": 0.2012, + "step": 1956 + }, + { + "epoch": 0.7819488817891374, + "grad_norm": 1.107597745587062, + "learning_rate": 9.235033271352132e-06, + "loss": 0.1816, + "step": 1958 + }, + { + "epoch": 0.7827476038338658, + "grad_norm": 1.1104284440920194, + "learning_rate": 9.23256100597246e-06, + "loss": 0.1748, + "step": 1960 + }, + { + "epoch": 0.7835463258785943, + "grad_norm": 1.0772833846504462, + "learning_rate": 9.230085084059428e-06, + "loss": 0.1577, + "step": 1962 + }, + { + "epoch": 0.7843450479233227, + "grad_norm": 1.5348516969940666, + "learning_rate": 9.227605507751998e-06, + "loss": 0.1797, + "step": 1964 + }, + { + "epoch": 0.7851437699680511, + "grad_norm": 1.1556035368347786, + "learning_rate": 9.22512227919229e-06, + "loss": 0.1904, + "step": 1966 + }, + { + "epoch": 0.7859424920127795, + "grad_norm": 1.1541070739348673, + "learning_rate": 9.22263540052558e-06, + "loss": 0.1629, + "step": 1968 + }, + { + "epoch": 0.786741214057508, + "grad_norm": 1.1977607582272027, + "learning_rate": 9.220144873900294e-06, + "loss": 0.1852, + "step": 1970 + }, + { + "epoch": 0.7875399361022364, + "grad_norm": 1.2010602313619996, + "learning_rate": 9.217650701468016e-06, + "loss": 0.167, + "step": 1972 + }, + { + "epoch": 0.7883386581469649, + "grad_norm": 1.2177543309257506, + "learning_rate": 9.215152885383473e-06, + "loss": 0.1785, + "step": 1974 + }, + { + "epoch": 0.7891373801916933, + "grad_norm": 1.3217713389862689, + "learning_rate": 9.212651427804544e-06, + "loss": 0.2005, + "step": 1976 + }, + { + "epoch": 0.7899361022364217, + "grad_norm": 1.2318521295090967, + "learning_rate": 9.210146330892251e-06, + "loss": 0.1804, + "step": 1978 + }, + { + "epoch": 0.7907348242811502, + "grad_norm": 1.2625115330731185, + "learning_rate": 9.20763759681076e-06, + "loss": 0.1648, + "step": 1980 + }, + { + "epoch": 0.7915335463258786, + "grad_norm": 1.1952075781633953, + "learning_rate": 9.205125227727386e-06, + "loss": 0.1753, + "step": 1982 + }, + { + "epoch": 0.792332268370607, + "grad_norm": 1.2767418757560534, + "learning_rate": 9.202609225812572e-06, + "loss": 0.1694, + "step": 1984 + }, + { + "epoch": 0.7931309904153354, + "grad_norm": 1.2452947191655441, + "learning_rate": 9.200089593239911e-06, + "loss": 0.2021, + "step": 1986 + }, + { + "epoch": 0.7939297124600639, + "grad_norm": 1.2365293461357185, + "learning_rate": 9.197566332186125e-06, + "loss": 0.1858, + "step": 1988 + }, + { + "epoch": 0.7947284345047924, + "grad_norm": 1.2036375025069872, + "learning_rate": 9.195039444831076e-06, + "loss": 0.1957, + "step": 1990 + }, + { + "epoch": 0.7955271565495208, + "grad_norm": 1.2642281456239493, + "learning_rate": 9.192508933357753e-06, + "loss": 0.1764, + "step": 1992 + }, + { + "epoch": 0.7963258785942492, + "grad_norm": 1.1075926832298097, + "learning_rate": 9.189974799952283e-06, + "loss": 0.1696, + "step": 1994 + }, + { + "epoch": 0.7971246006389776, + "grad_norm": 1.1441930453048488, + "learning_rate": 9.187437046803916e-06, + "loss": 0.1882, + "step": 1996 + }, + { + "epoch": 0.797923322683706, + "grad_norm": 1.1610488689052545, + "learning_rate": 9.184895676105033e-06, + "loss": 0.1844, + "step": 1998 + }, + { + "epoch": 0.7987220447284346, + "grad_norm": 1.2457291872936878, + "learning_rate": 9.182350690051134e-06, + "loss": 0.187, + "step": 2000 + }, + { + "epoch": 0.7987220447284346, + "eval_loss": 0.16647948324680328, + "eval_runtime": 417.9228, + "eval_samples_per_second": 42.608, + "eval_steps_per_second": 5.326, + "step": 2000 + }, + { + "epoch": 0.799520766773163, + "grad_norm": 1.1216999386193873, + "learning_rate": 9.179802090840852e-06, + "loss": 0.1554, + "step": 2002 + }, + { + "epoch": 0.8003194888178914, + "grad_norm": 1.2327864947476759, + "learning_rate": 9.177249880675934e-06, + "loss": 0.19, + "step": 2004 + }, + { + "epoch": 0.8011182108626198, + "grad_norm": 1.1765118852291059, + "learning_rate": 9.174694061761249e-06, + "loss": 0.1752, + "step": 2006 + }, + { + "epoch": 0.8019169329073482, + "grad_norm": 1.3929323793396107, + "learning_rate": 9.172134636304783e-06, + "loss": 0.197, + "step": 2008 + }, + { + "epoch": 0.8027156549520766, + "grad_norm": 1.1116097957166795, + "learning_rate": 9.169571606517637e-06, + "loss": 0.1821, + "step": 2010 + }, + { + "epoch": 0.8035143769968051, + "grad_norm": 1.1758254974846156, + "learning_rate": 9.16700497461403e-06, + "loss": 0.1801, + "step": 2012 + }, + { + "epoch": 0.8043130990415336, + "grad_norm": 1.2306371246505314, + "learning_rate": 9.164434742811287e-06, + "loss": 0.1888, + "step": 2014 + }, + { + "epoch": 0.805111821086262, + "grad_norm": 8.822301554752972, + "learning_rate": 9.16186091332985e-06, + "loss": 0.2123, + "step": 2016 + }, + { + "epoch": 0.8059105431309904, + "grad_norm": 3.5201171269663365, + "learning_rate": 9.159283488393259e-06, + "loss": 0.1897, + "step": 2018 + }, + { + "epoch": 0.8067092651757188, + "grad_norm": 1.5889480805348766, + "learning_rate": 9.15670247022817e-06, + "loss": 0.1948, + "step": 2020 + }, + { + "epoch": 0.8075079872204473, + "grad_norm": 1.2452616237787566, + "learning_rate": 9.154117861064337e-06, + "loss": 0.1736, + "step": 2022 + }, + { + "epoch": 0.8083067092651757, + "grad_norm": 1.1779378285841855, + "learning_rate": 9.15152966313462e-06, + "loss": 0.1949, + "step": 2024 + }, + { + "epoch": 0.8091054313099042, + "grad_norm": 1.1749802960279865, + "learning_rate": 9.148937878674975e-06, + "loss": 0.1931, + "step": 2026 + }, + { + "epoch": 0.8099041533546326, + "grad_norm": 1.1577483816469516, + "learning_rate": 9.146342509924464e-06, + "loss": 0.1761, + "step": 2028 + }, + { + "epoch": 0.810702875399361, + "grad_norm": 1.1493255720503592, + "learning_rate": 9.143743559125238e-06, + "loss": 0.2112, + "step": 2030 + }, + { + "epoch": 0.8115015974440895, + "grad_norm": 1.1867113395597348, + "learning_rate": 9.141141028522544e-06, + "loss": 0.1871, + "step": 2032 + }, + { + "epoch": 0.8123003194888179, + "grad_norm": 1.2097756605247039, + "learning_rate": 9.138534920364725e-06, + "loss": 0.1895, + "step": 2034 + }, + { + "epoch": 0.8130990415335463, + "grad_norm": 1.2656565196236773, + "learning_rate": 9.135925236903213e-06, + "loss": 0.1931, + "step": 2036 + }, + { + "epoch": 0.8138977635782748, + "grad_norm": 1.1498073348190923, + "learning_rate": 9.133311980392525e-06, + "loss": 0.1622, + "step": 2038 + }, + { + "epoch": 0.8146964856230032, + "grad_norm": 1.1313101324027541, + "learning_rate": 9.130695153090272e-06, + "loss": 0.1909, + "step": 2040 + }, + { + "epoch": 0.8154952076677316, + "grad_norm": 1.2312685403966457, + "learning_rate": 9.128074757257142e-06, + "loss": 0.2096, + "step": 2042 + }, + { + "epoch": 0.8162939297124601, + "grad_norm": 1.1681617587331703, + "learning_rate": 9.125450795156913e-06, + "loss": 0.168, + "step": 2044 + }, + { + "epoch": 0.8170926517571885, + "grad_norm": 1.1768723977340287, + "learning_rate": 9.12282326905644e-06, + "loss": 0.1889, + "step": 2046 + }, + { + "epoch": 0.8178913738019169, + "grad_norm": 1.0800467032397891, + "learning_rate": 9.120192181225658e-06, + "loss": 0.1948, + "step": 2048 + }, + { + "epoch": 0.8186900958466453, + "grad_norm": 1.1247061859219583, + "learning_rate": 9.117557533937575e-06, + "loss": 0.1713, + "step": 2050 + }, + { + "epoch": 0.8194888178913738, + "grad_norm": 1.172438969339138, + "learning_rate": 9.114919329468283e-06, + "loss": 0.162, + "step": 2052 + }, + { + "epoch": 0.8202875399361023, + "grad_norm": 1.2278975843705813, + "learning_rate": 9.112277570096938e-06, + "loss": 0.1865, + "step": 2054 + }, + { + "epoch": 0.8210862619808307, + "grad_norm": 1.2992348324926213, + "learning_rate": 9.109632258105771e-06, + "loss": 0.2055, + "step": 2056 + }, + { + "epoch": 0.8218849840255591, + "grad_norm": 1.251504510998654, + "learning_rate": 9.106983395780086e-06, + "loss": 0.1843, + "step": 2058 + }, + { + "epoch": 0.8226837060702875, + "grad_norm": 1.276208869165608, + "learning_rate": 9.104330985408245e-06, + "loss": 0.1886, + "step": 2060 + }, + { + "epoch": 0.8234824281150159, + "grad_norm": 1.356357122339326, + "learning_rate": 9.101675029281683e-06, + "loss": 0.1981, + "step": 2062 + }, + { + "epoch": 0.8242811501597445, + "grad_norm": 1.2399736943434136, + "learning_rate": 9.099015529694894e-06, + "loss": 0.1897, + "step": 2064 + }, + { + "epoch": 0.8250798722044729, + "grad_norm": 1.2876220526804583, + "learning_rate": 9.096352488945437e-06, + "loss": 0.1796, + "step": 2066 + }, + { + "epoch": 0.8258785942492013, + "grad_norm": 1.2532320467708493, + "learning_rate": 9.093685909333926e-06, + "loss": 0.1788, + "step": 2068 + }, + { + "epoch": 0.8266773162939297, + "grad_norm": 1.2586518550619221, + "learning_rate": 9.091015793164035e-06, + "loss": 0.1768, + "step": 2070 + }, + { + "epoch": 0.8274760383386581, + "grad_norm": 1.1394590760675556, + "learning_rate": 9.088342142742493e-06, + "loss": 0.1741, + "step": 2072 + }, + { + "epoch": 0.8282747603833865, + "grad_norm": 1.09902223120881, + "learning_rate": 9.08566496037908e-06, + "loss": 0.1749, + "step": 2074 + }, + { + "epoch": 0.829073482428115, + "grad_norm": 1.094791391856433, + "learning_rate": 9.08298424838663e-06, + "loss": 0.1785, + "step": 2076 + }, + { + "epoch": 0.8298722044728435, + "grad_norm": 1.3016717542317755, + "learning_rate": 9.080300009081025e-06, + "loss": 0.2058, + "step": 2078 + }, + { + "epoch": 0.8306709265175719, + "grad_norm": 1.2437203627775957, + "learning_rate": 9.077612244781196e-06, + "loss": 0.18, + "step": 2080 + }, + { + "epoch": 0.8314696485623003, + "grad_norm": 1.1494190321851014, + "learning_rate": 9.074920957809115e-06, + "loss": 0.1993, + "step": 2082 + }, + { + "epoch": 0.8322683706070287, + "grad_norm": 1.197846354885409, + "learning_rate": 9.0722261504898e-06, + "loss": 0.1924, + "step": 2084 + }, + { + "epoch": 0.8330670926517572, + "grad_norm": 1.2305171514008204, + "learning_rate": 9.069527825151314e-06, + "loss": 0.2045, + "step": 2086 + }, + { + "epoch": 0.8338658146964856, + "grad_norm": 1.1363998111576086, + "learning_rate": 9.066825984124751e-06, + "loss": 0.1878, + "step": 2088 + }, + { + "epoch": 0.8346645367412141, + "grad_norm": 1.2416863327291112, + "learning_rate": 9.064120629744253e-06, + "loss": 0.187, + "step": 2090 + }, + { + "epoch": 0.8354632587859425, + "grad_norm": 1.1003807898549165, + "learning_rate": 9.061411764346983e-06, + "loss": 0.1727, + "step": 2092 + }, + { + "epoch": 0.8362619808306709, + "grad_norm": 1.1391607571000586, + "learning_rate": 9.05869939027315e-06, + "loss": 0.1792, + "step": 2094 + }, + { + "epoch": 0.8370607028753994, + "grad_norm": 1.191740022915474, + "learning_rate": 9.055983509865988e-06, + "loss": 0.1852, + "step": 2096 + }, + { + "epoch": 0.8378594249201278, + "grad_norm": 1.263318830688807, + "learning_rate": 9.053264125471763e-06, + "loss": 0.1794, + "step": 2098 + }, + { + "epoch": 0.8386581469648562, + "grad_norm": 1.2749000778835136, + "learning_rate": 9.050541239439764e-06, + "loss": 0.1683, + "step": 2100 + }, + { + "epoch": 0.8394568690095847, + "grad_norm": 1.264926687502538, + "learning_rate": 9.04781485412231e-06, + "loss": 0.1811, + "step": 2102 + }, + { + "epoch": 0.8402555910543131, + "grad_norm": 1.218176101735685, + "learning_rate": 9.045084971874738e-06, + "loss": 0.1826, + "step": 2104 + }, + { + "epoch": 0.8410543130990416, + "grad_norm": 1.1339821842224997, + "learning_rate": 9.04235159505541e-06, + "loss": 0.1748, + "step": 2106 + }, + { + "epoch": 0.84185303514377, + "grad_norm": 1.228556794787821, + "learning_rate": 9.039614726025708e-06, + "loss": 0.1816, + "step": 2108 + }, + { + "epoch": 0.8426517571884984, + "grad_norm": 1.1590750565644556, + "learning_rate": 9.036874367150024e-06, + "loss": 0.1919, + "step": 2110 + }, + { + "epoch": 0.8434504792332268, + "grad_norm": 1.1992254533267062, + "learning_rate": 9.034130520795774e-06, + "loss": 0.1786, + "step": 2112 + }, + { + "epoch": 0.8442492012779552, + "grad_norm": 1.2866570327279305, + "learning_rate": 9.03138318933338e-06, + "loss": 0.1779, + "step": 2114 + }, + { + "epoch": 0.8450479233226837, + "grad_norm": 1.1304382720588986, + "learning_rate": 9.028632375136277e-06, + "loss": 0.1916, + "step": 2116 + }, + { + "epoch": 0.8458466453674122, + "grad_norm": 1.2026241441378607, + "learning_rate": 9.025878080580908e-06, + "loss": 0.1865, + "step": 2118 + }, + { + "epoch": 0.8466453674121406, + "grad_norm": 1.2239810455485836, + "learning_rate": 9.023120308046726e-06, + "loss": 0.2069, + "step": 2120 + }, + { + "epoch": 0.847444089456869, + "grad_norm": 1.1593857182783416, + "learning_rate": 9.020359059916189e-06, + "loss": 0.1802, + "step": 2122 + }, + { + "epoch": 0.8482428115015974, + "grad_norm": 1.1851101442311125, + "learning_rate": 9.017594338574746e-06, + "loss": 0.1718, + "step": 2124 + }, + { + "epoch": 0.8490415335463258, + "grad_norm": 1.3424377790206583, + "learning_rate": 9.014826146410863e-06, + "loss": 0.187, + "step": 2126 + }, + { + "epoch": 0.8498402555910544, + "grad_norm": 1.2211268818753362, + "learning_rate": 9.012054485815995e-06, + "loss": 0.2054, + "step": 2128 + }, + { + "epoch": 0.8506389776357828, + "grad_norm": 1.2093558338280859, + "learning_rate": 9.009279359184594e-06, + "loss": 0.1853, + "step": 2130 + }, + { + "epoch": 0.8514376996805112, + "grad_norm": 1.3142251663193079, + "learning_rate": 9.006500768914106e-06, + "loss": 0.182, + "step": 2132 + }, + { + "epoch": 0.8522364217252396, + "grad_norm": 1.1390165214451813, + "learning_rate": 9.003718717404977e-06, + "loss": 0.1714, + "step": 2134 + }, + { + "epoch": 0.853035143769968, + "grad_norm": 1.2833538623348046, + "learning_rate": 9.00093320706063e-06, + "loss": 0.1829, + "step": 2136 + }, + { + "epoch": 0.8538338658146964, + "grad_norm": 1.201198063387828, + "learning_rate": 8.998144240287487e-06, + "loss": 0.1836, + "step": 2138 + }, + { + "epoch": 0.854632587859425, + "grad_norm": 1.26048990415032, + "learning_rate": 8.995351819494954e-06, + "loss": 0.1823, + "step": 2140 + }, + { + "epoch": 0.8554313099041534, + "grad_norm": 1.1311889575435519, + "learning_rate": 8.992555947095414e-06, + "loss": 0.1692, + "step": 2142 + }, + { + "epoch": 0.8562300319488818, + "grad_norm": 1.351907102192122, + "learning_rate": 8.989756625504237e-06, + "loss": 0.1979, + "step": 2144 + }, + { + "epoch": 0.8570287539936102, + "grad_norm": 1.3577115649569347, + "learning_rate": 8.98695385713978e-06, + "loss": 0.1888, + "step": 2146 + }, + { + "epoch": 0.8578274760383386, + "grad_norm": 1.1332848107317308, + "learning_rate": 8.984147644423362e-06, + "loss": 0.1737, + "step": 2148 + }, + { + "epoch": 0.8586261980830671, + "grad_norm": 1.276000255217281, + "learning_rate": 8.981337989779291e-06, + "loss": 0.1904, + "step": 2150 + }, + { + "epoch": 0.8594249201277955, + "grad_norm": 1.2196944395263352, + "learning_rate": 8.978524895634842e-06, + "loss": 0.1976, + "step": 2152 + }, + { + "epoch": 0.860223642172524, + "grad_norm": 1.0522481436377091, + "learning_rate": 8.975708364420264e-06, + "loss": 0.1529, + "step": 2154 + }, + { + "epoch": 0.8610223642172524, + "grad_norm": 1.2862973570919587, + "learning_rate": 8.972888398568772e-06, + "loss": 0.1941, + "step": 2156 + }, + { + "epoch": 0.8618210862619808, + "grad_norm": 1.1472291142227702, + "learning_rate": 8.970065000516553e-06, + "loss": 0.1884, + "step": 2158 + }, + { + "epoch": 0.8626198083067093, + "grad_norm": 1.2425852591025766, + "learning_rate": 8.967238172702754e-06, + "loss": 0.1758, + "step": 2160 + }, + { + "epoch": 0.8634185303514377, + "grad_norm": 1.1523178078837133, + "learning_rate": 8.964407917569488e-06, + "loss": 0.1773, + "step": 2162 + }, + { + "epoch": 0.8642172523961661, + "grad_norm": 1.0544506917154455, + "learning_rate": 8.96157423756183e-06, + "loss": 0.1782, + "step": 2164 + }, + { + "epoch": 0.8650159744408946, + "grad_norm": 1.398137403531195, + "learning_rate": 8.958737135127812e-06, + "loss": 0.1888, + "step": 2166 + }, + { + "epoch": 0.865814696485623, + "grad_norm": 1.1006386353076576, + "learning_rate": 8.95589661271842e-06, + "loss": 0.1613, + "step": 2168 + }, + { + "epoch": 0.8666134185303515, + "grad_norm": 1.1816019658476726, + "learning_rate": 8.953052672787602e-06, + "loss": 0.1828, + "step": 2170 + }, + { + "epoch": 0.8674121405750799, + "grad_norm": 1.160479928900505, + "learning_rate": 8.95020531779225e-06, + "loss": 0.1816, + "step": 2172 + }, + { + "epoch": 0.8682108626198083, + "grad_norm": 1.1134898684750454, + "learning_rate": 8.94735455019221e-06, + "loss": 0.1806, + "step": 2174 + }, + { + "epoch": 0.8690095846645367, + "grad_norm": 1.1953510040643958, + "learning_rate": 8.94450037245028e-06, + "loss": 0.1759, + "step": 2176 + }, + { + "epoch": 0.8698083067092651, + "grad_norm": 1.2112582987106082, + "learning_rate": 8.941642787032197e-06, + "loss": 0.1684, + "step": 2178 + }, + { + "epoch": 0.8706070287539937, + "grad_norm": 1.3031673371016932, + "learning_rate": 8.938781796406646e-06, + "loss": 0.1846, + "step": 2180 + }, + { + "epoch": 0.8714057507987221, + "grad_norm": 1.247581011761579, + "learning_rate": 8.935917403045251e-06, + "loss": 0.1849, + "step": 2182 + }, + { + "epoch": 0.8722044728434505, + "grad_norm": 1.2073797176687306, + "learning_rate": 8.933049609422582e-06, + "loss": 0.17, + "step": 2184 + }, + { + "epoch": 0.8730031948881789, + "grad_norm": 1.1534987114988338, + "learning_rate": 8.930178418016138e-06, + "loss": 0.185, + "step": 2186 + }, + { + "epoch": 0.8738019169329073, + "grad_norm": 1.1820635621691253, + "learning_rate": 8.92730383130636e-06, + "loss": 0.1632, + "step": 2188 + }, + { + "epoch": 0.8746006389776357, + "grad_norm": 1.1918298750579541, + "learning_rate": 8.924425851776619e-06, + "loss": 0.1789, + "step": 2190 + }, + { + "epoch": 0.8753993610223643, + "grad_norm": 1.2772477046804016, + "learning_rate": 8.921544481913218e-06, + "loss": 0.1932, + "step": 2192 + }, + { + "epoch": 0.8761980830670927, + "grad_norm": 1.1382431039382104, + "learning_rate": 8.918659724205387e-06, + "loss": 0.1829, + "step": 2194 + }, + { + "epoch": 0.8769968051118211, + "grad_norm": 1.124034930363523, + "learning_rate": 8.915771581145286e-06, + "loss": 0.157, + "step": 2196 + }, + { + "epoch": 0.8777955271565495, + "grad_norm": 1.1938500101023577, + "learning_rate": 8.912880055227998e-06, + "loss": 0.1873, + "step": 2198 + }, + { + "epoch": 0.8785942492012779, + "grad_norm": 1.247886644845545, + "learning_rate": 8.909985148951528e-06, + "loss": 0.1893, + "step": 2200 + }, + { + "epoch": 0.8793929712460063, + "grad_norm": 1.2882483087755097, + "learning_rate": 8.907086864816804e-06, + "loss": 0.1986, + "step": 2202 + }, + { + "epoch": 0.8801916932907349, + "grad_norm": 1.1916502807322977, + "learning_rate": 8.904185205327667e-06, + "loss": 0.1585, + "step": 2204 + }, + { + "epoch": 0.8809904153354633, + "grad_norm": 1.1609680993035214, + "learning_rate": 8.901280172990878e-06, + "loss": 0.1756, + "step": 2206 + }, + { + "epoch": 0.8817891373801917, + "grad_norm": 1.0458575482646773, + "learning_rate": 8.898371770316113e-06, + "loss": 0.1545, + "step": 2208 + }, + { + "epoch": 0.8825878594249201, + "grad_norm": 1.3354091660753635, + "learning_rate": 8.895459999815954e-06, + "loss": 0.1997, + "step": 2210 + }, + { + "epoch": 0.8833865814696485, + "grad_norm": 1.1262191292997419, + "learning_rate": 8.892544864005899e-06, + "loss": 0.1743, + "step": 2212 + }, + { + "epoch": 0.884185303514377, + "grad_norm": 1.3566110237290778, + "learning_rate": 8.889626365404348e-06, + "loss": 0.1922, + "step": 2214 + }, + { + "epoch": 0.8849840255591054, + "grad_norm": 1.376910187006041, + "learning_rate": 8.886704506532611e-06, + "loss": 0.1913, + "step": 2216 + }, + { + "epoch": 0.8857827476038339, + "grad_norm": 1.2397186314872872, + "learning_rate": 8.883779289914894e-06, + "loss": 0.1732, + "step": 2218 + }, + { + "epoch": 0.8865814696485623, + "grad_norm": 1.2887051577849917, + "learning_rate": 8.880850718078313e-06, + "loss": 0.2123, + "step": 2220 + }, + { + "epoch": 0.8873801916932907, + "grad_norm": 1.2786244843973318, + "learning_rate": 8.877918793552875e-06, + "loss": 0.1828, + "step": 2222 + }, + { + "epoch": 0.8881789137380192, + "grad_norm": 1.0958110453050904, + "learning_rate": 8.874983518871488e-06, + "loss": 0.1746, + "step": 2224 + }, + { + "epoch": 0.8889776357827476, + "grad_norm": 1.2263512727265042, + "learning_rate": 8.87204489656995e-06, + "loss": 0.2076, + "step": 2226 + }, + { + "epoch": 0.889776357827476, + "grad_norm": 1.2566470262879932, + "learning_rate": 8.869102929186954e-06, + "loss": 0.1942, + "step": 2228 + }, + { + "epoch": 0.8905750798722045, + "grad_norm": 1.313910249745457, + "learning_rate": 8.866157619264086e-06, + "loss": 0.2064, + "step": 2230 + }, + { + "epoch": 0.8913738019169329, + "grad_norm": 1.2130369145657738, + "learning_rate": 8.86320896934581e-06, + "loss": 0.176, + "step": 2232 + }, + { + "epoch": 0.8921725239616614, + "grad_norm": 1.1816578044938435, + "learning_rate": 8.860256981979485e-06, + "loss": 0.1932, + "step": 2234 + }, + { + "epoch": 0.8929712460063898, + "grad_norm": 1.095754287469498, + "learning_rate": 8.857301659715348e-06, + "loss": 0.1808, + "step": 2236 + }, + { + "epoch": 0.8937699680511182, + "grad_norm": 1.2405828782550468, + "learning_rate": 8.854343005106521e-06, + "loss": 0.1975, + "step": 2238 + }, + { + "epoch": 0.8945686900958466, + "grad_norm": 1.139128509869952, + "learning_rate": 8.851381020709e-06, + "loss": 0.1638, + "step": 2240 + }, + { + "epoch": 0.895367412140575, + "grad_norm": 1.1346192386483716, + "learning_rate": 8.848415709081659e-06, + "loss": 0.1621, + "step": 2242 + }, + { + "epoch": 0.8961661341853036, + "grad_norm": 1.2827068653023486, + "learning_rate": 8.845447072786251e-06, + "loss": 0.19, + "step": 2244 + }, + { + "epoch": 0.896964856230032, + "grad_norm": 1.1215489197999844, + "learning_rate": 8.842475114387394e-06, + "loss": 0.1732, + "step": 2246 + }, + { + "epoch": 0.8977635782747604, + "grad_norm": 1.1479343886524802, + "learning_rate": 8.839499836452584e-06, + "loss": 0.1727, + "step": 2248 + }, + { + "epoch": 0.8985623003194888, + "grad_norm": 1.1534484096320634, + "learning_rate": 8.836521241552177e-06, + "loss": 0.1885, + "step": 2250 + }, + { + "epoch": 0.8993610223642172, + "grad_norm": 1.234578241869321, + "learning_rate": 8.833539332259398e-06, + "loss": 0.1672, + "step": 2252 + }, + { + "epoch": 0.9001597444089456, + "grad_norm": 1.2103918137414391, + "learning_rate": 8.830554111150337e-06, + "loss": 0.1825, + "step": 2254 + }, + { + "epoch": 0.9009584664536742, + "grad_norm": 1.0351256185130608, + "learning_rate": 8.827565580803944e-06, + "loss": 0.1624, + "step": 2256 + }, + { + "epoch": 0.9017571884984026, + "grad_norm": 1.1562824946118846, + "learning_rate": 8.824573743802023e-06, + "loss": 0.1724, + "step": 2258 + }, + { + "epoch": 0.902555910543131, + "grad_norm": 1.352880063666687, + "learning_rate": 8.821578602729242e-06, + "loss": 0.1709, + "step": 2260 + }, + { + "epoch": 0.9033546325878594, + "grad_norm": 1.2118411341735884, + "learning_rate": 8.81858016017312e-06, + "loss": 0.172, + "step": 2262 + }, + { + "epoch": 0.9041533546325878, + "grad_norm": 1.2657731749208432, + "learning_rate": 8.815578418724031e-06, + "loss": 0.1944, + "step": 2264 + }, + { + "epoch": 0.9049520766773163, + "grad_norm": 1.2696811826921244, + "learning_rate": 8.812573380975191e-06, + "loss": 0.1945, + "step": 2266 + }, + { + "epoch": 0.9057507987220448, + "grad_norm": 1.260789859526146, + "learning_rate": 8.809565049522673e-06, + "loss": 0.187, + "step": 2268 + }, + { + "epoch": 0.9065495207667732, + "grad_norm": 1.1561924099805676, + "learning_rate": 8.806553426965391e-06, + "loss": 0.184, + "step": 2270 + }, + { + "epoch": 0.9073482428115016, + "grad_norm": 1.2184507876835764, + "learning_rate": 8.803538515905102e-06, + "loss": 0.181, + "step": 2272 + }, + { + "epoch": 0.90814696485623, + "grad_norm": 1.1489816284226748, + "learning_rate": 8.800520318946404e-06, + "loss": 0.1601, + "step": 2274 + }, + { + "epoch": 0.9089456869009584, + "grad_norm": 1.150382635417185, + "learning_rate": 8.797498838696737e-06, + "loss": 0.173, + "step": 2276 + }, + { + "epoch": 0.9097444089456869, + "grad_norm": 1.2396571356927486, + "learning_rate": 8.79447407776637e-06, + "loss": 0.1775, + "step": 2278 + }, + { + "epoch": 0.9105431309904153, + "grad_norm": 1.1258691868543116, + "learning_rate": 8.791446038768416e-06, + "loss": 0.1796, + "step": 2280 + }, + { + "epoch": 0.9113418530351438, + "grad_norm": 1.1837850225751487, + "learning_rate": 8.788414724318814e-06, + "loss": 0.186, + "step": 2282 + }, + { + "epoch": 0.9121405750798722, + "grad_norm": 1.1771823649393418, + "learning_rate": 8.785380137036332e-06, + "loss": 0.1774, + "step": 2284 + }, + { + "epoch": 0.9129392971246006, + "grad_norm": 1.195664396595768, + "learning_rate": 8.782342279542569e-06, + "loss": 0.1715, + "step": 2286 + }, + { + "epoch": 0.9137380191693291, + "grad_norm": 1.394493043461634, + "learning_rate": 8.779301154461945e-06, + "loss": 0.1954, + "step": 2288 + }, + { + "epoch": 0.9145367412140575, + "grad_norm": 1.484773906372258, + "learning_rate": 8.776256764421706e-06, + "loss": 0.1649, + "step": 2290 + }, + { + "epoch": 0.9153354632587859, + "grad_norm": 1.1713356392013368, + "learning_rate": 8.773209112051919e-06, + "loss": 0.1676, + "step": 2292 + }, + { + "epoch": 0.9161341853035144, + "grad_norm": 1.2492572619724682, + "learning_rate": 8.770158199985466e-06, + "loss": 0.203, + "step": 2294 + }, + { + "epoch": 0.9169329073482428, + "grad_norm": 1.112250308815286, + "learning_rate": 8.76710403085805e-06, + "loss": 0.1675, + "step": 2296 + }, + { + "epoch": 0.9177316293929713, + "grad_norm": 1.2119980335045586, + "learning_rate": 8.764046607308183e-06, + "loss": 0.1822, + "step": 2298 + }, + { + "epoch": 0.9185303514376997, + "grad_norm": 1.2003429295335617, + "learning_rate": 8.760985931977191e-06, + "loss": 0.1648, + "step": 2300 + }, + { + "epoch": 0.9193290734824281, + "grad_norm": 1.2923004205685706, + "learning_rate": 8.757922007509208e-06, + "loss": 0.1672, + "step": 2302 + }, + { + "epoch": 0.9201277955271565, + "grad_norm": 1.2602401218318184, + "learning_rate": 8.754854836551174e-06, + "loss": 0.1697, + "step": 2304 + }, + { + "epoch": 0.920926517571885, + "grad_norm": 1.3225061494713877, + "learning_rate": 8.75178442175284e-06, + "loss": 0.2173, + "step": 2306 + }, + { + "epoch": 0.9217252396166135, + "grad_norm": 1.236225325668187, + "learning_rate": 8.748710765766752e-06, + "loss": 0.1748, + "step": 2308 + }, + { + "epoch": 0.9225239616613419, + "grad_norm": 1.5668493173725866, + "learning_rate": 8.745633871248257e-06, + "loss": 0.1792, + "step": 2310 + }, + { + "epoch": 0.9233226837060703, + "grad_norm": 1.2818950876816324, + "learning_rate": 8.742553740855507e-06, + "loss": 0.1701, + "step": 2312 + }, + { + "epoch": 0.9241214057507987, + "grad_norm": 1.0563021739777534, + "learning_rate": 8.739470377249436e-06, + "loss": 0.1673, + "step": 2314 + }, + { + "epoch": 0.9249201277955271, + "grad_norm": 1.1015536336953309, + "learning_rate": 8.736383783093788e-06, + "loss": 0.1768, + "step": 2316 + }, + { + "epoch": 0.9257188498402555, + "grad_norm": 1.2209081010826341, + "learning_rate": 8.733293961055082e-06, + "loss": 0.1893, + "step": 2318 + }, + { + "epoch": 0.9265175718849841, + "grad_norm": 1.1147003675746905, + "learning_rate": 8.730200913802638e-06, + "loss": 0.1727, + "step": 2320 + }, + { + "epoch": 0.9273162939297125, + "grad_norm": 1.1772795182600777, + "learning_rate": 8.727104644008553e-06, + "loss": 0.1777, + "step": 2322 + }, + { + "epoch": 0.9281150159744409, + "grad_norm": 1.106532804822251, + "learning_rate": 8.724005154347714e-06, + "loss": 0.1781, + "step": 2324 + }, + { + "epoch": 0.9289137380191693, + "grad_norm": 1.15574880894598, + "learning_rate": 8.720902447497788e-06, + "loss": 0.1777, + "step": 2326 + }, + { + "epoch": 0.9297124600638977, + "grad_norm": 1.2312851048264675, + "learning_rate": 8.717796526139218e-06, + "loss": 0.204, + "step": 2328 + }, + { + "epoch": 0.9305111821086262, + "grad_norm": 1.261983053853919, + "learning_rate": 8.71468739295523e-06, + "loss": 0.1993, + "step": 2330 + }, + { + "epoch": 0.9313099041533547, + "grad_norm": 1.1982722648050632, + "learning_rate": 8.711575050631823e-06, + "loss": 0.1968, + "step": 2332 + }, + { + "epoch": 0.9321086261980831, + "grad_norm": 1.1624739320562267, + "learning_rate": 8.708459501857762e-06, + "loss": 0.1864, + "step": 2334 + }, + { + "epoch": 0.9329073482428115, + "grad_norm": 1.027366394484867, + "learning_rate": 8.70534074932459e-06, + "loss": 0.1687, + "step": 2336 + }, + { + "epoch": 0.9337060702875399, + "grad_norm": 1.1021491198370645, + "learning_rate": 8.702218795726619e-06, + "loss": 0.1684, + "step": 2338 + }, + { + "epoch": 0.9345047923322684, + "grad_norm": 1.1040575428181885, + "learning_rate": 8.699093643760914e-06, + "loss": 0.1739, + "step": 2340 + }, + { + "epoch": 0.9353035143769968, + "grad_norm": 1.195647306811607, + "learning_rate": 8.695965296127318e-06, + "loss": 0.1816, + "step": 2342 + }, + { + "epoch": 0.9361022364217252, + "grad_norm": 1.0812569337947433, + "learning_rate": 8.692833755528426e-06, + "loss": 0.1766, + "step": 2344 + }, + { + "epoch": 0.9369009584664537, + "grad_norm": 1.3202981154225082, + "learning_rate": 8.689699024669594e-06, + "loss": 0.1879, + "step": 2346 + }, + { + "epoch": 0.9376996805111821, + "grad_norm": 1.1970245625047584, + "learning_rate": 8.686561106258932e-06, + "loss": 0.1803, + "step": 2348 + }, + { + "epoch": 0.9384984025559105, + "grad_norm": 1.2213363903695773, + "learning_rate": 8.683420003007308e-06, + "loss": 0.1947, + "step": 2350 + }, + { + "epoch": 0.939297124600639, + "grad_norm": 1.168137561259312, + "learning_rate": 8.680275717628336e-06, + "loss": 0.1683, + "step": 2352 + }, + { + "epoch": 0.9400958466453674, + "grad_norm": 1.1140547555618086, + "learning_rate": 8.677128252838386e-06, + "loss": 0.1808, + "step": 2354 + }, + { + "epoch": 0.9408945686900958, + "grad_norm": 1.0913344127324964, + "learning_rate": 8.673977611356567e-06, + "loss": 0.1768, + "step": 2356 + }, + { + "epoch": 0.9416932907348243, + "grad_norm": 1.2470605938854136, + "learning_rate": 8.670823795904737e-06, + "loss": 0.1881, + "step": 2358 + }, + { + "epoch": 0.9424920127795527, + "grad_norm": 1.1432903930033393, + "learning_rate": 8.667666809207495e-06, + "loss": 0.1814, + "step": 2360 + }, + { + "epoch": 0.9432907348242812, + "grad_norm": 1.1254326884247754, + "learning_rate": 8.664506653992181e-06, + "loss": 0.1721, + "step": 2362 + }, + { + "epoch": 0.9440894568690096, + "grad_norm": 1.1774199109472463, + "learning_rate": 8.661343332988869e-06, + "loss": 0.1644, + "step": 2364 + }, + { + "epoch": 0.944888178913738, + "grad_norm": 1.1966632218098605, + "learning_rate": 8.65817684893037e-06, + "loss": 0.1735, + "step": 2366 + }, + { + "epoch": 0.9456869009584664, + "grad_norm": 1.0407093351327132, + "learning_rate": 8.655007204552228e-06, + "loss": 0.1544, + "step": 2368 + }, + { + "epoch": 0.9464856230031949, + "grad_norm": 1.2274155096905306, + "learning_rate": 8.651834402592719e-06, + "loss": 0.1798, + "step": 2370 + }, + { + "epoch": 0.9472843450479234, + "grad_norm": 1.267776362466128, + "learning_rate": 8.64865844579284e-06, + "loss": 0.1751, + "step": 2372 + }, + { + "epoch": 0.9480830670926518, + "grad_norm": 1.1768732812780864, + "learning_rate": 8.64547933689632e-06, + "loss": 0.1785, + "step": 2374 + }, + { + "epoch": 0.9488817891373802, + "grad_norm": 1.1577676779075388, + "learning_rate": 8.64229707864961e-06, + "loss": 0.1947, + "step": 2376 + }, + { + "epoch": 0.9496805111821086, + "grad_norm": 1.1591103988078189, + "learning_rate": 8.63911167380188e-06, + "loss": 0.1788, + "step": 2378 + }, + { + "epoch": 0.950479233226837, + "grad_norm": 1.11481900810481, + "learning_rate": 8.635923125105019e-06, + "loss": 0.1676, + "step": 2380 + }, + { + "epoch": 0.9512779552715654, + "grad_norm": 1.2948111421923072, + "learning_rate": 8.632731435313634e-06, + "loss": 0.1645, + "step": 2382 + }, + { + "epoch": 0.952076677316294, + "grad_norm": 1.1733292169857867, + "learning_rate": 8.629536607185042e-06, + "loss": 0.1701, + "step": 2384 + }, + { + "epoch": 0.9528753993610224, + "grad_norm": 1.1636636504635067, + "learning_rate": 8.626338643479275e-06, + "loss": 0.17, + "step": 2386 + }, + { + "epoch": 0.9536741214057508, + "grad_norm": 1.2570201463967705, + "learning_rate": 8.62313754695907e-06, + "loss": 0.1738, + "step": 2388 + }, + { + "epoch": 0.9544728434504792, + "grad_norm": 1.0902131985821306, + "learning_rate": 8.619933320389872e-06, + "loss": 0.1723, + "step": 2390 + }, + { + "epoch": 0.9552715654952076, + "grad_norm": 1.2111908413145964, + "learning_rate": 8.616725966539831e-06, + "loss": 0.1926, + "step": 2392 + }, + { + "epoch": 0.9560702875399361, + "grad_norm": 1.1942013221000287, + "learning_rate": 8.6135154881798e-06, + "loss": 0.1888, + "step": 2394 + }, + { + "epoch": 0.9568690095846646, + "grad_norm": 1.1699831896669461, + "learning_rate": 8.610301888083327e-06, + "loss": 0.2023, + "step": 2396 + }, + { + "epoch": 0.957667731629393, + "grad_norm": 1.1882929391966546, + "learning_rate": 8.607085169026661e-06, + "loss": 0.1852, + "step": 2398 + }, + { + "epoch": 0.9584664536741214, + "grad_norm": 1.1905600662435856, + "learning_rate": 8.60386533378874e-06, + "loss": 0.1919, + "step": 2400 + }, + { + "epoch": 0.9592651757188498, + "grad_norm": 1.2472441081594776, + "learning_rate": 8.600642385151206e-06, + "loss": 0.1923, + "step": 2402 + }, + { + "epoch": 0.9600638977635783, + "grad_norm": 1.0759279066292031, + "learning_rate": 8.597416325898373e-06, + "loss": 0.1685, + "step": 2404 + }, + { + "epoch": 0.9608626198083067, + "grad_norm": 1.1583330331881445, + "learning_rate": 8.594187158817257e-06, + "loss": 0.1749, + "step": 2406 + }, + { + "epoch": 0.9616613418530351, + "grad_norm": 1.2089521342785554, + "learning_rate": 8.590954886697554e-06, + "loss": 0.1939, + "step": 2408 + }, + { + "epoch": 0.9624600638977636, + "grad_norm": 1.2614329419101098, + "learning_rate": 8.58771951233164e-06, + "loss": 0.2026, + "step": 2410 + }, + { + "epoch": 0.963258785942492, + "grad_norm": 1.1366241513550654, + "learning_rate": 8.584481038514573e-06, + "loss": 0.1639, + "step": 2412 + }, + { + "epoch": 0.9640575079872205, + "grad_norm": 1.3624993080110601, + "learning_rate": 8.581239468044093e-06, + "loss": 0.185, + "step": 2414 + }, + { + "epoch": 0.9648562300319489, + "grad_norm": 1.0567057031046825, + "learning_rate": 8.577994803720605e-06, + "loss": 0.1661, + "step": 2416 + }, + { + "epoch": 0.9656549520766773, + "grad_norm": 1.1539180273643035, + "learning_rate": 8.574747048347199e-06, + "loss": 0.1664, + "step": 2418 + }, + { + "epoch": 0.9664536741214057, + "grad_norm": 1.236448054190876, + "learning_rate": 8.571496204729623e-06, + "loss": 0.1818, + "step": 2420 + }, + { + "epoch": 0.9672523961661342, + "grad_norm": 1.0633439918324645, + "learning_rate": 8.568242275676304e-06, + "loss": 0.1735, + "step": 2422 + }, + { + "epoch": 0.9680511182108626, + "grad_norm": 1.1561922874116346, + "learning_rate": 8.564985263998327e-06, + "loss": 0.1651, + "step": 2424 + }, + { + "epoch": 0.9688498402555911, + "grad_norm": 1.1398950472118397, + "learning_rate": 8.561725172509444e-06, + "loss": 0.163, + "step": 2426 + }, + { + "epoch": 0.9696485623003195, + "grad_norm": 1.1653225900272541, + "learning_rate": 8.558462004026065e-06, + "loss": 0.173, + "step": 2428 + }, + { + "epoch": 0.9704472843450479, + "grad_norm": 1.3275231148352606, + "learning_rate": 8.555195761367263e-06, + "loss": 0.1819, + "step": 2430 + }, + { + "epoch": 0.9712460063897763, + "grad_norm": 1.0555861455966542, + "learning_rate": 8.551926447354759e-06, + "loss": 0.1699, + "step": 2432 + }, + { + "epoch": 0.9720447284345048, + "grad_norm": 1.2506959109380367, + "learning_rate": 8.548654064812934e-06, + "loss": 0.1792, + "step": 2434 + }, + { + "epoch": 0.9728434504792333, + "grad_norm": 1.1541294441379601, + "learning_rate": 8.54537861656882e-06, + "loss": 0.1796, + "step": 2436 + }, + { + "epoch": 0.9736421725239617, + "grad_norm": 1.3106265759711455, + "learning_rate": 8.542100105452093e-06, + "loss": 0.1823, + "step": 2438 + }, + { + "epoch": 0.9744408945686901, + "grad_norm": 1.2700540275591943, + "learning_rate": 8.538818534295076e-06, + "loss": 0.183, + "step": 2440 + }, + { + "epoch": 0.9752396166134185, + "grad_norm": 1.4250215144969003, + "learning_rate": 8.535533905932739e-06, + "loss": 0.1865, + "step": 2442 + }, + { + "epoch": 0.9760383386581469, + "grad_norm": 1.1840076787843286, + "learning_rate": 8.532246223202689e-06, + "loss": 0.1773, + "step": 2444 + }, + { + "epoch": 0.9768370607028753, + "grad_norm": 1.0800090683302819, + "learning_rate": 8.528955488945177e-06, + "loss": 0.1762, + "step": 2446 + }, + { + "epoch": 0.9776357827476039, + "grad_norm": 1.056788808510418, + "learning_rate": 8.525661706003083e-06, + "loss": 0.1752, + "step": 2448 + }, + { + "epoch": 0.9784345047923323, + "grad_norm": 1.1120115317215207, + "learning_rate": 8.522364877221926e-06, + "loss": 0.1827, + "step": 2450 + }, + { + "epoch": 0.9792332268370607, + "grad_norm": 1.2484051846053978, + "learning_rate": 8.519065005449858e-06, + "loss": 0.1744, + "step": 2452 + }, + { + "epoch": 0.9800319488817891, + "grad_norm": 1.2862407811422434, + "learning_rate": 8.515762093537654e-06, + "loss": 0.1816, + "step": 2454 + }, + { + "epoch": 0.9808306709265175, + "grad_norm": 1.1176065762239906, + "learning_rate": 8.512456144338717e-06, + "loss": 0.1779, + "step": 2456 + }, + { + "epoch": 0.981629392971246, + "grad_norm": 1.1181750039143228, + "learning_rate": 8.509147160709079e-06, + "loss": 0.1791, + "step": 2458 + }, + { + "epoch": 0.9824281150159745, + "grad_norm": 1.1233000113043177, + "learning_rate": 8.505835145507387e-06, + "loss": 0.1801, + "step": 2460 + }, + { + "epoch": 0.9832268370607029, + "grad_norm": 1.149372977172484, + "learning_rate": 8.502520101594909e-06, + "loss": 0.1621, + "step": 2462 + }, + { + "epoch": 0.9840255591054313, + "grad_norm": 1.1858307935499464, + "learning_rate": 8.499202031835532e-06, + "loss": 0.1832, + "step": 2464 + }, + { + "epoch": 0.9848242811501597, + "grad_norm": 1.290001462169469, + "learning_rate": 8.495880939095754e-06, + "loss": 0.1894, + "step": 2466 + }, + { + "epoch": 0.9856230031948882, + "grad_norm": 1.3581022131665044, + "learning_rate": 8.492556826244687e-06, + "loss": 0.2, + "step": 2468 + }, + { + "epoch": 0.9864217252396166, + "grad_norm": 1.1056676733741198, + "learning_rate": 8.489229696154049e-06, + "loss": 0.1805, + "step": 2470 + }, + { + "epoch": 0.987220447284345, + "grad_norm": 1.1250903817623754, + "learning_rate": 8.485899551698166e-06, + "loss": 0.177, + "step": 2472 + }, + { + "epoch": 0.9880191693290735, + "grad_norm": 1.148392440744409, + "learning_rate": 8.482566395753975e-06, + "loss": 0.1867, + "step": 2474 + }, + { + "epoch": 0.9888178913738019, + "grad_norm": 1.2558757559301705, + "learning_rate": 8.479230231201001e-06, + "loss": 0.1964, + "step": 2476 + }, + { + "epoch": 0.9896166134185304, + "grad_norm": 1.2579913217677217, + "learning_rate": 8.475891060921378e-06, + "loss": 0.1829, + "step": 2478 + }, + { + "epoch": 0.9904153354632588, + "grad_norm": 1.316707228945154, + "learning_rate": 8.472548887799833e-06, + "loss": 0.1696, + "step": 2480 + }, + { + "epoch": 0.9912140575079872, + "grad_norm": 1.2049638106018488, + "learning_rate": 8.46920371472369e-06, + "loss": 0.1887, + "step": 2482 + }, + { + "epoch": 0.9920127795527156, + "grad_norm": 1.2377015394107287, + "learning_rate": 8.465855544582862e-06, + "loss": 0.2014, + "step": 2484 + }, + { + "epoch": 0.9928115015974441, + "grad_norm": 1.1268024008456778, + "learning_rate": 8.462504380269853e-06, + "loss": 0.179, + "step": 2486 + }, + { + "epoch": 0.9936102236421726, + "grad_norm": 1.446057306444501, + "learning_rate": 8.45915022467975e-06, + "loss": 0.1918, + "step": 2488 + }, + { + "epoch": 0.994408945686901, + "grad_norm": 1.186023464310668, + "learning_rate": 8.455793080710231e-06, + "loss": 0.1706, + "step": 2490 + }, + { + "epoch": 0.9952076677316294, + "grad_norm": 1.2273194439241009, + "learning_rate": 8.452432951261549e-06, + "loss": 0.1857, + "step": 2492 + }, + { + "epoch": 0.9960063897763578, + "grad_norm": 1.0809657844755842, + "learning_rate": 8.44906983923654e-06, + "loss": 0.1403, + "step": 2494 + }, + { + "epoch": 0.9968051118210862, + "grad_norm": 1.1417636929002628, + "learning_rate": 8.445703747540614e-06, + "loss": 0.1727, + "step": 2496 + }, + { + "epoch": 0.9976038338658147, + "grad_norm": 1.2435186121479649, + "learning_rate": 8.442334679081757e-06, + "loss": 0.1651, + "step": 2498 + }, + { + "epoch": 0.9984025559105432, + "grad_norm": 1.261179386652203, + "learning_rate": 8.438962636770528e-06, + "loss": 0.1916, + "step": 2500 + }, + { + "epoch": 0.9984025559105432, + "eval_loss": 0.1609409898519516, + "eval_runtime": 418.1212, + "eval_samples_per_second": 42.588, + "eval_steps_per_second": 5.324, + "step": 2500 + }, + { + "epoch": 0.9992012779552716, + "grad_norm": 1.2177062771646232, + "learning_rate": 8.43558762352005e-06, + "loss": 0.1956, + "step": 2502 + }, + { + "epoch": 1.0, + "grad_norm": 1.129270940930756, + "learning_rate": 8.43220964224602e-06, + "loss": 0.1687, + "step": 2504 + }, + { + "epoch": 1.0007987220447285, + "grad_norm": 0.827421022738481, + "learning_rate": 8.428828695866694e-06, + "loss": 0.1127, + "step": 2506 + }, + { + "epoch": 1.0015974440894568, + "grad_norm": 0.8532426168374652, + "learning_rate": 8.425444787302887e-06, + "loss": 0.1097, + "step": 2508 + }, + { + "epoch": 1.0023961661341854, + "grad_norm": 1.0387260637251956, + "learning_rate": 8.422057919477984e-06, + "loss": 0.1209, + "step": 2510 + }, + { + "epoch": 1.0031948881789137, + "grad_norm": 0.9532301555359793, + "learning_rate": 8.418668095317912e-06, + "loss": 0.1124, + "step": 2512 + }, + { + "epoch": 1.0039936102236422, + "grad_norm": 0.9749882933863255, + "learning_rate": 8.415275317751163e-06, + "loss": 0.1033, + "step": 2514 + }, + { + "epoch": 1.0047923322683705, + "grad_norm": 1.215365298510418, + "learning_rate": 8.411879589708775e-06, + "loss": 0.1136, + "step": 2516 + }, + { + "epoch": 1.005591054313099, + "grad_norm": 1.0938388152428302, + "learning_rate": 8.408480914124338e-06, + "loss": 0.1126, + "step": 2518 + }, + { + "epoch": 1.0063897763578276, + "grad_norm": 1.1091853030022452, + "learning_rate": 8.405079293933986e-06, + "loss": 0.0985, + "step": 2520 + }, + { + "epoch": 1.0071884984025559, + "grad_norm": 1.1309767039321081, + "learning_rate": 8.401674732076399e-06, + "loss": 0.1121, + "step": 2522 + }, + { + "epoch": 1.0079872204472844, + "grad_norm": 1.2204646216277761, + "learning_rate": 8.398267231492797e-06, + "loss": 0.1276, + "step": 2524 + }, + { + "epoch": 1.0087859424920127, + "grad_norm": 1.380795556523787, + "learning_rate": 8.394856795126937e-06, + "loss": 0.1039, + "step": 2526 + }, + { + "epoch": 1.0095846645367412, + "grad_norm": 1.0922220217499492, + "learning_rate": 8.391443425925118e-06, + "loss": 0.1126, + "step": 2528 + }, + { + "epoch": 1.0103833865814698, + "grad_norm": 0.990994836569366, + "learning_rate": 8.388027126836168e-06, + "loss": 0.1109, + "step": 2530 + }, + { + "epoch": 1.011182108626198, + "grad_norm": 1.160502599618174, + "learning_rate": 8.384607900811442e-06, + "loss": 0.11, + "step": 2532 + }, + { + "epoch": 1.0119808306709266, + "grad_norm": 1.0016564935445926, + "learning_rate": 8.381185750804835e-06, + "loss": 0.1007, + "step": 2534 + }, + { + "epoch": 1.012779552715655, + "grad_norm": 1.0190577969031376, + "learning_rate": 8.37776067977276e-06, + "loss": 0.1095, + "step": 2536 + }, + { + "epoch": 1.0135782747603834, + "grad_norm": 1.0372954792800193, + "learning_rate": 8.374332690674151e-06, + "loss": 0.1087, + "step": 2538 + }, + { + "epoch": 1.0143769968051117, + "grad_norm": 1.0326525572009215, + "learning_rate": 8.370901786470475e-06, + "loss": 0.1023, + "step": 2540 + }, + { + "epoch": 1.0151757188498403, + "grad_norm": 0.9829529420381988, + "learning_rate": 8.367467970125702e-06, + "loss": 0.1104, + "step": 2542 + }, + { + "epoch": 1.0159744408945688, + "grad_norm": 1.1974618667877126, + "learning_rate": 8.36403124460633e-06, + "loss": 0.1052, + "step": 2544 + }, + { + "epoch": 1.016773162939297, + "grad_norm": 1.13559909487521, + "learning_rate": 8.360591612881363e-06, + "loss": 0.1076, + "step": 2546 + }, + { + "epoch": 1.0175718849840256, + "grad_norm": 1.1337615805939383, + "learning_rate": 8.357149077922322e-06, + "loss": 0.1179, + "step": 2548 + }, + { + "epoch": 1.018370607028754, + "grad_norm": 1.1381630091831911, + "learning_rate": 8.353703642703228e-06, + "loss": 0.1082, + "step": 2550 + }, + { + "epoch": 1.0191693290734825, + "grad_norm": 1.1081002212560531, + "learning_rate": 8.350255310200611e-06, + "loss": 0.1133, + "step": 2552 + }, + { + "epoch": 1.0199680511182108, + "grad_norm": 1.012013673101353, + "learning_rate": 8.346804083393511e-06, + "loss": 0.1133, + "step": 2554 + }, + { + "epoch": 1.0207667731629393, + "grad_norm": 1.0949479826249693, + "learning_rate": 8.343349965263459e-06, + "loss": 0.109, + "step": 2556 + }, + { + "epoch": 1.0215654952076678, + "grad_norm": 1.4697029214526798, + "learning_rate": 8.339892958794487e-06, + "loss": 0.1223, + "step": 2558 + }, + { + "epoch": 1.0223642172523961, + "grad_norm": 1.1256837730078384, + "learning_rate": 8.336433066973122e-06, + "loss": 0.1106, + "step": 2560 + }, + { + "epoch": 1.0231629392971247, + "grad_norm": 1.0111334067086826, + "learning_rate": 8.332970292788384e-06, + "loss": 0.0992, + "step": 2562 + }, + { + "epoch": 1.023961661341853, + "grad_norm": 1.1457290491628267, + "learning_rate": 8.329504639231784e-06, + "loss": 0.1055, + "step": 2564 + }, + { + "epoch": 1.0247603833865815, + "grad_norm": 1.094415886169632, + "learning_rate": 8.32603610929732e-06, + "loss": 0.1233, + "step": 2566 + }, + { + "epoch": 1.0255591054313098, + "grad_norm": 1.115090103670034, + "learning_rate": 8.322564705981476e-06, + "loss": 0.1027, + "step": 2568 + }, + { + "epoch": 1.0263578274760383, + "grad_norm": 1.2411016808966997, + "learning_rate": 8.319090432283213e-06, + "loss": 0.1251, + "step": 2570 + }, + { + "epoch": 1.0271565495207668, + "grad_norm": 1.161818122354369, + "learning_rate": 8.315613291203977e-06, + "loss": 0.1136, + "step": 2572 + }, + { + "epoch": 1.0279552715654952, + "grad_norm": 1.010109815757735, + "learning_rate": 8.312133285747693e-06, + "loss": 0.1116, + "step": 2574 + }, + { + "epoch": 1.0287539936102237, + "grad_norm": 1.0352536009528577, + "learning_rate": 8.308650418920751e-06, + "loss": 0.0985, + "step": 2576 + }, + { + "epoch": 1.029552715654952, + "grad_norm": 1.0385517544202987, + "learning_rate": 8.305164693732026e-06, + "loss": 0.1085, + "step": 2578 + }, + { + "epoch": 1.0303514376996805, + "grad_norm": 1.0075596705684045, + "learning_rate": 8.301676113192853e-06, + "loss": 0.1092, + "step": 2580 + }, + { + "epoch": 1.031150159744409, + "grad_norm": 1.0700338942744492, + "learning_rate": 8.298184680317035e-06, + "loss": 0.1124, + "step": 2582 + }, + { + "epoch": 1.0319488817891374, + "grad_norm": 1.0149219991197933, + "learning_rate": 8.294690398120843e-06, + "loss": 0.1121, + "step": 2584 + }, + { + "epoch": 1.0327476038338659, + "grad_norm": 1.2039242543481947, + "learning_rate": 8.291193269623005e-06, + "loss": 0.1222, + "step": 2586 + }, + { + "epoch": 1.0335463258785942, + "grad_norm": 1.2377862042067174, + "learning_rate": 8.287693297844712e-06, + "loss": 0.1145, + "step": 2588 + }, + { + "epoch": 1.0343450479233227, + "grad_norm": 1.091238797708431, + "learning_rate": 8.28419048580961e-06, + "loss": 0.1155, + "step": 2590 + }, + { + "epoch": 1.035143769968051, + "grad_norm": 1.078587618975172, + "learning_rate": 8.280684836543794e-06, + "loss": 0.1082, + "step": 2592 + }, + { + "epoch": 1.0359424920127795, + "grad_norm": 1.0454759141708203, + "learning_rate": 8.277176353075818e-06, + "loss": 0.1056, + "step": 2594 + }, + { + "epoch": 1.036741214057508, + "grad_norm": 1.168733586560627, + "learning_rate": 8.27366503843668e-06, + "loss": 0.1159, + "step": 2596 + }, + { + "epoch": 1.0375399361022364, + "grad_norm": 0.9639822523537996, + "learning_rate": 8.270150895659824e-06, + "loss": 0.1141, + "step": 2598 + }, + { + "epoch": 1.038338658146965, + "grad_norm": 1.080334562027545, + "learning_rate": 8.266633927781135e-06, + "loss": 0.097, + "step": 2600 + }, + { + "epoch": 1.0391373801916932, + "grad_norm": 1.0212322282727473, + "learning_rate": 8.263114137838946e-06, + "loss": 0.1141, + "step": 2602 + }, + { + "epoch": 1.0399361022364217, + "grad_norm": 1.0394948988809494, + "learning_rate": 8.259591528874023e-06, + "loss": 0.1101, + "step": 2604 + }, + { + "epoch": 1.04073482428115, + "grad_norm": 1.3623022308744146, + "learning_rate": 8.256066103929566e-06, + "loss": 0.1273, + "step": 2606 + }, + { + "epoch": 1.0415335463258786, + "grad_norm": 1.022587558449341, + "learning_rate": 8.25253786605121e-06, + "loss": 0.1124, + "step": 2608 + }, + { + "epoch": 1.042332268370607, + "grad_norm": 0.986241541472163, + "learning_rate": 8.249006818287017e-06, + "loss": 0.1017, + "step": 2610 + }, + { + "epoch": 1.0431309904153354, + "grad_norm": 1.1218445133529082, + "learning_rate": 8.245472963687484e-06, + "loss": 0.1051, + "step": 2612 + }, + { + "epoch": 1.043929712460064, + "grad_norm": 0.9853007519672737, + "learning_rate": 8.241936305305526e-06, + "loss": 0.0933, + "step": 2614 + }, + { + "epoch": 1.0447284345047922, + "grad_norm": 0.9238979931931091, + "learning_rate": 8.238396846196483e-06, + "loss": 0.0964, + "step": 2616 + }, + { + "epoch": 1.0455271565495208, + "grad_norm": 1.145209666797251, + "learning_rate": 8.23485458941811e-06, + "loss": 0.1178, + "step": 2618 + }, + { + "epoch": 1.0463258785942493, + "grad_norm": 1.128182977350629, + "learning_rate": 8.231309538030586e-06, + "loss": 0.1141, + "step": 2620 + }, + { + "epoch": 1.0471246006389776, + "grad_norm": 0.9914083611784468, + "learning_rate": 8.2277616950965e-06, + "loss": 0.0989, + "step": 2622 + }, + { + "epoch": 1.0479233226837061, + "grad_norm": 1.1620950439007605, + "learning_rate": 8.224211063680854e-06, + "loss": 0.1215, + "step": 2624 + }, + { + "epoch": 1.0487220447284344, + "grad_norm": 1.1508342917411967, + "learning_rate": 8.220657646851059e-06, + "loss": 0.1205, + "step": 2626 + }, + { + "epoch": 1.049520766773163, + "grad_norm": 1.2150567330620372, + "learning_rate": 8.217101447676929e-06, + "loss": 0.106, + "step": 2628 + }, + { + "epoch": 1.0503194888178913, + "grad_norm": 1.1152102769811438, + "learning_rate": 8.213542469230688e-06, + "loss": 0.1223, + "step": 2630 + }, + { + "epoch": 1.0511182108626198, + "grad_norm": 1.1807098190243748, + "learning_rate": 8.209980714586955e-06, + "loss": 0.1122, + "step": 2632 + }, + { + "epoch": 1.0519169329073483, + "grad_norm": 1.1286829181729654, + "learning_rate": 8.206416186822753e-06, + "loss": 0.1101, + "step": 2634 + }, + { + "epoch": 1.0527156549520766, + "grad_norm": 1.0697286341076933, + "learning_rate": 8.202848889017494e-06, + "loss": 0.1112, + "step": 2636 + }, + { + "epoch": 1.0535143769968052, + "grad_norm": 1.0611961961878171, + "learning_rate": 8.19927882425299e-06, + "loss": 0.1079, + "step": 2638 + }, + { + "epoch": 1.0543130990415335, + "grad_norm": 1.0124139526923805, + "learning_rate": 8.195705995613436e-06, + "loss": 0.0996, + "step": 2640 + }, + { + "epoch": 1.055111821086262, + "grad_norm": 1.108195697612016, + "learning_rate": 8.192130406185425e-06, + "loss": 0.1107, + "step": 2642 + }, + { + "epoch": 1.0559105431309903, + "grad_norm": 1.1111577119822706, + "learning_rate": 8.188552059057924e-06, + "loss": 0.116, + "step": 2644 + }, + { + "epoch": 1.0567092651757188, + "grad_norm": 1.0697354896619498, + "learning_rate": 8.184970957322287e-06, + "loss": 0.1026, + "step": 2646 + }, + { + "epoch": 1.0575079872204474, + "grad_norm": 1.1184255850296823, + "learning_rate": 8.181387104072252e-06, + "loss": 0.1008, + "step": 2648 + }, + { + "epoch": 1.0583067092651757, + "grad_norm": 1.123703636746383, + "learning_rate": 8.177800502403928e-06, + "loss": 0.1234, + "step": 2650 + }, + { + "epoch": 1.0591054313099042, + "grad_norm": 1.1145099095973394, + "learning_rate": 8.1742111554158e-06, + "loss": 0.1038, + "step": 2652 + }, + { + "epoch": 1.0599041533546325, + "grad_norm": 1.0924462066716556, + "learning_rate": 8.170619066208723e-06, + "loss": 0.1026, + "step": 2654 + }, + { + "epoch": 1.060702875399361, + "grad_norm": 1.275295631090924, + "learning_rate": 8.167024237885927e-06, + "loss": 0.1202, + "step": 2656 + }, + { + "epoch": 1.0615015974440896, + "grad_norm": 1.001711382132048, + "learning_rate": 8.163426673553002e-06, + "loss": 0.1028, + "step": 2658 + }, + { + "epoch": 1.0623003194888179, + "grad_norm": 1.090098993511894, + "learning_rate": 8.159826376317906e-06, + "loss": 0.1074, + "step": 2660 + }, + { + "epoch": 1.0630990415335464, + "grad_norm": 1.0661454669618209, + "learning_rate": 8.156223349290957e-06, + "loss": 0.1101, + "step": 2662 + }, + { + "epoch": 1.0638977635782747, + "grad_norm": 1.256680601891414, + "learning_rate": 8.152617595584827e-06, + "loss": 0.1085, + "step": 2664 + }, + { + "epoch": 1.0646964856230032, + "grad_norm": 1.0814675764007797, + "learning_rate": 8.149009118314549e-06, + "loss": 0.1041, + "step": 2666 + }, + { + "epoch": 1.0654952076677315, + "grad_norm": 1.055396058607972, + "learning_rate": 8.145397920597505e-06, + "loss": 0.1012, + "step": 2668 + }, + { + "epoch": 1.06629392971246, + "grad_norm": 1.0902362525938407, + "learning_rate": 8.141784005553433e-06, + "loss": 0.1129, + "step": 2670 + }, + { + "epoch": 1.0670926517571886, + "grad_norm": 1.040946925167894, + "learning_rate": 8.138167376304411e-06, + "loss": 0.1032, + "step": 2672 + }, + { + "epoch": 1.067891373801917, + "grad_norm": 1.1293507367739193, + "learning_rate": 8.134548035974868e-06, + "loss": 0.1093, + "step": 2674 + }, + { + "epoch": 1.0686900958466454, + "grad_norm": 1.1475487980987538, + "learning_rate": 8.13092598769157e-06, + "loss": 0.1168, + "step": 2676 + }, + { + "epoch": 1.0694888178913737, + "grad_norm": 1.0688926041290217, + "learning_rate": 8.127301234583627e-06, + "loss": 0.1209, + "step": 2678 + }, + { + "epoch": 1.0702875399361023, + "grad_norm": 1.1130827659156695, + "learning_rate": 8.123673779782481e-06, + "loss": 0.1112, + "step": 2680 + }, + { + "epoch": 1.0710862619808306, + "grad_norm": 0.999458562659492, + "learning_rate": 8.120043626421915e-06, + "loss": 0.1036, + "step": 2682 + }, + { + "epoch": 1.071884984025559, + "grad_norm": 1.1445057181311435, + "learning_rate": 8.116410777638035e-06, + "loss": 0.1088, + "step": 2684 + }, + { + "epoch": 1.0726837060702876, + "grad_norm": 1.060266912369435, + "learning_rate": 8.112775236569282e-06, + "loss": 0.1024, + "step": 2686 + }, + { + "epoch": 1.073482428115016, + "grad_norm": 1.0993996765043779, + "learning_rate": 8.10913700635642e-06, + "loss": 0.1037, + "step": 2688 + }, + { + "epoch": 1.0742811501597445, + "grad_norm": 1.039132296487209, + "learning_rate": 8.105496090142535e-06, + "loss": 0.1075, + "step": 2690 + }, + { + "epoch": 1.0750798722044728, + "grad_norm": 1.1082486440070727, + "learning_rate": 8.101852491073036e-06, + "loss": 0.105, + "step": 2692 + }, + { + "epoch": 1.0758785942492013, + "grad_norm": 1.1651294916681243, + "learning_rate": 8.098206212295652e-06, + "loss": 0.1154, + "step": 2694 + }, + { + "epoch": 1.0766773162939298, + "grad_norm": 1.172460162145474, + "learning_rate": 8.094557256960419e-06, + "loss": 0.116, + "step": 2696 + }, + { + "epoch": 1.0774760383386581, + "grad_norm": 0.9459826583314704, + "learning_rate": 8.090905628219693e-06, + "loss": 0.1068, + "step": 2698 + }, + { + "epoch": 1.0782747603833867, + "grad_norm": 1.0709333935677605, + "learning_rate": 8.087251329228136e-06, + "loss": 0.0917, + "step": 2700 + }, + { + "epoch": 1.079073482428115, + "grad_norm": 1.1016086096260114, + "learning_rate": 8.083594363142717e-06, + "loss": 0.1144, + "step": 2702 + }, + { + "epoch": 1.0798722044728435, + "grad_norm": 1.0018060470975783, + "learning_rate": 8.079934733122708e-06, + "loss": 0.1071, + "step": 2704 + }, + { + "epoch": 1.0806709265175718, + "grad_norm": 1.145863183921058, + "learning_rate": 8.07627244232969e-06, + "loss": 0.1177, + "step": 2706 + }, + { + "epoch": 1.0814696485623003, + "grad_norm": 1.0861499113619286, + "learning_rate": 8.07260749392753e-06, + "loss": 0.116, + "step": 2708 + }, + { + "epoch": 1.0822683706070289, + "grad_norm": 1.0911649754162966, + "learning_rate": 8.068939891082401e-06, + "loss": 0.1092, + "step": 2710 + }, + { + "epoch": 1.0830670926517572, + "grad_norm": 1.1624721618006877, + "learning_rate": 8.065269636962765e-06, + "loss": 0.1035, + "step": 2712 + }, + { + "epoch": 1.0838658146964857, + "grad_norm": 1.0820597055920262, + "learning_rate": 8.061596734739377e-06, + "loss": 0.1005, + "step": 2714 + }, + { + "epoch": 1.084664536741214, + "grad_norm": 1.1521338536239825, + "learning_rate": 8.057921187585274e-06, + "loss": 0.1021, + "step": 2716 + }, + { + "epoch": 1.0854632587859425, + "grad_norm": 1.088468478053058, + "learning_rate": 8.054242998675787e-06, + "loss": 0.1133, + "step": 2718 + }, + { + "epoch": 1.0862619808306708, + "grad_norm": 1.139917756342621, + "learning_rate": 8.05056217118852e-06, + "loss": 0.108, + "step": 2720 + }, + { + "epoch": 1.0870607028753994, + "grad_norm": 1.128084026950435, + "learning_rate": 8.046878708303363e-06, + "loss": 0.1026, + "step": 2722 + }, + { + "epoch": 1.0878594249201279, + "grad_norm": 1.1291139425698837, + "learning_rate": 8.04319261320248e-06, + "loss": 0.1095, + "step": 2724 + }, + { + "epoch": 1.0886581469648562, + "grad_norm": 1.3447212908276018, + "learning_rate": 8.039503889070306e-06, + "loss": 0.1194, + "step": 2726 + }, + { + "epoch": 1.0894568690095847, + "grad_norm": 1.0745254740908237, + "learning_rate": 8.035812539093557e-06, + "loss": 0.1093, + "step": 2728 + }, + { + "epoch": 1.090255591054313, + "grad_norm": 1.0828953637676413, + "learning_rate": 8.032118566461206e-06, + "loss": 0.1108, + "step": 2730 + }, + { + "epoch": 1.0910543130990416, + "grad_norm": 1.0548110887259554, + "learning_rate": 8.0284219743645e-06, + "loss": 0.1062, + "step": 2732 + }, + { + "epoch": 1.09185303514377, + "grad_norm": 1.1071922672380121, + "learning_rate": 8.024722765996944e-06, + "loss": 0.1037, + "step": 2734 + }, + { + "epoch": 1.0926517571884984, + "grad_norm": 1.230092777293863, + "learning_rate": 8.021020944554305e-06, + "loss": 0.1154, + "step": 2736 + }, + { + "epoch": 1.093450479233227, + "grad_norm": 1.0630731902655604, + "learning_rate": 8.017316513234607e-06, + "loss": 0.1152, + "step": 2738 + }, + { + "epoch": 1.0942492012779552, + "grad_norm": 1.1292429935560224, + "learning_rate": 8.013609475238132e-06, + "loss": 0.11, + "step": 2740 + }, + { + "epoch": 1.0950479233226837, + "grad_norm": 1.1249867737292523, + "learning_rate": 8.009899833767407e-06, + "loss": 0.1032, + "step": 2742 + }, + { + "epoch": 1.095846645367412, + "grad_norm": 1.430619873698458, + "learning_rate": 8.006187592027215e-06, + "loss": 0.1178, + "step": 2744 + }, + { + "epoch": 1.0966453674121406, + "grad_norm": 1.110036751210643, + "learning_rate": 8.002472753224582e-06, + "loss": 0.1145, + "step": 2746 + }, + { + "epoch": 1.097444089456869, + "grad_norm": 0.957513352594371, + "learning_rate": 7.998755320568778e-06, + "loss": 0.1018, + "step": 2748 + }, + { + "epoch": 1.0982428115015974, + "grad_norm": 1.042054355253303, + "learning_rate": 7.995035297271313e-06, + "loss": 0.1013, + "step": 2750 + }, + { + "epoch": 1.099041533546326, + "grad_norm": 1.1532599431868966, + "learning_rate": 7.991312686545939e-06, + "loss": 0.1173, + "step": 2752 + }, + { + "epoch": 1.0998402555910542, + "grad_norm": 1.2036914344298868, + "learning_rate": 7.987587491608636e-06, + "loss": 0.1186, + "step": 2754 + }, + { + "epoch": 1.1006389776357828, + "grad_norm": 1.082895054115626, + "learning_rate": 7.983859715677627e-06, + "loss": 0.1006, + "step": 2756 + }, + { + "epoch": 1.101437699680511, + "grad_norm": 1.1041877219800416, + "learning_rate": 7.980129361973355e-06, + "loss": 0.1061, + "step": 2758 + }, + { + "epoch": 1.1022364217252396, + "grad_norm": 1.0649445718677086, + "learning_rate": 7.976396433718492e-06, + "loss": 0.1034, + "step": 2760 + }, + { + "epoch": 1.1030351437699681, + "grad_norm": 1.1926141701225625, + "learning_rate": 7.97266093413794e-06, + "loss": 0.1158, + "step": 2762 + }, + { + "epoch": 1.1038338658146964, + "grad_norm": 1.2011616729379744, + "learning_rate": 7.968922866458812e-06, + "loss": 0.1166, + "step": 2764 + }, + { + "epoch": 1.104632587859425, + "grad_norm": 1.1032014477011283, + "learning_rate": 7.965182233910453e-06, + "loss": 0.1079, + "step": 2766 + }, + { + "epoch": 1.1054313099041533, + "grad_norm": 1.0491357869845093, + "learning_rate": 7.961439039724413e-06, + "loss": 0.1185, + "step": 2768 + }, + { + "epoch": 1.1062300319488818, + "grad_norm": 1.1311466555151601, + "learning_rate": 7.95769328713446e-06, + "loss": 0.1182, + "step": 2770 + }, + { + "epoch": 1.1070287539936103, + "grad_norm": 1.2237403558053657, + "learning_rate": 7.953944979376567e-06, + "loss": 0.1097, + "step": 2772 + }, + { + "epoch": 1.1078274760383386, + "grad_norm": 1.2431901555713765, + "learning_rate": 7.950194119688922e-06, + "loss": 0.1329, + "step": 2774 + }, + { + "epoch": 1.1086261980830672, + "grad_norm": 1.0880599521982486, + "learning_rate": 7.946440711311913e-06, + "loss": 0.1162, + "step": 2776 + }, + { + "epoch": 1.1094249201277955, + "grad_norm": 1.1073519932980254, + "learning_rate": 7.942684757488133e-06, + "loss": 0.1027, + "step": 2778 + }, + { + "epoch": 1.110223642172524, + "grad_norm": 0.9960392472144209, + "learning_rate": 7.938926261462366e-06, + "loss": 0.1002, + "step": 2780 + }, + { + "epoch": 1.1110223642172523, + "grad_norm": 1.1223224889936787, + "learning_rate": 7.935165226481603e-06, + "loss": 0.1138, + "step": 2782 + }, + { + "epoch": 1.1118210862619808, + "grad_norm": 1.0435684073325897, + "learning_rate": 7.931401655795021e-06, + "loss": 0.1074, + "step": 2784 + }, + { + "epoch": 1.1126198083067094, + "grad_norm": 1.0106236333934397, + "learning_rate": 7.92763555265399e-06, + "loss": 0.0992, + "step": 2786 + }, + { + "epoch": 1.1134185303514377, + "grad_norm": 1.2351704038049207, + "learning_rate": 7.923866920312069e-06, + "loss": 0.1091, + "step": 2788 + }, + { + "epoch": 1.1142172523961662, + "grad_norm": 1.2422399561551012, + "learning_rate": 7.920095762025e-06, + "loss": 0.1189, + "step": 2790 + }, + { + "epoch": 1.1150159744408945, + "grad_norm": 1.0941653259345385, + "learning_rate": 7.916322081050708e-06, + "loss": 0.1003, + "step": 2792 + }, + { + "epoch": 1.115814696485623, + "grad_norm": 1.1760490829802384, + "learning_rate": 7.912545880649299e-06, + "loss": 0.1267, + "step": 2794 + }, + { + "epoch": 1.1166134185303513, + "grad_norm": 1.158226709859037, + "learning_rate": 7.90876716408305e-06, + "loss": 0.0968, + "step": 2796 + }, + { + "epoch": 1.1174121405750799, + "grad_norm": 1.1375503575879282, + "learning_rate": 7.904985934616419e-06, + "loss": 0.1077, + "step": 2798 + }, + { + "epoch": 1.1182108626198084, + "grad_norm": 0.9183058494090953, + "learning_rate": 7.90120219551603e-06, + "loss": 0.1032, + "step": 2800 + }, + { + "epoch": 1.1190095846645367, + "grad_norm": 1.0610420482924088, + "learning_rate": 7.897415950050676e-06, + "loss": 0.1111, + "step": 2802 + }, + { + "epoch": 1.1198083067092652, + "grad_norm": 1.1233765060725236, + "learning_rate": 7.893627201491319e-06, + "loss": 0.0985, + "step": 2804 + }, + { + "epoch": 1.1206070287539935, + "grad_norm": 1.0398157065300724, + "learning_rate": 7.889835953111075e-06, + "loss": 0.1164, + "step": 2806 + }, + { + "epoch": 1.121405750798722, + "grad_norm": 1.0997762546624286, + "learning_rate": 7.88604220818523e-06, + "loss": 0.1097, + "step": 2808 + }, + { + "epoch": 1.1222044728434506, + "grad_norm": 1.0435817236551026, + "learning_rate": 7.88224596999122e-06, + "loss": 0.1046, + "step": 2810 + }, + { + "epoch": 1.123003194888179, + "grad_norm": 1.2695278909995464, + "learning_rate": 7.878447241808634e-06, + "loss": 0.1102, + "step": 2812 + }, + { + "epoch": 1.1238019169329074, + "grad_norm": 1.081882759084368, + "learning_rate": 7.874646026919219e-06, + "loss": 0.114, + "step": 2814 + }, + { + "epoch": 1.1246006389776357, + "grad_norm": 1.2277213100870545, + "learning_rate": 7.870842328606863e-06, + "loss": 0.118, + "step": 2816 + }, + { + "epoch": 1.1253993610223643, + "grad_norm": 1.0784722478718025, + "learning_rate": 7.867036150157603e-06, + "loss": 0.1132, + "step": 2818 + }, + { + "epoch": 1.1261980830670926, + "grad_norm": 1.1255206572769305, + "learning_rate": 7.86322749485962e-06, + "loss": 0.1101, + "step": 2820 + }, + { + "epoch": 1.126996805111821, + "grad_norm": 1.162832702222223, + "learning_rate": 7.85941636600323e-06, + "loss": 0.1161, + "step": 2822 + }, + { + "epoch": 1.1277955271565494, + "grad_norm": 1.1447345276638636, + "learning_rate": 7.85560276688089e-06, + "loss": 0.1162, + "step": 2824 + }, + { + "epoch": 1.128594249201278, + "grad_norm": 1.187515171501569, + "learning_rate": 7.851786700787191e-06, + "loss": 0.1046, + "step": 2826 + }, + { + "epoch": 1.1293929712460065, + "grad_norm": 1.1412349222191334, + "learning_rate": 7.847968171018851e-06, + "loss": 0.1097, + "step": 2828 + }, + { + "epoch": 1.1301916932907348, + "grad_norm": 0.9891747869008474, + "learning_rate": 7.844147180874723e-06, + "loss": 0.0948, + "step": 2830 + }, + { + "epoch": 1.1309904153354633, + "grad_norm": 1.140395014058789, + "learning_rate": 7.84032373365578e-06, + "loss": 0.1038, + "step": 2832 + }, + { + "epoch": 1.1317891373801916, + "grad_norm": 1.0508986553012427, + "learning_rate": 7.836497832665119e-06, + "loss": 0.1091, + "step": 2834 + }, + { + "epoch": 1.1325878594249201, + "grad_norm": 1.016036817043171, + "learning_rate": 7.83266948120796e-06, + "loss": 0.0973, + "step": 2836 + }, + { + "epoch": 1.1333865814696487, + "grad_norm": 1.130667401388963, + "learning_rate": 7.828838682591635e-06, + "loss": 0.1207, + "step": 2838 + }, + { + "epoch": 1.134185303514377, + "grad_norm": 1.039182888114013, + "learning_rate": 7.825005440125595e-06, + "loss": 0.106, + "step": 2840 + }, + { + "epoch": 1.1349840255591055, + "grad_norm": 1.1559863889230508, + "learning_rate": 7.821169757121395e-06, + "loss": 0.1113, + "step": 2842 + }, + { + "epoch": 1.1357827476038338, + "grad_norm": 1.0074524350682563, + "learning_rate": 7.817331636892708e-06, + "loss": 0.1171, + "step": 2844 + }, + { + "epoch": 1.1365814696485623, + "grad_norm": 1.1004441537401717, + "learning_rate": 7.813491082755306e-06, + "loss": 0.0954, + "step": 2846 + }, + { + "epoch": 1.1373801916932909, + "grad_norm": 1.0467946843112212, + "learning_rate": 7.809648098027067e-06, + "loss": 0.1098, + "step": 2848 + }, + { + "epoch": 1.1381789137380192, + "grad_norm": 1.1629386911794455, + "learning_rate": 7.805802686027965e-06, + "loss": 0.1186, + "step": 2850 + }, + { + "epoch": 1.1389776357827477, + "grad_norm": 1.0232023211944392, + "learning_rate": 7.801954850080075e-06, + "loss": 0.0925, + "step": 2852 + }, + { + "epoch": 1.139776357827476, + "grad_norm": 1.2284293666842674, + "learning_rate": 7.798104593507562e-06, + "loss": 0.1137, + "step": 2854 + }, + { + "epoch": 1.1405750798722045, + "grad_norm": 1.0439656778352415, + "learning_rate": 7.794251919636687e-06, + "loss": 0.1086, + "step": 2856 + }, + { + "epoch": 1.1413738019169328, + "grad_norm": 1.038970880098722, + "learning_rate": 7.790396831795792e-06, + "loss": 0.101, + "step": 2858 + }, + { + "epoch": 1.1421725239616614, + "grad_norm": 1.0250967712229946, + "learning_rate": 7.786539333315316e-06, + "loss": 0.0997, + "step": 2860 + }, + { + "epoch": 1.1429712460063897, + "grad_norm": 1.2328372564490289, + "learning_rate": 7.782679427527768e-06, + "loss": 0.1113, + "step": 2862 + }, + { + "epoch": 1.1437699680511182, + "grad_norm": 1.0417991972771192, + "learning_rate": 7.778817117767748e-06, + "loss": 0.1005, + "step": 2864 + }, + { + "epoch": 1.1445686900958467, + "grad_norm": 1.0282768724124047, + "learning_rate": 7.77495240737192e-06, + "loss": 0.1025, + "step": 2866 + }, + { + "epoch": 1.145367412140575, + "grad_norm": 1.0115415567641788, + "learning_rate": 7.771085299679033e-06, + "loss": 0.0994, + "step": 2868 + }, + { + "epoch": 1.1461661341853036, + "grad_norm": 1.122551661496836, + "learning_rate": 7.767215798029906e-06, + "loss": 0.1009, + "step": 2870 + }, + { + "epoch": 1.1469648562300319, + "grad_norm": 1.2689706066461852, + "learning_rate": 7.76334390576742e-06, + "loss": 0.117, + "step": 2872 + }, + { + "epoch": 1.1477635782747604, + "grad_norm": 1.1324363654209109, + "learning_rate": 7.759469626236523e-06, + "loss": 0.1068, + "step": 2874 + }, + { + "epoch": 1.148562300319489, + "grad_norm": 1.162099881311204, + "learning_rate": 7.75559296278423e-06, + "loss": 0.1149, + "step": 2876 + }, + { + "epoch": 1.1493610223642172, + "grad_norm": 1.1144526194493574, + "learning_rate": 7.751713918759612e-06, + "loss": 0.1189, + "step": 2878 + }, + { + "epoch": 1.1501597444089458, + "grad_norm": 1.0366214429764016, + "learning_rate": 7.747832497513797e-06, + "loss": 0.1143, + "step": 2880 + }, + { + "epoch": 1.150958466453674, + "grad_norm": 1.033901501799641, + "learning_rate": 7.743948702399966e-06, + "loss": 0.1038, + "step": 2882 + }, + { + "epoch": 1.1517571884984026, + "grad_norm": 1.091707818538779, + "learning_rate": 7.740062536773352e-06, + "loss": 0.1018, + "step": 2884 + }, + { + "epoch": 1.1525559105431311, + "grad_norm": 1.086352546060504, + "learning_rate": 7.736174003991234e-06, + "loss": 0.113, + "step": 2886 + }, + { + "epoch": 1.1533546325878594, + "grad_norm": 1.0324848912394144, + "learning_rate": 7.732283107412938e-06, + "loss": 0.1012, + "step": 2888 + }, + { + "epoch": 1.154153354632588, + "grad_norm": 1.180351977551197, + "learning_rate": 7.728389850399834e-06, + "loss": 0.107, + "step": 2890 + }, + { + "epoch": 1.1549520766773163, + "grad_norm": 1.1213498513689995, + "learning_rate": 7.724494236315327e-06, + "loss": 0.118, + "step": 2892 + }, + { + "epoch": 1.1557507987220448, + "grad_norm": 1.1297732537347598, + "learning_rate": 7.72059626852486e-06, + "loss": 0.1052, + "step": 2894 + }, + { + "epoch": 1.156549520766773, + "grad_norm": 1.1260273445244882, + "learning_rate": 7.71669595039591e-06, + "loss": 0.1231, + "step": 2896 + }, + { + "epoch": 1.1573482428115016, + "grad_norm": 1.0804349790876964, + "learning_rate": 7.712793285297982e-06, + "loss": 0.102, + "step": 2898 + }, + { + "epoch": 1.15814696485623, + "grad_norm": 1.0887409006629178, + "learning_rate": 7.70888827660261e-06, + "loss": 0.1038, + "step": 2900 + }, + { + "epoch": 1.1589456869009584, + "grad_norm": 1.3253348724046994, + "learning_rate": 7.704980927683359e-06, + "loss": 0.1085, + "step": 2902 + }, + { + "epoch": 1.159744408945687, + "grad_norm": 1.2297063221974824, + "learning_rate": 7.701071241915804e-06, + "loss": 0.1114, + "step": 2904 + }, + { + "epoch": 1.1605431309904153, + "grad_norm": 1.1163333185767002, + "learning_rate": 7.697159222677544e-06, + "loss": 0.1065, + "step": 2906 + }, + { + "epoch": 1.1613418530351438, + "grad_norm": 1.108863651299088, + "learning_rate": 7.693244873348197e-06, + "loss": 0.1128, + "step": 2908 + }, + { + "epoch": 1.1621405750798721, + "grad_norm": 1.0477128725454634, + "learning_rate": 7.689328197309394e-06, + "loss": 0.1014, + "step": 2910 + }, + { + "epoch": 1.1629392971246006, + "grad_norm": 1.1667151644208233, + "learning_rate": 7.685409197944768e-06, + "loss": 0.0998, + "step": 2912 + }, + { + "epoch": 1.1637380191693292, + "grad_norm": 1.2632234446787278, + "learning_rate": 7.681487878639968e-06, + "loss": 0.1067, + "step": 2914 + }, + { + "epoch": 1.1645367412140575, + "grad_norm": 1.288703128597724, + "learning_rate": 7.677564242782645e-06, + "loss": 0.1093, + "step": 2916 + }, + { + "epoch": 1.165335463258786, + "grad_norm": 1.151244739656633, + "learning_rate": 7.673638293762447e-06, + "loss": 0.1086, + "step": 2918 + }, + { + "epoch": 1.1661341853035143, + "grad_norm": 1.1676995737767575, + "learning_rate": 7.669710034971025e-06, + "loss": 0.0907, + "step": 2920 + }, + { + "epoch": 1.1669329073482428, + "grad_norm": 1.0099832716415298, + "learning_rate": 7.665779469802024e-06, + "loss": 0.1039, + "step": 2922 + }, + { + "epoch": 1.1677316293929714, + "grad_norm": 1.2160567960293138, + "learning_rate": 7.661846601651081e-06, + "loss": 0.1054, + "step": 2924 + }, + { + "epoch": 1.1685303514376997, + "grad_norm": 1.2002285628920948, + "learning_rate": 7.657911433915824e-06, + "loss": 0.1236, + "step": 2926 + }, + { + "epoch": 1.1693290734824282, + "grad_norm": 1.1011958816133838, + "learning_rate": 7.653973969995866e-06, + "loss": 0.1043, + "step": 2928 + }, + { + "epoch": 1.1701277955271565, + "grad_norm": 1.1222330810291938, + "learning_rate": 7.650034213292804e-06, + "loss": 0.0964, + "step": 2930 + }, + { + "epoch": 1.170926517571885, + "grad_norm": 1.191923146958323, + "learning_rate": 7.646092167210217e-06, + "loss": 0.1247, + "step": 2932 + }, + { + "epoch": 1.1717252396166133, + "grad_norm": 1.1057104845426344, + "learning_rate": 7.642147835153659e-06, + "loss": 0.1151, + "step": 2934 + }, + { + "epoch": 1.1725239616613419, + "grad_norm": 1.1144118547772344, + "learning_rate": 7.638201220530664e-06, + "loss": 0.106, + "step": 2936 + }, + { + "epoch": 1.1733226837060702, + "grad_norm": 1.03554140015377, + "learning_rate": 7.634252326750733e-06, + "loss": 0.1093, + "step": 2938 + }, + { + "epoch": 1.1741214057507987, + "grad_norm": 1.067085791357077, + "learning_rate": 7.630301157225336e-06, + "loss": 0.106, + "step": 2940 + }, + { + "epoch": 1.1749201277955272, + "grad_norm": 1.0936909170630784, + "learning_rate": 7.626347715367912e-06, + "loss": 0.1063, + "step": 2942 + }, + { + "epoch": 1.1757188498402555, + "grad_norm": 1.3484514118083, + "learning_rate": 7.622392004593862e-06, + "loss": 0.0992, + "step": 2944 + }, + { + "epoch": 1.176517571884984, + "grad_norm": 1.169969792758724, + "learning_rate": 7.618434028320547e-06, + "loss": 0.1107, + "step": 2946 + }, + { + "epoch": 1.1773162939297124, + "grad_norm": 1.0742999979059649, + "learning_rate": 7.614473789967285e-06, + "loss": 0.1039, + "step": 2948 + }, + { + "epoch": 1.178115015974441, + "grad_norm": 1.1077968599944528, + "learning_rate": 7.610511292955347e-06, + "loss": 0.1006, + "step": 2950 + }, + { + "epoch": 1.1789137380191694, + "grad_norm": 1.1624715070814888, + "learning_rate": 7.60654654070796e-06, + "loss": 0.0994, + "step": 2952 + }, + { + "epoch": 1.1797124600638977, + "grad_norm": 1.0968086205411158, + "learning_rate": 7.602579536650292e-06, + "loss": 0.1119, + "step": 2954 + }, + { + "epoch": 1.1805111821086263, + "grad_norm": 1.168735576631901, + "learning_rate": 7.59861028420946e-06, + "loss": 0.1136, + "step": 2956 + }, + { + "epoch": 1.1813099041533546, + "grad_norm": 1.2384061026226614, + "learning_rate": 7.594638786814526e-06, + "loss": 0.1176, + "step": 2958 + }, + { + "epoch": 1.182108626198083, + "grad_norm": 1.241137963433562, + "learning_rate": 7.59066504789649e-06, + "loss": 0.12, + "step": 2960 + }, + { + "epoch": 1.1829073482428114, + "grad_norm": 1.1850341035866552, + "learning_rate": 7.586689070888284e-06, + "loss": 0.119, + "step": 2962 + }, + { + "epoch": 1.18370607028754, + "grad_norm": 1.1220698860625842, + "learning_rate": 7.58271085922478e-06, + "loss": 0.1121, + "step": 2964 + }, + { + "epoch": 1.1845047923322685, + "grad_norm": 1.037375641928384, + "learning_rate": 7.578730416342777e-06, + "loss": 0.1102, + "step": 2966 + }, + { + "epoch": 1.1853035143769968, + "grad_norm": 1.2171636076950145, + "learning_rate": 7.574747745681e-06, + "loss": 0.1163, + "step": 2968 + }, + { + "epoch": 1.1861022364217253, + "grad_norm": 1.1555647145995425, + "learning_rate": 7.5707628506801015e-06, + "loss": 0.1033, + "step": 2970 + }, + { + "epoch": 1.1869009584664536, + "grad_norm": 1.3403187676618649, + "learning_rate": 7.566775734782656e-06, + "loss": 0.1066, + "step": 2972 + }, + { + "epoch": 1.1876996805111821, + "grad_norm": 1.1035130819507677, + "learning_rate": 7.562786401433156e-06, + "loss": 0.1073, + "step": 2974 + }, + { + "epoch": 1.1884984025559104, + "grad_norm": 1.0839124042096482, + "learning_rate": 7.558794854078006e-06, + "loss": 0.1101, + "step": 2976 + }, + { + "epoch": 1.189297124600639, + "grad_norm": 1.019947468283778, + "learning_rate": 7.5548010961655295e-06, + "loss": 0.1033, + "step": 2978 + }, + { + "epoch": 1.1900958466453675, + "grad_norm": 1.0753891042248978, + "learning_rate": 7.5508051311459555e-06, + "loss": 0.109, + "step": 2980 + }, + { + "epoch": 1.1908945686900958, + "grad_norm": 1.0714175737276472, + "learning_rate": 7.546806962471419e-06, + "loss": 0.1017, + "step": 2982 + }, + { + "epoch": 1.1916932907348243, + "grad_norm": 1.0997902745143622, + "learning_rate": 7.542806593595963e-06, + "loss": 0.106, + "step": 2984 + }, + { + "epoch": 1.1924920127795526, + "grad_norm": 1.0225175483161848, + "learning_rate": 7.538804027975528e-06, + "loss": 0.1092, + "step": 2986 + }, + { + "epoch": 1.1932907348242812, + "grad_norm": 1.107891680578376, + "learning_rate": 7.534799269067952e-06, + "loss": 0.1091, + "step": 2988 + }, + { + "epoch": 1.1940894568690097, + "grad_norm": 1.0551747572162382, + "learning_rate": 7.530792320332971e-06, + "loss": 0.1088, + "step": 2990 + }, + { + "epoch": 1.194888178913738, + "grad_norm": 1.0629259318761108, + "learning_rate": 7.526783185232208e-06, + "loss": 0.0937, + "step": 2992 + }, + { + "epoch": 1.1956869009584665, + "grad_norm": 1.1933108946432194, + "learning_rate": 7.522771867229179e-06, + "loss": 0.108, + "step": 2994 + }, + { + "epoch": 1.1964856230031948, + "grad_norm": 1.157609849649065, + "learning_rate": 7.518758369789286e-06, + "loss": 0.1192, + "step": 2996 + }, + { + "epoch": 1.1972843450479234, + "grad_norm": 1.167506126596324, + "learning_rate": 7.514742696379809e-06, + "loss": 0.1096, + "step": 2998 + }, + { + "epoch": 1.1980830670926517, + "grad_norm": 1.0333315229582538, + "learning_rate": 7.51072485046991e-06, + "loss": 0.1085, + "step": 3000 + }, + { + "epoch": 1.1980830670926517, + "eval_loss": 0.16306817531585693, + "eval_runtime": 417.6568, + "eval_samples_per_second": 42.635, + "eval_steps_per_second": 5.33, + "step": 3000 + }, + { + "epoch": 1.1988817891373802, + "grad_norm": 1.192881477161516, + "learning_rate": 7.5067048355306334e-06, + "loss": 0.1082, + "step": 3002 + }, + { + "epoch": 1.1996805111821087, + "grad_norm": 1.2452028368788162, + "learning_rate": 7.50268265503489e-06, + "loss": 0.1156, + "step": 3004 + }, + { + "epoch": 1.200479233226837, + "grad_norm": 1.1510650440674393, + "learning_rate": 7.498658312457464e-06, + "loss": 0.1133, + "step": 3006 + }, + { + "epoch": 1.2012779552715656, + "grad_norm": 1.0520484152103042, + "learning_rate": 7.494631811275008e-06, + "loss": 0.1118, + "step": 3008 + }, + { + "epoch": 1.2020766773162939, + "grad_norm": 1.0970981862348477, + "learning_rate": 7.49060315496604e-06, + "loss": 0.1072, + "step": 3010 + }, + { + "epoch": 1.2028753993610224, + "grad_norm": 1.1102889095542705, + "learning_rate": 7.486572347010937e-06, + "loss": 0.1056, + "step": 3012 + }, + { + "epoch": 1.2036741214057507, + "grad_norm": 1.178700112165599, + "learning_rate": 7.482539390891941e-06, + "loss": 0.1153, + "step": 3014 + }, + { + "epoch": 1.2044728434504792, + "grad_norm": 1.0483204125787329, + "learning_rate": 7.478504290093138e-06, + "loss": 0.0957, + "step": 3016 + }, + { + "epoch": 1.2052715654952078, + "grad_norm": 1.1855590333372437, + "learning_rate": 7.474467048100484e-06, + "loss": 0.1165, + "step": 3018 + }, + { + "epoch": 1.206070287539936, + "grad_norm": 1.132537958694911, + "learning_rate": 7.470427668401766e-06, + "loss": 0.1087, + "step": 3020 + }, + { + "epoch": 1.2068690095846646, + "grad_norm": 1.2170415750175325, + "learning_rate": 7.466386154486634e-06, + "loss": 0.1147, + "step": 3022 + }, + { + "epoch": 1.207667731629393, + "grad_norm": 1.1082959534882166, + "learning_rate": 7.462342509846571e-06, + "loss": 0.1048, + "step": 3024 + }, + { + "epoch": 1.2084664536741214, + "grad_norm": 1.1141000146197995, + "learning_rate": 7.458296737974905e-06, + "loss": 0.1132, + "step": 3026 + }, + { + "epoch": 1.20926517571885, + "grad_norm": 1.0298899347940416, + "learning_rate": 7.4542488423668005e-06, + "loss": 0.1064, + "step": 3028 + }, + { + "epoch": 1.2100638977635783, + "grad_norm": 1.083163833346684, + "learning_rate": 7.450198826519259e-06, + "loss": 0.1123, + "step": 3030 + }, + { + "epoch": 1.2108626198083068, + "grad_norm": 1.0762705695988668, + "learning_rate": 7.446146693931111e-06, + "loss": 0.1008, + "step": 3032 + }, + { + "epoch": 1.211661341853035, + "grad_norm": 1.1867201378486008, + "learning_rate": 7.442092448103019e-06, + "loss": 0.1256, + "step": 3034 + }, + { + "epoch": 1.2124600638977636, + "grad_norm": 1.1714410675076814, + "learning_rate": 7.438036092537465e-06, + "loss": 0.1039, + "step": 3036 + }, + { + "epoch": 1.213258785942492, + "grad_norm": 1.0370675696738088, + "learning_rate": 7.43397763073876e-06, + "loss": 0.1039, + "step": 3038 + }, + { + "epoch": 1.2140575079872205, + "grad_norm": 1.097720581265046, + "learning_rate": 7.42991706621303e-06, + "loss": 0.1143, + "step": 3040 + }, + { + "epoch": 1.2148562300319488, + "grad_norm": 0.9975002092372904, + "learning_rate": 7.4258544024682245e-06, + "loss": 0.1039, + "step": 3042 + }, + { + "epoch": 1.2156549520766773, + "grad_norm": 1.032389232375386, + "learning_rate": 7.421789643014096e-06, + "loss": 0.1029, + "step": 3044 + }, + { + "epoch": 1.2164536741214058, + "grad_norm": 1.159406643141239, + "learning_rate": 7.417722791362216e-06, + "loss": 0.121, + "step": 3046 + }, + { + "epoch": 1.2172523961661341, + "grad_norm": 1.075616311885778, + "learning_rate": 7.413653851025959e-06, + "loss": 0.1027, + "step": 3048 + }, + { + "epoch": 1.2180511182108626, + "grad_norm": 1.0677123331182898, + "learning_rate": 7.4095828255205085e-06, + "loss": 0.0933, + "step": 3050 + }, + { + "epoch": 1.218849840255591, + "grad_norm": 1.1351445749769558, + "learning_rate": 7.405509718362842e-06, + "loss": 0.1028, + "step": 3052 + }, + { + "epoch": 1.2196485623003195, + "grad_norm": 1.218634180761916, + "learning_rate": 7.401434533071745e-06, + "loss": 0.1146, + "step": 3054 + }, + { + "epoch": 1.220447284345048, + "grad_norm": 1.1682846848481725, + "learning_rate": 7.397357273167789e-06, + "loss": 0.1014, + "step": 3056 + }, + { + "epoch": 1.2212460063897763, + "grad_norm": 1.1641572787088017, + "learning_rate": 7.393277942173345e-06, + "loss": 0.1065, + "step": 3058 + }, + { + "epoch": 1.2220447284345048, + "grad_norm": 1.1514146815699455, + "learning_rate": 7.389196543612567e-06, + "loss": 0.1014, + "step": 3060 + }, + { + "epoch": 1.2228434504792332, + "grad_norm": 1.330134394264511, + "learning_rate": 7.3851130810113995e-06, + "loss": 0.118, + "step": 3062 + }, + { + "epoch": 1.2236421725239617, + "grad_norm": 1.1828426669934808, + "learning_rate": 7.381027557897568e-06, + "loss": 0.1054, + "step": 3064 + }, + { + "epoch": 1.2244408945686902, + "grad_norm": 1.1440466876810054, + "learning_rate": 7.376939977800581e-06, + "loss": 0.1157, + "step": 3066 + }, + { + "epoch": 1.2252396166134185, + "grad_norm": 1.1022139516494007, + "learning_rate": 7.372850344251722e-06, + "loss": 0.0968, + "step": 3068 + }, + { + "epoch": 1.226038338658147, + "grad_norm": 1.069906203066803, + "learning_rate": 7.368758660784048e-06, + "loss": 0.1154, + "step": 3070 + }, + { + "epoch": 1.2268370607028753, + "grad_norm": 1.066501858996855, + "learning_rate": 7.364664930932385e-06, + "loss": 0.1114, + "step": 3072 + }, + { + "epoch": 1.2276357827476039, + "grad_norm": 1.090807997716351, + "learning_rate": 7.360569158233332e-06, + "loss": 0.1058, + "step": 3074 + }, + { + "epoch": 1.2284345047923322, + "grad_norm": 1.1653540017717812, + "learning_rate": 7.356471346225249e-06, + "loss": 0.1141, + "step": 3076 + }, + { + "epoch": 1.2292332268370607, + "grad_norm": 1.2183346386496956, + "learning_rate": 7.352371498448261e-06, + "loss": 0.1086, + "step": 3078 + }, + { + "epoch": 1.230031948881789, + "grad_norm": 1.1495635595373217, + "learning_rate": 7.348269618444248e-06, + "loss": 0.1128, + "step": 3080 + }, + { + "epoch": 1.2308306709265175, + "grad_norm": 1.0657147313743496, + "learning_rate": 7.344165709756847e-06, + "loss": 0.1087, + "step": 3082 + }, + { + "epoch": 1.231629392971246, + "grad_norm": 1.1350112419460496, + "learning_rate": 7.340059775931449e-06, + "loss": 0.1121, + "step": 3084 + }, + { + "epoch": 1.2324281150159744, + "grad_norm": 1.1125316251538973, + "learning_rate": 7.33595182051519e-06, + "loss": 0.1112, + "step": 3086 + }, + { + "epoch": 1.233226837060703, + "grad_norm": 1.1893400382987338, + "learning_rate": 7.331841847056962e-06, + "loss": 0.1047, + "step": 3088 + }, + { + "epoch": 1.2340255591054312, + "grad_norm": 1.2177476387914874, + "learning_rate": 7.3277298591073895e-06, + "loss": 0.1136, + "step": 3090 + }, + { + "epoch": 1.2348242811501597, + "grad_norm": 1.0631740360903161, + "learning_rate": 7.323615860218844e-06, + "loss": 0.1144, + "step": 3092 + }, + { + "epoch": 1.2356230031948883, + "grad_norm": 1.092461962405361, + "learning_rate": 7.319499853945431e-06, + "loss": 0.0995, + "step": 3094 + }, + { + "epoch": 1.2364217252396166, + "grad_norm": 1.0890967378843055, + "learning_rate": 7.315381843842995e-06, + "loss": 0.1033, + "step": 3096 + }, + { + "epoch": 1.237220447284345, + "grad_norm": 1.2131460595478127, + "learning_rate": 7.3112618334691035e-06, + "loss": 0.1159, + "step": 3098 + }, + { + "epoch": 1.2380191693290734, + "grad_norm": 1.2188104551185952, + "learning_rate": 7.307139826383058e-06, + "loss": 0.1053, + "step": 3100 + }, + { + "epoch": 1.238817891373802, + "grad_norm": 1.0889591234624394, + "learning_rate": 7.303015826145886e-06, + "loss": 0.1012, + "step": 3102 + }, + { + "epoch": 1.2396166134185305, + "grad_norm": 1.1040750999295241, + "learning_rate": 7.298889836320334e-06, + "loss": 0.1013, + "step": 3104 + }, + { + "epoch": 1.2404153354632588, + "grad_norm": 1.100669836825261, + "learning_rate": 7.294761860470866e-06, + "loss": 0.1083, + "step": 3106 + }, + { + "epoch": 1.2412140575079873, + "grad_norm": 1.1773345051128925, + "learning_rate": 7.290631902163665e-06, + "loss": 0.11, + "step": 3108 + }, + { + "epoch": 1.2420127795527156, + "grad_norm": 1.2401290593387904, + "learning_rate": 7.286499964966625e-06, + "loss": 0.1203, + "step": 3110 + }, + { + "epoch": 1.2428115015974441, + "grad_norm": 1.1482156063703826, + "learning_rate": 7.282366052449351e-06, + "loss": 0.1122, + "step": 3112 + }, + { + "epoch": 1.2436102236421724, + "grad_norm": 1.4107571214662287, + "learning_rate": 7.278230168183152e-06, + "loss": 0.1183, + "step": 3114 + }, + { + "epoch": 1.244408945686901, + "grad_norm": 1.0325520625453315, + "learning_rate": 7.274092315741042e-06, + "loss": 0.1019, + "step": 3116 + }, + { + "epoch": 1.2452076677316293, + "grad_norm": 1.1107905673766356, + "learning_rate": 7.269952498697734e-06, + "loss": 0.1051, + "step": 3118 + }, + { + "epoch": 1.2460063897763578, + "grad_norm": 1.1281556579434084, + "learning_rate": 7.265810720629643e-06, + "loss": 0.1187, + "step": 3120 + }, + { + "epoch": 1.2468051118210863, + "grad_norm": 1.097863022792409, + "learning_rate": 7.261666985114871e-06, + "loss": 0.1172, + "step": 3122 + }, + { + "epoch": 1.2476038338658146, + "grad_norm": 1.1043922887663058, + "learning_rate": 7.257521295733214e-06, + "loss": 0.0985, + "step": 3124 + }, + { + "epoch": 1.2484025559105432, + "grad_norm": 1.1286717401317095, + "learning_rate": 7.253373656066159e-06, + "loss": 0.1152, + "step": 3126 + }, + { + "epoch": 1.2492012779552715, + "grad_norm": 1.1389798339479362, + "learning_rate": 7.249224069696876e-06, + "loss": 0.1098, + "step": 3128 + }, + { + "epoch": 1.25, + "grad_norm": 1.1580209218201918, + "learning_rate": 7.245072540210213e-06, + "loss": 0.1199, + "step": 3130 + }, + { + "epoch": 1.2507987220447285, + "grad_norm": 1.019767242278647, + "learning_rate": 7.2409190711927015e-06, + "loss": 0.0945, + "step": 3132 + }, + { + "epoch": 1.2515974440894568, + "grad_norm": 1.0864399680532384, + "learning_rate": 7.236763666232546e-06, + "loss": 0.1001, + "step": 3134 + }, + { + "epoch": 1.2523961661341854, + "grad_norm": 1.261597038834277, + "learning_rate": 7.232606328919627e-06, + "loss": 0.1046, + "step": 3136 + }, + { + "epoch": 1.2531948881789137, + "grad_norm": 1.1146593566415728, + "learning_rate": 7.228447062845487e-06, + "loss": 0.1201, + "step": 3138 + }, + { + "epoch": 1.2539936102236422, + "grad_norm": 1.2339789575226763, + "learning_rate": 7.224285871603344e-06, + "loss": 0.1071, + "step": 3140 + }, + { + "epoch": 1.2547923322683707, + "grad_norm": 1.1413529281364962, + "learning_rate": 7.2201227587880704e-06, + "loss": 0.1157, + "step": 3142 + }, + { + "epoch": 1.255591054313099, + "grad_norm": 1.1321943139020432, + "learning_rate": 7.215957727996208e-06, + "loss": 0.1136, + "step": 3144 + }, + { + "epoch": 1.2563897763578276, + "grad_norm": 1.1035881377949015, + "learning_rate": 7.211790782825945e-06, + "loss": 0.112, + "step": 3146 + }, + { + "epoch": 1.2571884984025559, + "grad_norm": 1.1454443734559139, + "learning_rate": 7.207621926877133e-06, + "loss": 0.107, + "step": 3148 + }, + { + "epoch": 1.2579872204472844, + "grad_norm": 1.0366769144913708, + "learning_rate": 7.203451163751268e-06, + "loss": 0.1025, + "step": 3150 + }, + { + "epoch": 1.2587859424920127, + "grad_norm": 1.1952040379853797, + "learning_rate": 7.199278497051498e-06, + "loss": 0.1259, + "step": 3152 + }, + { + "epoch": 1.2595846645367412, + "grad_norm": 1.1719876875826358, + "learning_rate": 7.195103930382609e-06, + "loss": 0.1102, + "step": 3154 + }, + { + "epoch": 1.2603833865814695, + "grad_norm": 1.2526525796976853, + "learning_rate": 7.190927467351037e-06, + "loss": 0.1219, + "step": 3156 + }, + { + "epoch": 1.261182108626198, + "grad_norm": 1.047570651725447, + "learning_rate": 7.186749111564852e-06, + "loss": 0.099, + "step": 3158 + }, + { + "epoch": 1.2619808306709266, + "grad_norm": 1.0251728815129595, + "learning_rate": 7.182568866633757e-06, + "loss": 0.1129, + "step": 3160 + }, + { + "epoch": 1.262779552715655, + "grad_norm": 1.0528626625673922, + "learning_rate": 7.178386736169087e-06, + "loss": 0.0972, + "step": 3162 + }, + { + "epoch": 1.2635782747603834, + "grad_norm": 1.1434985715089727, + "learning_rate": 7.174202723783815e-06, + "loss": 0.1108, + "step": 3164 + }, + { + "epoch": 1.2643769968051117, + "grad_norm": 1.053836775418805, + "learning_rate": 7.170016833092526e-06, + "loss": 0.111, + "step": 3166 + }, + { + "epoch": 1.2651757188498403, + "grad_norm": 1.1472854273249073, + "learning_rate": 7.16582906771144e-06, + "loss": 0.1101, + "step": 3168 + }, + { + "epoch": 1.2659744408945688, + "grad_norm": 1.019394721627375, + "learning_rate": 7.161639431258387e-06, + "loss": 0.1082, + "step": 3170 + }, + { + "epoch": 1.266773162939297, + "grad_norm": 1.1497720973804415, + "learning_rate": 7.157447927352821e-06, + "loss": 0.1038, + "step": 3172 + }, + { + "epoch": 1.2675718849840256, + "grad_norm": 0.931060363409149, + "learning_rate": 7.153254559615802e-06, + "loss": 0.0934, + "step": 3174 + }, + { + "epoch": 1.268370607028754, + "grad_norm": 1.1003773790175626, + "learning_rate": 7.149059331670009e-06, + "loss": 0.1116, + "step": 3176 + }, + { + "epoch": 1.2691693290734825, + "grad_norm": 0.9906365881640852, + "learning_rate": 7.144862247139716e-06, + "loss": 0.1048, + "step": 3178 + }, + { + "epoch": 1.269968051118211, + "grad_norm": 1.1404557162820228, + "learning_rate": 7.140663309650817e-06, + "loss": 0.1132, + "step": 3180 + }, + { + "epoch": 1.2707667731629393, + "grad_norm": 1.2239606071851836, + "learning_rate": 7.1364625228307915e-06, + "loss": 0.1124, + "step": 3182 + }, + { + "epoch": 1.2715654952076676, + "grad_norm": 1.1186497681482894, + "learning_rate": 7.132259890308726e-06, + "loss": 0.1072, + "step": 3184 + }, + { + "epoch": 1.2723642172523961, + "grad_norm": 1.1465574716494185, + "learning_rate": 7.128055415715295e-06, + "loss": 0.1034, + "step": 3186 + }, + { + "epoch": 1.2731629392971247, + "grad_norm": 1.1631071186812576, + "learning_rate": 7.123849102682772e-06, + "loss": 0.1104, + "step": 3188 + }, + { + "epoch": 1.273961661341853, + "grad_norm": 0.9901243870652381, + "learning_rate": 7.119640954845011e-06, + "loss": 0.0994, + "step": 3190 + }, + { + "epoch": 1.2747603833865815, + "grad_norm": 1.085814944090195, + "learning_rate": 7.115430975837457e-06, + "loss": 0.1105, + "step": 3192 + }, + { + "epoch": 1.2755591054313098, + "grad_norm": 1.2215853050594954, + "learning_rate": 7.111219169297134e-06, + "loss": 0.1054, + "step": 3194 + }, + { + "epoch": 1.2763578274760383, + "grad_norm": 1.062258920330732, + "learning_rate": 7.107005538862647e-06, + "loss": 0.1087, + "step": 3196 + }, + { + "epoch": 1.2771565495207668, + "grad_norm": 1.143873543200049, + "learning_rate": 7.102790088174172e-06, + "loss": 0.1098, + "step": 3198 + }, + { + "epoch": 1.2779552715654952, + "grad_norm": 0.8890153264482474, + "learning_rate": 7.098572820873461e-06, + "loss": 0.096, + "step": 3200 + }, + { + "epoch": 1.2787539936102237, + "grad_norm": 1.0763979750774153, + "learning_rate": 7.0943537406038385e-06, + "loss": 0.104, + "step": 3202 + }, + { + "epoch": 1.279552715654952, + "grad_norm": 1.005382468063005, + "learning_rate": 7.09013285101019e-06, + "loss": 0.0956, + "step": 3204 + }, + { + "epoch": 1.2803514376996805, + "grad_norm": 1.0144396802490765, + "learning_rate": 7.085910155738964e-06, + "loss": 0.0964, + "step": 3206 + }, + { + "epoch": 1.281150159744409, + "grad_norm": 1.1536744431519097, + "learning_rate": 7.081685658438173e-06, + "loss": 0.127, + "step": 3208 + }, + { + "epoch": 1.2819488817891374, + "grad_norm": 1.1286638022231128, + "learning_rate": 7.0774593627573815e-06, + "loss": 0.1083, + "step": 3210 + }, + { + "epoch": 1.2827476038338659, + "grad_norm": 1.0529407867194136, + "learning_rate": 7.073231272347714e-06, + "loss": 0.1067, + "step": 3212 + }, + { + "epoch": 1.2835463258785942, + "grad_norm": 1.0573840208402359, + "learning_rate": 7.069001390861838e-06, + "loss": 0.0897, + "step": 3214 + }, + { + "epoch": 1.2843450479233227, + "grad_norm": 1.1598664588481753, + "learning_rate": 7.064769721953975e-06, + "loss": 0.1103, + "step": 3216 + }, + { + "epoch": 1.2851437699680512, + "grad_norm": 1.2632526669395345, + "learning_rate": 7.060536269279887e-06, + "loss": 0.119, + "step": 3218 + }, + { + "epoch": 1.2859424920127795, + "grad_norm": 1.0466455925316702, + "learning_rate": 7.056301036496875e-06, + "loss": 0.0984, + "step": 3220 + }, + { + "epoch": 1.2867412140575079, + "grad_norm": 1.0228282332371574, + "learning_rate": 7.052064027263785e-06, + "loss": 0.0896, + "step": 3222 + }, + { + "epoch": 1.2875399361022364, + "grad_norm": 1.1687594953534266, + "learning_rate": 7.047825245240989e-06, + "loss": 0.1064, + "step": 3224 + }, + { + "epoch": 1.288338658146965, + "grad_norm": 1.1265735374171337, + "learning_rate": 7.0435846940903974e-06, + "loss": 0.1104, + "step": 3226 + }, + { + "epoch": 1.2891373801916932, + "grad_norm": 1.070111795962378, + "learning_rate": 7.039342377475444e-06, + "loss": 0.1072, + "step": 3228 + }, + { + "epoch": 1.2899361022364217, + "grad_norm": 1.2165890989586463, + "learning_rate": 7.035098299061094e-06, + "loss": 0.1192, + "step": 3230 + }, + { + "epoch": 1.29073482428115, + "grad_norm": 1.1082530581997079, + "learning_rate": 7.030852462513827e-06, + "loss": 0.1065, + "step": 3232 + }, + { + "epoch": 1.2915335463258786, + "grad_norm": 1.124650039405341, + "learning_rate": 7.026604871501647e-06, + "loss": 0.1045, + "step": 3234 + }, + { + "epoch": 1.292332268370607, + "grad_norm": 1.2186986975548504, + "learning_rate": 7.02235552969407e-06, + "loss": 0.1142, + "step": 3236 + }, + { + "epoch": 1.2931309904153354, + "grad_norm": 0.991464353367416, + "learning_rate": 7.018104440762128e-06, + "loss": 0.0919, + "step": 3238 + }, + { + "epoch": 1.293929712460064, + "grad_norm": 1.020705207223098, + "learning_rate": 7.013851608378359e-06, + "loss": 0.0978, + "step": 3240 + }, + { + "epoch": 1.2947284345047922, + "grad_norm": 1.0373213189747952, + "learning_rate": 7.009597036216813e-06, + "loss": 0.1088, + "step": 3242 + }, + { + "epoch": 1.2955271565495208, + "grad_norm": 0.9464480336921676, + "learning_rate": 7.005340727953035e-06, + "loss": 0.0953, + "step": 3244 + }, + { + "epoch": 1.2963258785942493, + "grad_norm": 1.0939229933892893, + "learning_rate": 7.001082687264075e-06, + "loss": 0.1079, + "step": 3246 + }, + { + "epoch": 1.2971246006389776, + "grad_norm": 1.2560358384718262, + "learning_rate": 6.9968229178284775e-06, + "loss": 0.1165, + "step": 3248 + }, + { + "epoch": 1.2979233226837061, + "grad_norm": 1.2448345478431648, + "learning_rate": 6.992561423326284e-06, + "loss": 0.1002, + "step": 3250 + }, + { + "epoch": 1.2987220447284344, + "grad_norm": 1.1712943475606261, + "learning_rate": 6.988298207439022e-06, + "loss": 0.0989, + "step": 3252 + }, + { + "epoch": 1.299520766773163, + "grad_norm": 1.1773991585755725, + "learning_rate": 6.9840332738497065e-06, + "loss": 0.1133, + "step": 3254 + }, + { + "epoch": 1.3003194888178915, + "grad_norm": 1.1116582831985906, + "learning_rate": 6.979766626242839e-06, + "loss": 0.1142, + "step": 3256 + }, + { + "epoch": 1.3011182108626198, + "grad_norm": 1.1586326246366871, + "learning_rate": 6.975498268304401e-06, + "loss": 0.1125, + "step": 3258 + }, + { + "epoch": 1.3019169329073481, + "grad_norm": 1.0830604429290178, + "learning_rate": 6.971228203721849e-06, + "loss": 0.1086, + "step": 3260 + }, + { + "epoch": 1.3027156549520766, + "grad_norm": 1.1206997334831217, + "learning_rate": 6.96695643618412e-06, + "loss": 0.1124, + "step": 3262 + }, + { + "epoch": 1.3035143769968052, + "grad_norm": 0.967514162516545, + "learning_rate": 6.9626829693816135e-06, + "loss": 0.1125, + "step": 3264 + }, + { + "epoch": 1.3043130990415335, + "grad_norm": 0.97337749177029, + "learning_rate": 6.958407807006205e-06, + "loss": 0.1037, + "step": 3266 + }, + { + "epoch": 1.305111821086262, + "grad_norm": 1.1764350961941878, + "learning_rate": 6.954130952751228e-06, + "loss": 0.1131, + "step": 3268 + }, + { + "epoch": 1.3059105431309903, + "grad_norm": 0.9892258025188561, + "learning_rate": 6.949852410311484e-06, + "loss": 0.1034, + "step": 3270 + }, + { + "epoch": 1.3067092651757188, + "grad_norm": 1.0360171336900805, + "learning_rate": 6.945572183383229e-06, + "loss": 0.1067, + "step": 3272 + }, + { + "epoch": 1.3075079872204474, + "grad_norm": 1.2410869037730936, + "learning_rate": 6.941290275664175e-06, + "loss": 0.1053, + "step": 3274 + }, + { + "epoch": 1.3083067092651757, + "grad_norm": 1.0698597118994493, + "learning_rate": 6.9370066908534875e-06, + "loss": 0.1002, + "step": 3276 + }, + { + "epoch": 1.3091054313099042, + "grad_norm": 1.0720761674526624, + "learning_rate": 6.932721432651779e-06, + "loss": 0.1093, + "step": 3278 + }, + { + "epoch": 1.3099041533546325, + "grad_norm": 1.1243594373271324, + "learning_rate": 6.928434504761106e-06, + "loss": 0.1219, + "step": 3280 + }, + { + "epoch": 1.310702875399361, + "grad_norm": 1.1859750342924074, + "learning_rate": 6.924145910884972e-06, + "loss": 0.109, + "step": 3282 + }, + { + "epoch": 1.3115015974440896, + "grad_norm": 1.0930001670477891, + "learning_rate": 6.919855654728317e-06, + "loss": 0.108, + "step": 3284 + }, + { + "epoch": 1.3123003194888179, + "grad_norm": 0.9985564810729535, + "learning_rate": 6.9155637399975196e-06, + "loss": 0.0973, + "step": 3286 + }, + { + "epoch": 1.3130990415335464, + "grad_norm": 1.056704078140189, + "learning_rate": 6.911270170400385e-06, + "loss": 0.0985, + "step": 3288 + }, + { + "epoch": 1.3138977635782747, + "grad_norm": 1.2319666270989271, + "learning_rate": 6.9069749496461555e-06, + "loss": 0.112, + "step": 3290 + }, + { + "epoch": 1.3146964856230032, + "grad_norm": 1.1376694239915568, + "learning_rate": 6.902678081445495e-06, + "loss": 0.1048, + "step": 3292 + }, + { + "epoch": 1.3154952076677318, + "grad_norm": 1.169749984507156, + "learning_rate": 6.898379569510491e-06, + "loss": 0.1021, + "step": 3294 + }, + { + "epoch": 1.31629392971246, + "grad_norm": 1.223585035436731, + "learning_rate": 6.894079417554657e-06, + "loss": 0.1194, + "step": 3296 + }, + { + "epoch": 1.3170926517571884, + "grad_norm": 1.19066097682282, + "learning_rate": 6.889777629292914e-06, + "loss": 0.1184, + "step": 3298 + }, + { + "epoch": 1.317891373801917, + "grad_norm": 1.0965120097047536, + "learning_rate": 6.885474208441602e-06, + "loss": 0.0979, + "step": 3300 + }, + { + "epoch": 1.3186900958466454, + "grad_norm": 1.0211040534374387, + "learning_rate": 6.881169158718474e-06, + "loss": 0.1081, + "step": 3302 + }, + { + "epoch": 1.3194888178913737, + "grad_norm": 1.017263749768144, + "learning_rate": 6.8768624838426815e-06, + "loss": 0.0988, + "step": 3304 + }, + { + "epoch": 1.3202875399361023, + "grad_norm": 1.0524568085425485, + "learning_rate": 6.872554187534788e-06, + "loss": 0.1032, + "step": 3306 + }, + { + "epoch": 1.3210862619808306, + "grad_norm": 1.0173821412579143, + "learning_rate": 6.868244273516755e-06, + "loss": 0.1045, + "step": 3308 + }, + { + "epoch": 1.321884984025559, + "grad_norm": 1.092162092574483, + "learning_rate": 6.863932745511942e-06, + "loss": 0.1127, + "step": 3310 + }, + { + "epoch": 1.3226837060702876, + "grad_norm": 1.1738686449026956, + "learning_rate": 6.859619607245102e-06, + "loss": 0.1048, + "step": 3312 + }, + { + "epoch": 1.323482428115016, + "grad_norm": 1.1698036650478054, + "learning_rate": 6.855304862442379e-06, + "loss": 0.109, + "step": 3314 + }, + { + "epoch": 1.3242811501597445, + "grad_norm": 1.0611253385279913, + "learning_rate": 6.850988514831304e-06, + "loss": 0.1036, + "step": 3316 + }, + { + "epoch": 1.3250798722044728, + "grad_norm": 1.180905968110651, + "learning_rate": 6.846670568140797e-06, + "loss": 0.1082, + "step": 3318 + }, + { + "epoch": 1.3258785942492013, + "grad_norm": 1.054233295900519, + "learning_rate": 6.842351026101155e-06, + "loss": 0.1059, + "step": 3320 + }, + { + "epoch": 1.3266773162939298, + "grad_norm": 1.080658997418189, + "learning_rate": 6.838029892444056e-06, + "loss": 0.1072, + "step": 3322 + }, + { + "epoch": 1.3274760383386581, + "grad_norm": 1.0388885899845177, + "learning_rate": 6.833707170902551e-06, + "loss": 0.1042, + "step": 3324 + }, + { + "epoch": 1.3282747603833867, + "grad_norm": 1.0681215628181655, + "learning_rate": 6.829382865211063e-06, + "loss": 0.1012, + "step": 3326 + }, + { + "epoch": 1.329073482428115, + "grad_norm": 0.9880264033407431, + "learning_rate": 6.825056979105382e-06, + "loss": 0.1094, + "step": 3328 + }, + { + "epoch": 1.3298722044728435, + "grad_norm": 1.0627754491783672, + "learning_rate": 6.820729516322671e-06, + "loss": 0.1045, + "step": 3330 + }, + { + "epoch": 1.330670926517572, + "grad_norm": 0.9621593864476723, + "learning_rate": 6.816400480601445e-06, + "loss": 0.0988, + "step": 3332 + }, + { + "epoch": 1.3314696485623003, + "grad_norm": 1.0320888666137535, + "learning_rate": 6.812069875681585e-06, + "loss": 0.0948, + "step": 3334 + }, + { + "epoch": 1.3322683706070286, + "grad_norm": 1.0860752731846834, + "learning_rate": 6.807737705304324e-06, + "loss": 0.0979, + "step": 3336 + }, + { + "epoch": 1.3330670926517572, + "grad_norm": 1.1173806530253718, + "learning_rate": 6.803403973212247e-06, + "loss": 0.1084, + "step": 3338 + }, + { + "epoch": 1.3338658146964857, + "grad_norm": 1.211178041971597, + "learning_rate": 6.799068683149291e-06, + "loss": 0.1026, + "step": 3340 + }, + { + "epoch": 1.334664536741214, + "grad_norm": 1.324084252679483, + "learning_rate": 6.79473183886074e-06, + "loss": 0.106, + "step": 3342 + }, + { + "epoch": 1.3354632587859425, + "grad_norm": 1.190661411969892, + "learning_rate": 6.790393444093214e-06, + "loss": 0.1103, + "step": 3344 + }, + { + "epoch": 1.3362619808306708, + "grad_norm": 1.0769935421724353, + "learning_rate": 6.786053502594679e-06, + "loss": 0.0949, + "step": 3346 + }, + { + "epoch": 1.3370607028753994, + "grad_norm": 1.0884920315893003, + "learning_rate": 6.781712018114435e-06, + "loss": 0.1041, + "step": 3348 + }, + { + "epoch": 1.3378594249201279, + "grad_norm": 1.451297567535448, + "learning_rate": 6.777368994403113e-06, + "loss": 0.1101, + "step": 3350 + }, + { + "epoch": 1.3386581469648562, + "grad_norm": 1.0225728615227607, + "learning_rate": 6.773024435212678e-06, + "loss": 0.0971, + "step": 3352 + }, + { + "epoch": 1.3394568690095847, + "grad_norm": 1.0713734854451504, + "learning_rate": 6.7686783442964195e-06, + "loss": 0.1011, + "step": 3354 + }, + { + "epoch": 1.340255591054313, + "grad_norm": 1.0477177218409457, + "learning_rate": 6.7643307254089485e-06, + "loss": 0.107, + "step": 3356 + }, + { + "epoch": 1.3410543130990416, + "grad_norm": 1.0932469970665097, + "learning_rate": 6.759981582306197e-06, + "loss": 0.1081, + "step": 3358 + }, + { + "epoch": 1.34185303514377, + "grad_norm": 1.1498681000315574, + "learning_rate": 6.7556309187454185e-06, + "loss": 0.1036, + "step": 3360 + }, + { + "epoch": 1.3426517571884984, + "grad_norm": 1.1306486794158652, + "learning_rate": 6.751278738485169e-06, + "loss": 0.117, + "step": 3362 + }, + { + "epoch": 1.343450479233227, + "grad_norm": 1.0798075050613067, + "learning_rate": 6.746925045285327e-06, + "loss": 0.1075, + "step": 3364 + }, + { + "epoch": 1.3442492012779552, + "grad_norm": 1.1185887548421796, + "learning_rate": 6.742569842907071e-06, + "loss": 0.1177, + "step": 3366 + }, + { + "epoch": 1.3450479233226837, + "grad_norm": 1.185572779424059, + "learning_rate": 6.738213135112884e-06, + "loss": 0.0999, + "step": 3368 + }, + { + "epoch": 1.3458466453674123, + "grad_norm": 1.113143717088373, + "learning_rate": 6.733854925666552e-06, + "loss": 0.111, + "step": 3370 + }, + { + "epoch": 1.3466453674121406, + "grad_norm": 1.0888946989797392, + "learning_rate": 6.729495218333157e-06, + "loss": 0.0965, + "step": 3372 + }, + { + "epoch": 1.3474440894568689, + "grad_norm": 1.0800532165447028, + "learning_rate": 6.725134016879071e-06, + "loss": 0.1294, + "step": 3374 + }, + { + "epoch": 1.3482428115015974, + "grad_norm": 1.0293169314360555, + "learning_rate": 6.720771325071965e-06, + "loss": 0.1005, + "step": 3376 + }, + { + "epoch": 1.349041533546326, + "grad_norm": 1.1266946956354467, + "learning_rate": 6.716407146680793e-06, + "loss": 0.1034, + "step": 3378 + }, + { + "epoch": 1.3498402555910542, + "grad_norm": 1.0175557765865235, + "learning_rate": 6.71204148547579e-06, + "loss": 0.1032, + "step": 3380 + }, + { + "epoch": 1.3506389776357828, + "grad_norm": 1.129124738760774, + "learning_rate": 6.7076743452284776e-06, + "loss": 0.1155, + "step": 3382 + }, + { + "epoch": 1.351437699680511, + "grad_norm": 1.1223728907114148, + "learning_rate": 6.703305729711653e-06, + "loss": 0.1065, + "step": 3384 + }, + { + "epoch": 1.3522364217252396, + "grad_norm": 1.0545042346249254, + "learning_rate": 6.698935642699386e-06, + "loss": 0.0987, + "step": 3386 + }, + { + "epoch": 1.3530351437699681, + "grad_norm": 1.0675803055716808, + "learning_rate": 6.694564087967023e-06, + "loss": 0.1139, + "step": 3388 + }, + { + "epoch": 1.3538338658146964, + "grad_norm": 1.173914284307619, + "learning_rate": 6.6901910692911706e-06, + "loss": 0.1187, + "step": 3390 + }, + { + "epoch": 1.354632587859425, + "grad_norm": 1.0709511536838934, + "learning_rate": 6.685816590449708e-06, + "loss": 0.1068, + "step": 3392 + }, + { + "epoch": 1.3554313099041533, + "grad_norm": 0.9466690009096309, + "learning_rate": 6.68144065522177e-06, + "loss": 0.0983, + "step": 3394 + }, + { + "epoch": 1.3562300319488818, + "grad_norm": 1.114009958699587, + "learning_rate": 6.677063267387754e-06, + "loss": 0.1099, + "step": 3396 + }, + { + "epoch": 1.3570287539936103, + "grad_norm": 1.1077866780843397, + "learning_rate": 6.672684430729305e-06, + "loss": 0.1036, + "step": 3398 + }, + { + "epoch": 1.3578274760383386, + "grad_norm": 1.0412230391242072, + "learning_rate": 6.668304149029331e-06, + "loss": 0.1001, + "step": 3400 + }, + { + "epoch": 1.3586261980830672, + "grad_norm": 1.1010636435498742, + "learning_rate": 6.663922426071978e-06, + "loss": 0.1045, + "step": 3402 + }, + { + "epoch": 1.3594249201277955, + "grad_norm": 1.1984509482714603, + "learning_rate": 6.659539265642643e-06, + "loss": 0.106, + "step": 3404 + }, + { + "epoch": 1.360223642172524, + "grad_norm": 0.9863537625149396, + "learning_rate": 6.655154671527962e-06, + "loss": 0.1033, + "step": 3406 + }, + { + "epoch": 1.3610223642172525, + "grad_norm": 1.0898694676194725, + "learning_rate": 6.650768647515813e-06, + "loss": 0.1068, + "step": 3408 + }, + { + "epoch": 1.3618210862619808, + "grad_norm": 1.0711686739516757, + "learning_rate": 6.646381197395302e-06, + "loss": 0.1094, + "step": 3410 + }, + { + "epoch": 1.3626198083067091, + "grad_norm": 0.9804227369687718, + "learning_rate": 6.641992324956776e-06, + "loss": 0.1105, + "step": 3412 + }, + { + "epoch": 1.3634185303514377, + "grad_norm": 1.1535558172179992, + "learning_rate": 6.637602033991807e-06, + "loss": 0.1053, + "step": 3414 + }, + { + "epoch": 1.3642172523961662, + "grad_norm": 1.0241900868101663, + "learning_rate": 6.63321032829319e-06, + "loss": 0.1094, + "step": 3416 + }, + { + "epoch": 1.3650159744408945, + "grad_norm": 1.1116602107856417, + "learning_rate": 6.628817211654945e-06, + "loss": 0.1079, + "step": 3418 + }, + { + "epoch": 1.365814696485623, + "grad_norm": 1.099401650328653, + "learning_rate": 6.624422687872312e-06, + "loss": 0.1147, + "step": 3420 + }, + { + "epoch": 1.3666134185303513, + "grad_norm": 1.1672020742722258, + "learning_rate": 6.6200267607417415e-06, + "loss": 0.1112, + "step": 3422 + }, + { + "epoch": 1.3674121405750799, + "grad_norm": 1.0307138522820616, + "learning_rate": 6.615629434060903e-06, + "loss": 0.116, + "step": 3424 + }, + { + "epoch": 1.3682108626198084, + "grad_norm": 1.2489824104367433, + "learning_rate": 6.611230711628669e-06, + "loss": 0.0957, + "step": 3426 + }, + { + "epoch": 1.3690095846645367, + "grad_norm": 0.9719925332909197, + "learning_rate": 6.6068305972451245e-06, + "loss": 0.106, + "step": 3428 + }, + { + "epoch": 1.3698083067092652, + "grad_norm": 1.058537905484153, + "learning_rate": 6.602429094711549e-06, + "loss": 0.1027, + "step": 3430 + }, + { + "epoch": 1.3706070287539935, + "grad_norm": 1.0235718542609205, + "learning_rate": 6.598026207830428e-06, + "loss": 0.0964, + "step": 3432 + }, + { + "epoch": 1.371405750798722, + "grad_norm": 1.0786959847501962, + "learning_rate": 6.593621940405439e-06, + "loss": 0.1014, + "step": 3434 + }, + { + "epoch": 1.3722044728434506, + "grad_norm": 1.137212886630325, + "learning_rate": 6.589216296241455e-06, + "loss": 0.1168, + "step": 3436 + }, + { + "epoch": 1.373003194888179, + "grad_norm": 1.0030513413696003, + "learning_rate": 6.584809279144535e-06, + "loss": 0.0999, + "step": 3438 + }, + { + "epoch": 1.3738019169329074, + "grad_norm": 1.1517827771669715, + "learning_rate": 6.5804008929219284e-06, + "loss": 0.1177, + "step": 3440 + }, + { + "epoch": 1.3746006389776357, + "grad_norm": 1.1512991234134653, + "learning_rate": 6.575991141382063e-06, + "loss": 0.0966, + "step": 3442 + }, + { + "epoch": 1.3753993610223643, + "grad_norm": 1.1000379035510455, + "learning_rate": 6.571580028334547e-06, + "loss": 0.1053, + "step": 3444 + }, + { + "epoch": 1.3761980830670926, + "grad_norm": 1.0218098747432192, + "learning_rate": 6.56716755759017e-06, + "loss": 0.0912, + "step": 3446 + }, + { + "epoch": 1.376996805111821, + "grad_norm": 1.1551745154641282, + "learning_rate": 6.562753732960887e-06, + "loss": 0.1119, + "step": 3448 + }, + { + "epoch": 1.3777955271565494, + "grad_norm": 1.2194490422535142, + "learning_rate": 6.5583385582598255e-06, + "loss": 0.099, + "step": 3450 + }, + { + "epoch": 1.378594249201278, + "grad_norm": 1.4172435672608823, + "learning_rate": 6.553922037301283e-06, + "loss": 0.0978, + "step": 3452 + }, + { + "epoch": 1.3793929712460065, + "grad_norm": 1.0695393339059773, + "learning_rate": 6.549504173900715e-06, + "loss": 0.0917, + "step": 3454 + }, + { + "epoch": 1.3801916932907348, + "grad_norm": 1.196402708368458, + "learning_rate": 6.545084971874738e-06, + "loss": 0.1113, + "step": 3456 + }, + { + "epoch": 1.3809904153354633, + "grad_norm": 1.0433584799967095, + "learning_rate": 6.540664435041127e-06, + "loss": 0.104, + "step": 3458 + }, + { + "epoch": 1.3817891373801916, + "grad_norm": 1.149354927046393, + "learning_rate": 6.536242567218808e-06, + "loss": 0.1031, + "step": 3460 + }, + { + "epoch": 1.3825878594249201, + "grad_norm": 1.1148556785558263, + "learning_rate": 6.531819372227856e-06, + "loss": 0.1032, + "step": 3462 + }, + { + "epoch": 1.3833865814696487, + "grad_norm": 1.071711948179689, + "learning_rate": 6.527394853889499e-06, + "loss": 0.1085, + "step": 3464 + }, + { + "epoch": 1.384185303514377, + "grad_norm": 1.082175681158764, + "learning_rate": 6.522969016026099e-06, + "loss": 0.11, + "step": 3466 + }, + { + "epoch": 1.3849840255591055, + "grad_norm": 1.1369109201663514, + "learning_rate": 6.518541862461163e-06, + "loss": 0.1169, + "step": 3468 + }, + { + "epoch": 1.3857827476038338, + "grad_norm": 1.2011247923954287, + "learning_rate": 6.514113397019335e-06, + "loss": 0.1081, + "step": 3470 + }, + { + "epoch": 1.3865814696485623, + "grad_norm": 1.1792397645094768, + "learning_rate": 6.5096836235263904e-06, + "loss": 0.1202, + "step": 3472 + }, + { + "epoch": 1.3873801916932909, + "grad_norm": 1.0727123114625765, + "learning_rate": 6.505252545809238e-06, + "loss": 0.0962, + "step": 3474 + }, + { + "epoch": 1.3881789137380192, + "grad_norm": 1.0276014422922428, + "learning_rate": 6.500820167695906e-06, + "loss": 0.0996, + "step": 3476 + }, + { + "epoch": 1.3889776357827475, + "grad_norm": 0.9724065635327491, + "learning_rate": 6.496386493015554e-06, + "loss": 0.0987, + "step": 3478 + }, + { + "epoch": 1.389776357827476, + "grad_norm": 1.1292315012262628, + "learning_rate": 6.491951525598461e-06, + "loss": 0.0999, + "step": 3480 + }, + { + "epoch": 1.3905750798722045, + "grad_norm": 1.1716250389490488, + "learning_rate": 6.487515269276015e-06, + "loss": 0.1015, + "step": 3482 + }, + { + "epoch": 1.3913738019169328, + "grad_norm": 1.036997684705096, + "learning_rate": 6.483077727880726e-06, + "loss": 0.0863, + "step": 3484 + }, + { + "epoch": 1.3921725239616614, + "grad_norm": 1.199757124869139, + "learning_rate": 6.478638905246213e-06, + "loss": 0.1116, + "step": 3486 + }, + { + "epoch": 1.3929712460063897, + "grad_norm": 1.142726930599777, + "learning_rate": 6.4741988052071965e-06, + "loss": 0.1098, + "step": 3488 + }, + { + "epoch": 1.3937699680511182, + "grad_norm": 0.9651746600463053, + "learning_rate": 6.469757431599503e-06, + "loss": 0.1038, + "step": 3490 + }, + { + "epoch": 1.3945686900958467, + "grad_norm": 1.1474675700025958, + "learning_rate": 6.465314788260067e-06, + "loss": 0.1154, + "step": 3492 + }, + { + "epoch": 1.395367412140575, + "grad_norm": 1.0346444553697798, + "learning_rate": 6.460870879026906e-06, + "loss": 0.0961, + "step": 3494 + }, + { + "epoch": 1.3961661341853036, + "grad_norm": 1.032379620946616, + "learning_rate": 6.45642570773914e-06, + "loss": 0.0997, + "step": 3496 + }, + { + "epoch": 1.3969648562300319, + "grad_norm": 1.016064340983906, + "learning_rate": 6.451979278236979e-06, + "loss": 0.0931, + "step": 3498 + }, + { + "epoch": 1.3977635782747604, + "grad_norm": 1.178554041440438, + "learning_rate": 6.447531594361719e-06, + "loss": 0.1153, + "step": 3500 + }, + { + "epoch": 1.3977635782747604, + "eval_loss": 0.1600114107131958, + "eval_runtime": 418.1465, + "eval_samples_per_second": 42.586, + "eval_steps_per_second": 5.323, + "step": 3500 + }, + { + "epoch": 1.398562300319489, + "grad_norm": 1.1584355046290076, + "learning_rate": 6.443082659955737e-06, + "loss": 0.1037, + "step": 3502 + }, + { + "epoch": 1.3993610223642172, + "grad_norm": 1.0340861375288486, + "learning_rate": 6.438632478862495e-06, + "loss": 0.1011, + "step": 3504 + }, + { + "epoch": 1.4001597444089458, + "grad_norm": 1.1875264702825228, + "learning_rate": 6.434181054926528e-06, + "loss": 0.1125, + "step": 3506 + }, + { + "epoch": 1.400958466453674, + "grad_norm": 1.0588069817372803, + "learning_rate": 6.429728391993446e-06, + "loss": 0.107, + "step": 3508 + }, + { + "epoch": 1.4017571884984026, + "grad_norm": 1.1098486981661322, + "learning_rate": 6.425274493909932e-06, + "loss": 0.1132, + "step": 3510 + }, + { + "epoch": 1.4025559105431311, + "grad_norm": 0.9823134309813667, + "learning_rate": 6.4208193645237314e-06, + "loss": 0.0941, + "step": 3512 + }, + { + "epoch": 1.4033546325878594, + "grad_norm": 1.0327788368934439, + "learning_rate": 6.416363007683656e-06, + "loss": 0.1105, + "step": 3514 + }, + { + "epoch": 1.4041533546325877, + "grad_norm": 1.1821001072193735, + "learning_rate": 6.411905427239577e-06, + "loss": 0.1084, + "step": 3516 + }, + { + "epoch": 1.4049520766773163, + "grad_norm": 1.1379784770115198, + "learning_rate": 6.407446627042426e-06, + "loss": 0.1135, + "step": 3518 + }, + { + "epoch": 1.4057507987220448, + "grad_norm": 1.1147819119237963, + "learning_rate": 6.402986610944183e-06, + "loss": 0.1076, + "step": 3520 + }, + { + "epoch": 1.406549520766773, + "grad_norm": 1.1200339092988951, + "learning_rate": 6.398525382797884e-06, + "loss": 0.1063, + "step": 3522 + }, + { + "epoch": 1.4073482428115016, + "grad_norm": 1.1312798096131238, + "learning_rate": 6.394062946457604e-06, + "loss": 0.1002, + "step": 3524 + }, + { + "epoch": 1.40814696485623, + "grad_norm": 1.199479655032073, + "learning_rate": 6.389599305778471e-06, + "loss": 0.108, + "step": 3526 + }, + { + "epoch": 1.4089456869009584, + "grad_norm": 1.218231109858983, + "learning_rate": 6.385134464616649e-06, + "loss": 0.1069, + "step": 3528 + }, + { + "epoch": 1.409744408945687, + "grad_norm": 1.1995058184118474, + "learning_rate": 6.38066842682934e-06, + "loss": 0.1103, + "step": 3530 + }, + { + "epoch": 1.4105431309904153, + "grad_norm": 1.1046534115214985, + "learning_rate": 6.376201196274778e-06, + "loss": 0.1142, + "step": 3532 + }, + { + "epoch": 1.4113418530351438, + "grad_norm": 1.0585983588946046, + "learning_rate": 6.37173277681223e-06, + "loss": 0.1114, + "step": 3534 + }, + { + "epoch": 1.4121405750798721, + "grad_norm": 1.1029224104867834, + "learning_rate": 6.367263172301985e-06, + "loss": 0.1113, + "step": 3536 + }, + { + "epoch": 1.4129392971246006, + "grad_norm": 1.1060776788940239, + "learning_rate": 6.3627923866053656e-06, + "loss": 0.0969, + "step": 3538 + }, + { + "epoch": 1.4137380191693292, + "grad_norm": 1.1032132052448507, + "learning_rate": 6.358320423584704e-06, + "loss": 0.0979, + "step": 3540 + }, + { + "epoch": 1.4145367412140575, + "grad_norm": 1.029503416268003, + "learning_rate": 6.353847287103356e-06, + "loss": 0.0953, + "step": 3542 + }, + { + "epoch": 1.415335463258786, + "grad_norm": 1.1352827365353346, + "learning_rate": 6.3493729810256895e-06, + "loss": 0.1118, + "step": 3544 + }, + { + "epoch": 1.4161341853035143, + "grad_norm": 1.119812994569396, + "learning_rate": 6.344897509217085e-06, + "loss": 0.1114, + "step": 3546 + }, + { + "epoch": 1.4169329073482428, + "grad_norm": 1.076940990174831, + "learning_rate": 6.340420875543922e-06, + "loss": 0.107, + "step": 3548 + }, + { + "epoch": 1.4177316293929714, + "grad_norm": 1.1517799889594393, + "learning_rate": 6.335943083873596e-06, + "loss": 0.0933, + "step": 3550 + }, + { + "epoch": 1.4185303514376997, + "grad_norm": 1.1617154802397773, + "learning_rate": 6.331464138074493e-06, + "loss": 0.105, + "step": 3552 + }, + { + "epoch": 1.419329073482428, + "grad_norm": 0.9494287593142456, + "learning_rate": 6.326984042016e-06, + "loss": 0.0921, + "step": 3554 + }, + { + "epoch": 1.4201277955271565, + "grad_norm": 1.1141280255887196, + "learning_rate": 6.322502799568498e-06, + "loss": 0.1061, + "step": 3556 + }, + { + "epoch": 1.420926517571885, + "grad_norm": 1.0414217088512951, + "learning_rate": 6.3180204146033586e-06, + "loss": 0.1029, + "step": 3558 + }, + { + "epoch": 1.4217252396166133, + "grad_norm": 1.0724759251125648, + "learning_rate": 6.313536890992935e-06, + "loss": 0.1053, + "step": 3560 + }, + { + "epoch": 1.4225239616613419, + "grad_norm": 1.0754047728283922, + "learning_rate": 6.309052232610574e-06, + "loss": 0.1096, + "step": 3562 + }, + { + "epoch": 1.4233226837060702, + "grad_norm": 1.1017909660001841, + "learning_rate": 6.3045664433305945e-06, + "loss": 0.1092, + "step": 3564 + }, + { + "epoch": 1.4241214057507987, + "grad_norm": 1.4387268439796592, + "learning_rate": 6.300079527028297e-06, + "loss": 0.1133, + "step": 3566 + }, + { + "epoch": 1.4249201277955272, + "grad_norm": 1.1303501334206185, + "learning_rate": 6.29559148757995e-06, + "loss": 0.1065, + "step": 3568 + }, + { + "epoch": 1.4257188498402555, + "grad_norm": 0.9640525041341275, + "learning_rate": 6.291102328862801e-06, + "loss": 0.0988, + "step": 3570 + }, + { + "epoch": 1.426517571884984, + "grad_norm": 1.0585869277703588, + "learning_rate": 6.286612054755056e-06, + "loss": 0.1022, + "step": 3572 + }, + { + "epoch": 1.4273162939297124, + "grad_norm": 1.0056038546914692, + "learning_rate": 6.282120669135892e-06, + "loss": 0.099, + "step": 3574 + }, + { + "epoch": 1.428115015974441, + "grad_norm": 1.1206641244335471, + "learning_rate": 6.277628175885437e-06, + "loss": 0.1167, + "step": 3576 + }, + { + "epoch": 1.4289137380191694, + "grad_norm": 1.1096458600950112, + "learning_rate": 6.273134578884785e-06, + "loss": 0.1058, + "step": 3578 + }, + { + "epoch": 1.4297124600638977, + "grad_norm": 0.9857461120602693, + "learning_rate": 6.2686398820159785e-06, + "loss": 0.0947, + "step": 3580 + }, + { + "epoch": 1.4305111821086263, + "grad_norm": 1.0203378985341014, + "learning_rate": 6.2641440891620146e-06, + "loss": 0.0914, + "step": 3582 + }, + { + "epoch": 1.4313099041533546, + "grad_norm": 1.0724408105973864, + "learning_rate": 6.2596472042068275e-06, + "loss": 0.0934, + "step": 3584 + }, + { + "epoch": 1.432108626198083, + "grad_norm": 1.2414438842066344, + "learning_rate": 6.2551492310353094e-06, + "loss": 0.1149, + "step": 3586 + }, + { + "epoch": 1.4329073482428116, + "grad_norm": 1.101684816522059, + "learning_rate": 6.250650173533279e-06, + "loss": 0.1142, + "step": 3588 + }, + { + "epoch": 1.43370607028754, + "grad_norm": 1.1753348478737677, + "learning_rate": 6.2461500355875e-06, + "loss": 0.116, + "step": 3590 + }, + { + "epoch": 1.4345047923322682, + "grad_norm": 1.1874416062991624, + "learning_rate": 6.241648821085666e-06, + "loss": 0.1073, + "step": 3592 + }, + { + "epoch": 1.4353035143769968, + "grad_norm": 1.0601160321918857, + "learning_rate": 6.237146533916402e-06, + "loss": 0.1013, + "step": 3594 + }, + { + "epoch": 1.4361022364217253, + "grad_norm": 1.108271170806747, + "learning_rate": 6.232643177969259e-06, + "loss": 0.0952, + "step": 3596 + }, + { + "epoch": 1.4369009584664536, + "grad_norm": 1.030322538054295, + "learning_rate": 6.2281387571347126e-06, + "loss": 0.1006, + "step": 3598 + }, + { + "epoch": 1.4376996805111821, + "grad_norm": 1.1028346810795997, + "learning_rate": 6.223633275304157e-06, + "loss": 0.111, + "step": 3600 + }, + { + "epoch": 1.4384984025559104, + "grad_norm": 1.0355367948281153, + "learning_rate": 6.2191267363699026e-06, + "loss": 0.1017, + "step": 3602 + }, + { + "epoch": 1.439297124600639, + "grad_norm": 1.2163948163518867, + "learning_rate": 6.214619144225176e-06, + "loss": 0.1077, + "step": 3604 + }, + { + "epoch": 1.4400958466453675, + "grad_norm": 1.1518703062495919, + "learning_rate": 6.210110502764107e-06, + "loss": 0.1155, + "step": 3606 + }, + { + "epoch": 1.4408945686900958, + "grad_norm": 1.1725592196005963, + "learning_rate": 6.205600815881741e-06, + "loss": 0.0979, + "step": 3608 + }, + { + "epoch": 1.4416932907348243, + "grad_norm": 1.0475879501811074, + "learning_rate": 6.2010900874740225e-06, + "loss": 0.106, + "step": 3610 + }, + { + "epoch": 1.4424920127795526, + "grad_norm": 1.07813779648937, + "learning_rate": 6.1965783214377895e-06, + "loss": 0.1022, + "step": 3612 + }, + { + "epoch": 1.4432907348242812, + "grad_norm": 1.0886593706957781, + "learning_rate": 6.192065521670787e-06, + "loss": 0.1051, + "step": 3614 + }, + { + "epoch": 1.4440894568690097, + "grad_norm": 1.1378082559578835, + "learning_rate": 6.187551692071648e-06, + "loss": 0.1084, + "step": 3616 + }, + { + "epoch": 1.444888178913738, + "grad_norm": 1.2011389042931888, + "learning_rate": 6.183036836539893e-06, + "loss": 0.1092, + "step": 3618 + }, + { + "epoch": 1.4456869009584665, + "grad_norm": 1.1251988986799606, + "learning_rate": 6.178520958975933e-06, + "loss": 0.1068, + "step": 3620 + }, + { + "epoch": 1.4464856230031948, + "grad_norm": 1.0150796615451205, + "learning_rate": 6.17400406328106e-06, + "loss": 0.1075, + "step": 3622 + }, + { + "epoch": 1.4472843450479234, + "grad_norm": 1.0885442213663745, + "learning_rate": 6.1694861533574445e-06, + "loss": 0.1127, + "step": 3624 + }, + { + "epoch": 1.4480830670926519, + "grad_norm": 1.064305780213124, + "learning_rate": 6.164967233108137e-06, + "loss": 0.0966, + "step": 3626 + }, + { + "epoch": 1.4488817891373802, + "grad_norm": 1.0844139564754607, + "learning_rate": 6.160447306437055e-06, + "loss": 0.1076, + "step": 3628 + }, + { + "epoch": 1.4496805111821085, + "grad_norm": 1.034026504245696, + "learning_rate": 6.1559263772489905e-06, + "loss": 0.096, + "step": 3630 + }, + { + "epoch": 1.450479233226837, + "grad_norm": 1.001104101700096, + "learning_rate": 6.1514044494496e-06, + "loss": 0.0994, + "step": 3632 + }, + { + "epoch": 1.4512779552715656, + "grad_norm": 1.1773822780221779, + "learning_rate": 6.146881526945401e-06, + "loss": 0.1076, + "step": 3634 + }, + { + "epoch": 1.4520766773162939, + "grad_norm": 1.0588404307223935, + "learning_rate": 6.142357613643773e-06, + "loss": 0.0929, + "step": 3636 + }, + { + "epoch": 1.4528753993610224, + "grad_norm": 1.2341260840634936, + "learning_rate": 6.13783271345295e-06, + "loss": 0.1156, + "step": 3638 + }, + { + "epoch": 1.4536741214057507, + "grad_norm": 1.2029429887737177, + "learning_rate": 6.133306830282021e-06, + "loss": 0.1131, + "step": 3640 + }, + { + "epoch": 1.4544728434504792, + "grad_norm": 0.9829158742595866, + "learning_rate": 6.128779968040917e-06, + "loss": 0.0977, + "step": 3642 + }, + { + "epoch": 1.4552715654952078, + "grad_norm": 1.0321776789284078, + "learning_rate": 6.1242521306404236e-06, + "loss": 0.0928, + "step": 3644 + }, + { + "epoch": 1.456070287539936, + "grad_norm": 1.0207766757489227, + "learning_rate": 6.119723321992164e-06, + "loss": 0.0956, + "step": 3646 + }, + { + "epoch": 1.4568690095846646, + "grad_norm": 1.2565358842354744, + "learning_rate": 6.115193546008602e-06, + "loss": 0.1075, + "step": 3648 + }, + { + "epoch": 1.457667731629393, + "grad_norm": 1.0795577230748994, + "learning_rate": 6.110662806603036e-06, + "loss": 0.1073, + "step": 3650 + }, + { + "epoch": 1.4584664536741214, + "grad_norm": 1.1116179183482724, + "learning_rate": 6.106131107689599e-06, + "loss": 0.1091, + "step": 3652 + }, + { + "epoch": 1.45926517571885, + "grad_norm": 1.1185062550976665, + "learning_rate": 6.101598453183248e-06, + "loss": 0.0981, + "step": 3654 + }, + { + "epoch": 1.4600638977635783, + "grad_norm": 1.1929060063575452, + "learning_rate": 6.097064846999774e-06, + "loss": 0.1114, + "step": 3656 + }, + { + "epoch": 1.4608626198083068, + "grad_norm": 1.1672964109989814, + "learning_rate": 6.09253029305578e-06, + "loss": 0.1048, + "step": 3658 + }, + { + "epoch": 1.461661341853035, + "grad_norm": 1.3459483840304294, + "learning_rate": 6.087994795268696e-06, + "loss": 0.1175, + "step": 3660 + }, + { + "epoch": 1.4624600638977636, + "grad_norm": 1.1453927282681513, + "learning_rate": 6.0834583575567606e-06, + "loss": 0.1024, + "step": 3662 + }, + { + "epoch": 1.4632587859424921, + "grad_norm": 1.0361766854922676, + "learning_rate": 6.078920983839032e-06, + "loss": 0.0971, + "step": 3664 + }, + { + "epoch": 1.4640575079872205, + "grad_norm": 1.0777649114365144, + "learning_rate": 6.07438267803537e-06, + "loss": 0.1028, + "step": 3666 + }, + { + "epoch": 1.4648562300319488, + "grad_norm": 1.1306654064542798, + "learning_rate": 6.069843444066444e-06, + "loss": 0.1095, + "step": 3668 + }, + { + "epoch": 1.4656549520766773, + "grad_norm": 1.0195281420397648, + "learning_rate": 6.065303285853724e-06, + "loss": 0.1075, + "step": 3670 + }, + { + "epoch": 1.4664536741214058, + "grad_norm": 1.023251007369703, + "learning_rate": 6.060762207319479e-06, + "loss": 0.1002, + "step": 3672 + }, + { + "epoch": 1.4672523961661341, + "grad_norm": 1.0920777216930633, + "learning_rate": 6.056220212386769e-06, + "loss": 0.1051, + "step": 3674 + }, + { + "epoch": 1.4680511182108626, + "grad_norm": 1.1039024614877324, + "learning_rate": 6.0516773049794545e-06, + "loss": 0.098, + "step": 3676 + }, + { + "epoch": 1.468849840255591, + "grad_norm": 0.9870962605725515, + "learning_rate": 6.0471334890221735e-06, + "loss": 0.1052, + "step": 3678 + }, + { + "epoch": 1.4696485623003195, + "grad_norm": 1.0307618080780037, + "learning_rate": 6.042588768440358e-06, + "loss": 0.0995, + "step": 3680 + }, + { + "epoch": 1.470447284345048, + "grad_norm": 1.0513694638233513, + "learning_rate": 6.038043147160215e-06, + "loss": 0.0954, + "step": 3682 + }, + { + "epoch": 1.4712460063897763, + "grad_norm": 1.0657839501710005, + "learning_rate": 6.033496629108736e-06, + "loss": 0.0995, + "step": 3684 + }, + { + "epoch": 1.4720447284345048, + "grad_norm": 1.1159795703718869, + "learning_rate": 6.02894921821368e-06, + "loss": 0.0966, + "step": 3686 + }, + { + "epoch": 1.4728434504792332, + "grad_norm": 0.9598839228144777, + "learning_rate": 6.024400918403581e-06, + "loss": 0.0968, + "step": 3688 + }, + { + "epoch": 1.4736421725239617, + "grad_norm": 1.0384010675372382, + "learning_rate": 6.019851733607744e-06, + "loss": 0.0955, + "step": 3690 + }, + { + "epoch": 1.4744408945686902, + "grad_norm": 1.1078140257609426, + "learning_rate": 6.015301667756234e-06, + "loss": 0.0942, + "step": 3692 + }, + { + "epoch": 1.4752396166134185, + "grad_norm": 1.2701615597150369, + "learning_rate": 6.0107507247798765e-06, + "loss": 0.0995, + "step": 3694 + }, + { + "epoch": 1.476038338658147, + "grad_norm": 1.2102770076457041, + "learning_rate": 6.006198908610261e-06, + "loss": 0.1202, + "step": 3696 + }, + { + "epoch": 1.4768370607028753, + "grad_norm": 1.145740181813383, + "learning_rate": 6.0016462231797225e-06, + "loss": 0.1117, + "step": 3698 + }, + { + "epoch": 1.4776357827476039, + "grad_norm": 1.0833428968981065, + "learning_rate": 5.997092672421356e-06, + "loss": 0.1037, + "step": 3700 + }, + { + "epoch": 1.4784345047923324, + "grad_norm": 1.0801478244071052, + "learning_rate": 5.9925382602689974e-06, + "loss": 0.1019, + "step": 3702 + }, + { + "epoch": 1.4792332268370607, + "grad_norm": 1.2134338336543735, + "learning_rate": 5.987982990657229e-06, + "loss": 0.1139, + "step": 3704 + }, + { + "epoch": 1.480031948881789, + "grad_norm": 1.0064298338790032, + "learning_rate": 5.9834268675213745e-06, + "loss": 0.1045, + "step": 3706 + }, + { + "epoch": 1.4808306709265175, + "grad_norm": 1.1699403671691384, + "learning_rate": 5.978869894797494e-06, + "loss": 0.1054, + "step": 3708 + }, + { + "epoch": 1.481629392971246, + "grad_norm": 0.9388532857646246, + "learning_rate": 5.974312076422381e-06, + "loss": 0.098, + "step": 3710 + }, + { + "epoch": 1.4824281150159744, + "grad_norm": 1.0345840078184048, + "learning_rate": 5.9697534163335645e-06, + "loss": 0.1077, + "step": 3712 + }, + { + "epoch": 1.483226837060703, + "grad_norm": 1.0824548650493346, + "learning_rate": 5.965193918469292e-06, + "loss": 0.099, + "step": 3714 + }, + { + "epoch": 1.4840255591054312, + "grad_norm": 1.1507346874731688, + "learning_rate": 5.9606335867685424e-06, + "loss": 0.1013, + "step": 3716 + }, + { + "epoch": 1.4848242811501597, + "grad_norm": 0.9628660977174563, + "learning_rate": 5.9560724251710116e-06, + "loss": 0.0943, + "step": 3718 + }, + { + "epoch": 1.4856230031948883, + "grad_norm": 1.0718435481145705, + "learning_rate": 5.95151043761711e-06, + "loss": 0.1014, + "step": 3720 + }, + { + "epoch": 1.4864217252396166, + "grad_norm": 0.929707333376811, + "learning_rate": 5.9469476280479685e-06, + "loss": 0.0907, + "step": 3722 + }, + { + "epoch": 1.487220447284345, + "grad_norm": 1.0656203252614662, + "learning_rate": 5.9423840004054235e-06, + "loss": 0.1024, + "step": 3724 + }, + { + "epoch": 1.4880191693290734, + "grad_norm": 1.1218591644267362, + "learning_rate": 5.9378195586320155e-06, + "loss": 0.1046, + "step": 3726 + }, + { + "epoch": 1.488817891373802, + "grad_norm": 1.171155440361567, + "learning_rate": 5.933254306670995e-06, + "loss": 0.1007, + "step": 3728 + }, + { + "epoch": 1.4896166134185305, + "grad_norm": 1.0258926148385281, + "learning_rate": 5.9286882484663054e-06, + "loss": 0.0955, + "step": 3730 + }, + { + "epoch": 1.4904153354632588, + "grad_norm": 1.133084775825884, + "learning_rate": 5.924121387962594e-06, + "loss": 0.1002, + "step": 3732 + }, + { + "epoch": 1.4912140575079873, + "grad_norm": 1.1855051630642663, + "learning_rate": 5.919553729105194e-06, + "loss": 0.1012, + "step": 3734 + }, + { + "epoch": 1.4920127795527156, + "grad_norm": 1.1146676203875627, + "learning_rate": 5.914985275840135e-06, + "loss": 0.1081, + "step": 3736 + }, + { + "epoch": 1.4928115015974441, + "grad_norm": 1.1041278385511843, + "learning_rate": 5.910416032114128e-06, + "loss": 0.1026, + "step": 3738 + }, + { + "epoch": 1.4936102236421724, + "grad_norm": 0.9898027730365901, + "learning_rate": 5.905846001874566e-06, + "loss": 0.0888, + "step": 3740 + }, + { + "epoch": 1.494408945686901, + "grad_norm": 1.1585515500268038, + "learning_rate": 5.90127518906953e-06, + "loss": 0.1052, + "step": 3742 + }, + { + "epoch": 1.4952076677316293, + "grad_norm": 1.0347303041203528, + "learning_rate": 5.896703597647765e-06, + "loss": 0.1058, + "step": 3744 + }, + { + "epoch": 1.4960063897763578, + "grad_norm": 0.9935016480185027, + "learning_rate": 5.892131231558696e-06, + "loss": 0.1115, + "step": 3746 + }, + { + "epoch": 1.4968051118210863, + "grad_norm": 1.0501951176001236, + "learning_rate": 5.88755809475242e-06, + "loss": 0.1025, + "step": 3748 + }, + { + "epoch": 1.4976038338658146, + "grad_norm": 1.0008810336543572, + "learning_rate": 5.882984191179691e-06, + "loss": 0.0984, + "step": 3750 + }, + { + "epoch": 1.4984025559105432, + "grad_norm": 1.2054742114396801, + "learning_rate": 5.878409524791931e-06, + "loss": 0.1101, + "step": 3752 + }, + { + "epoch": 1.4992012779552715, + "grad_norm": 1.094413921625223, + "learning_rate": 5.8738340995412216e-06, + "loss": 0.1055, + "step": 3754 + }, + { + "epoch": 1.5, + "grad_norm": 1.0505101513083803, + "learning_rate": 5.869257919380298e-06, + "loss": 0.1026, + "step": 3756 + }, + { + "epoch": 1.5007987220447285, + "grad_norm": 0.970894387236391, + "learning_rate": 5.864680988262546e-06, + "loss": 0.0935, + "step": 3758 + }, + { + "epoch": 1.5015974440894568, + "grad_norm": 1.1554311867796712, + "learning_rate": 5.8601033101420055e-06, + "loss": 0.1188, + "step": 3760 + }, + { + "epoch": 1.5023961661341851, + "grad_norm": 1.0830901347700692, + "learning_rate": 5.855524888973358e-06, + "loss": 0.1106, + "step": 3762 + }, + { + "epoch": 1.5031948881789137, + "grad_norm": 1.057272294501186, + "learning_rate": 5.850945728711925e-06, + "loss": 0.1028, + "step": 3764 + }, + { + "epoch": 1.5039936102236422, + "grad_norm": 1.0336589663794098, + "learning_rate": 5.846365833313672e-06, + "loss": 0.1019, + "step": 3766 + }, + { + "epoch": 1.5047923322683707, + "grad_norm": 1.11484447745884, + "learning_rate": 5.841785206735192e-06, + "loss": 0.1009, + "step": 3768 + }, + { + "epoch": 1.505591054313099, + "grad_norm": 1.1990777552155663, + "learning_rate": 5.837203852933721e-06, + "loss": 0.1078, + "step": 3770 + }, + { + "epoch": 1.5063897763578273, + "grad_norm": 1.2341825295207347, + "learning_rate": 5.83262177586711e-06, + "loss": 0.1103, + "step": 3772 + }, + { + "epoch": 1.5071884984025559, + "grad_norm": 1.372034044865105, + "learning_rate": 5.828038979493844e-06, + "loss": 0.1056, + "step": 3774 + }, + { + "epoch": 1.5079872204472844, + "grad_norm": 1.1741674239048192, + "learning_rate": 5.823455467773027e-06, + "loss": 0.1079, + "step": 3776 + }, + { + "epoch": 1.508785942492013, + "grad_norm": 1.0252917230334802, + "learning_rate": 5.81887124466438e-06, + "loss": 0.0914, + "step": 3778 + }, + { + "epoch": 1.5095846645367412, + "grad_norm": 1.1697427527897497, + "learning_rate": 5.814286314128239e-06, + "loss": 0.0991, + "step": 3780 + }, + { + "epoch": 1.5103833865814695, + "grad_norm": 1.042769204041001, + "learning_rate": 5.809700680125552e-06, + "loss": 0.1059, + "step": 3782 + }, + { + "epoch": 1.511182108626198, + "grad_norm": 1.0296503251696472, + "learning_rate": 5.805114346617874e-06, + "loss": 0.0946, + "step": 3784 + }, + { + "epoch": 1.5119808306709266, + "grad_norm": 1.009763042669415, + "learning_rate": 5.800527317567365e-06, + "loss": 0.0968, + "step": 3786 + }, + { + "epoch": 1.5127795527156551, + "grad_norm": 1.0866525391827386, + "learning_rate": 5.795939596936783e-06, + "loss": 0.1014, + "step": 3788 + }, + { + "epoch": 1.5135782747603834, + "grad_norm": 1.1514100776695269, + "learning_rate": 5.791351188689489e-06, + "loss": 0.1016, + "step": 3790 + }, + { + "epoch": 1.5143769968051117, + "grad_norm": 1.2403482979785827, + "learning_rate": 5.786762096789431e-06, + "loss": 0.1046, + "step": 3792 + }, + { + "epoch": 1.5151757188498403, + "grad_norm": 1.1732506896242403, + "learning_rate": 5.782172325201155e-06, + "loss": 0.1127, + "step": 3794 + }, + { + "epoch": 1.5159744408945688, + "grad_norm": 1.0881019594599013, + "learning_rate": 5.777581877889788e-06, + "loss": 0.0894, + "step": 3796 + }, + { + "epoch": 1.516773162939297, + "grad_norm": 1.1933368615094555, + "learning_rate": 5.772990758821046e-06, + "loss": 0.1078, + "step": 3798 + }, + { + "epoch": 1.5175718849840254, + "grad_norm": 0.9420287917337775, + "learning_rate": 5.768398971961221e-06, + "loss": 0.0926, + "step": 3800 + }, + { + "epoch": 1.518370607028754, + "grad_norm": 1.0673845641768755, + "learning_rate": 5.763806521277184e-06, + "loss": 0.0958, + "step": 3802 + }, + { + "epoch": 1.5191693290734825, + "grad_norm": 1.0756212442333353, + "learning_rate": 5.759213410736377e-06, + "loss": 0.0984, + "step": 3804 + }, + { + "epoch": 1.519968051118211, + "grad_norm": 1.0303021520595748, + "learning_rate": 5.7546196443068195e-06, + "loss": 0.099, + "step": 3806 + }, + { + "epoch": 1.5207667731629393, + "grad_norm": 1.1602092966550535, + "learning_rate": 5.750025225957086e-06, + "loss": 0.1052, + "step": 3808 + }, + { + "epoch": 1.5215654952076676, + "grad_norm": 1.1125067715645671, + "learning_rate": 5.745430159656324e-06, + "loss": 0.0988, + "step": 3810 + }, + { + "epoch": 1.5223642172523961, + "grad_norm": 1.0076518432605175, + "learning_rate": 5.740834449374237e-06, + "loss": 0.1006, + "step": 3812 + }, + { + "epoch": 1.5231629392971247, + "grad_norm": 1.1636274304360332, + "learning_rate": 5.7362380990810836e-06, + "loss": 0.0982, + "step": 3814 + }, + { + "epoch": 1.5239616613418532, + "grad_norm": 1.589543826243573, + "learning_rate": 5.731641112747679e-06, + "loss": 0.1033, + "step": 3816 + }, + { + "epoch": 1.5247603833865815, + "grad_norm": 1.130992557679762, + "learning_rate": 5.7270434943453844e-06, + "loss": 0.098, + "step": 3818 + }, + { + "epoch": 1.5255591054313098, + "grad_norm": 0.9432691902452544, + "learning_rate": 5.722445247846107e-06, + "loss": 0.0888, + "step": 3820 + }, + { + "epoch": 1.5263578274760383, + "grad_norm": 1.106347020699136, + "learning_rate": 5.717846377222302e-06, + "loss": 0.0936, + "step": 3822 + }, + { + "epoch": 1.5271565495207668, + "grad_norm": 1.1052796082215754, + "learning_rate": 5.713246886446954e-06, + "loss": 0.1007, + "step": 3824 + }, + { + "epoch": 1.5279552715654952, + "grad_norm": 1.1941612947080527, + "learning_rate": 5.708646779493592e-06, + "loss": 0.1107, + "step": 3826 + }, + { + "epoch": 1.5287539936102237, + "grad_norm": 1.0917404470546062, + "learning_rate": 5.704046060336276e-06, + "loss": 0.0985, + "step": 3828 + }, + { + "epoch": 1.529552715654952, + "grad_norm": 1.026940535614031, + "learning_rate": 5.699444732949592e-06, + "loss": 0.1008, + "step": 3830 + }, + { + "epoch": 1.5303514376996805, + "grad_norm": 1.1825164845911036, + "learning_rate": 5.694842801308651e-06, + "loss": 0.1039, + "step": 3832 + }, + { + "epoch": 1.531150159744409, + "grad_norm": 1.2523857121893918, + "learning_rate": 5.69024026938909e-06, + "loss": 0.1073, + "step": 3834 + }, + { + "epoch": 1.5319488817891374, + "grad_norm": 0.9618590795563883, + "learning_rate": 5.6856371411670605e-06, + "loss": 0.0895, + "step": 3836 + }, + { + "epoch": 1.5327476038338657, + "grad_norm": 1.087295231368687, + "learning_rate": 5.681033420619233e-06, + "loss": 0.1079, + "step": 3838 + }, + { + "epoch": 1.5335463258785942, + "grad_norm": 1.0751240176712655, + "learning_rate": 5.676429111722786e-06, + "loss": 0.119, + "step": 3840 + }, + { + "epoch": 1.5343450479233227, + "grad_norm": 0.9761262443774417, + "learning_rate": 5.67182421845541e-06, + "loss": 0.0985, + "step": 3842 + }, + { + "epoch": 1.5351437699680512, + "grad_norm": 1.064750803984951, + "learning_rate": 5.6672187447952944e-06, + "loss": 0.1164, + "step": 3844 + }, + { + "epoch": 1.5359424920127795, + "grad_norm": 1.0336803651600035, + "learning_rate": 5.662612694721139e-06, + "loss": 0.0933, + "step": 3846 + }, + { + "epoch": 1.5367412140575079, + "grad_norm": 1.216380707446393, + "learning_rate": 5.6580060722121325e-06, + "loss": 0.0985, + "step": 3848 + }, + { + "epoch": 1.5375399361022364, + "grad_norm": 1.0486840682788419, + "learning_rate": 5.6533988812479626e-06, + "loss": 0.0884, + "step": 3850 + }, + { + "epoch": 1.538338658146965, + "grad_norm": 1.0017531938240458, + "learning_rate": 5.648791125808809e-06, + "loss": 0.0971, + "step": 3852 + }, + { + "epoch": 1.5391373801916934, + "grad_norm": 0.9711321520409679, + "learning_rate": 5.644182809875338e-06, + "loss": 0.0967, + "step": 3854 + }, + { + "epoch": 1.5399361022364217, + "grad_norm": 1.0444214456541656, + "learning_rate": 5.639573937428699e-06, + "loss": 0.1005, + "step": 3856 + }, + { + "epoch": 1.54073482428115, + "grad_norm": 1.1384563164764703, + "learning_rate": 5.634964512450522e-06, + "loss": 0.0937, + "step": 3858 + }, + { + "epoch": 1.5415335463258786, + "grad_norm": 1.1704875948472755, + "learning_rate": 5.630354538922916e-06, + "loss": 0.1069, + "step": 3860 + }, + { + "epoch": 1.542332268370607, + "grad_norm": 1.153542545533936, + "learning_rate": 5.6257440208284645e-06, + "loss": 0.0944, + "step": 3862 + }, + { + "epoch": 1.5431309904153354, + "grad_norm": 1.2017126948215129, + "learning_rate": 5.621132962150216e-06, + "loss": 0.1056, + "step": 3864 + }, + { + "epoch": 1.543929712460064, + "grad_norm": 1.1096553295923908, + "learning_rate": 5.616521366871697e-06, + "loss": 0.1084, + "step": 3866 + }, + { + "epoch": 1.5447284345047922, + "grad_norm": 1.2932559709159934, + "learning_rate": 5.611909238976885e-06, + "loss": 0.1077, + "step": 3868 + }, + { + "epoch": 1.5455271565495208, + "grad_norm": 1.1009979044862985, + "learning_rate": 5.607296582450224e-06, + "loss": 0.0946, + "step": 3870 + }, + { + "epoch": 1.5463258785942493, + "grad_norm": 1.1753388398369289, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.1056, + "step": 3872 + }, + { + "epoch": 1.5471246006389776, + "grad_norm": 1.1300535782520567, + "learning_rate": 5.598069699441414e-06, + "loss": 0.1072, + "step": 3874 + }, + { + "epoch": 1.547923322683706, + "grad_norm": 1.1361407815794689, + "learning_rate": 5.5934554809304184e-06, + "loss": 0.1057, + "step": 3876 + }, + { + "epoch": 1.5487220447284344, + "grad_norm": 1.0774923728003276, + "learning_rate": 5.5888407497298824e-06, + "loss": 0.0978, + "step": 3878 + }, + { + "epoch": 1.549520766773163, + "grad_norm": 1.1078465378909663, + "learning_rate": 5.584225509826497e-06, + "loss": 0.0847, + "step": 3880 + }, + { + "epoch": 1.5503194888178915, + "grad_norm": 1.117154173356503, + "learning_rate": 5.579609765207393e-06, + "loss": 0.1053, + "step": 3882 + }, + { + "epoch": 1.5511182108626198, + "grad_norm": 1.1442174998289312, + "learning_rate": 5.574993519860139e-06, + "loss": 0.1024, + "step": 3884 + }, + { + "epoch": 1.5519169329073481, + "grad_norm": 1.1475487693664175, + "learning_rate": 5.5703767777727354e-06, + "loss": 0.1073, + "step": 3886 + }, + { + "epoch": 1.5527156549520766, + "grad_norm": 1.1044103730150632, + "learning_rate": 5.565759542933612e-06, + "loss": 0.1073, + "step": 3888 + }, + { + "epoch": 1.5535143769968052, + "grad_norm": 1.1544497802356082, + "learning_rate": 5.561141819331624e-06, + "loss": 0.1055, + "step": 3890 + }, + { + "epoch": 1.5543130990415337, + "grad_norm": 1.13696637094894, + "learning_rate": 5.556523610956049e-06, + "loss": 0.1032, + "step": 3892 + }, + { + "epoch": 1.555111821086262, + "grad_norm": 1.0623393891516841, + "learning_rate": 5.55190492179658e-06, + "loss": 0.1139, + "step": 3894 + }, + { + "epoch": 1.5559105431309903, + "grad_norm": 1.0054614979615821, + "learning_rate": 5.547285755843334e-06, + "loss": 0.0888, + "step": 3896 + }, + { + "epoch": 1.5567092651757188, + "grad_norm": 0.9923971125933808, + "learning_rate": 5.542666117086832e-06, + "loss": 0.0884, + "step": 3898 + }, + { + "epoch": 1.5575079872204474, + "grad_norm": 1.1566532715523212, + "learning_rate": 5.538046009518007e-06, + "loss": 0.1053, + "step": 3900 + }, + { + "epoch": 1.5583067092651757, + "grad_norm": 1.0988640530351912, + "learning_rate": 5.5334254371281934e-06, + "loss": 0.0987, + "step": 3902 + }, + { + "epoch": 1.5591054313099042, + "grad_norm": 1.1389960010587739, + "learning_rate": 5.5288044039091335e-06, + "loss": 0.1075, + "step": 3904 + }, + { + "epoch": 1.5599041533546325, + "grad_norm": 1.2172334905611906, + "learning_rate": 5.524182913852961e-06, + "loss": 0.1017, + "step": 3906 + }, + { + "epoch": 1.560702875399361, + "grad_norm": 1.0961769840109552, + "learning_rate": 5.519560970952208e-06, + "loss": 0.1074, + "step": 3908 + }, + { + "epoch": 1.5615015974440896, + "grad_norm": 0.9984956732002762, + "learning_rate": 5.514938579199798e-06, + "loss": 0.1151, + "step": 3910 + }, + { + "epoch": 1.5623003194888179, + "grad_norm": 1.0275799079683399, + "learning_rate": 5.510315742589042e-06, + "loss": 0.1024, + "step": 3912 + }, + { + "epoch": 1.5630990415335462, + "grad_norm": 1.11512381478294, + "learning_rate": 5.505692465113633e-06, + "loss": 0.1132, + "step": 3914 + }, + { + "epoch": 1.5638977635782747, + "grad_norm": 1.0178232850189146, + "learning_rate": 5.5010687507676466e-06, + "loss": 0.1075, + "step": 3916 + }, + { + "epoch": 1.5646964856230032, + "grad_norm": 1.0142014809665243, + "learning_rate": 5.496444603545535e-06, + "loss": 0.0962, + "step": 3918 + }, + { + "epoch": 1.5654952076677318, + "grad_norm": 1.046870424333617, + "learning_rate": 5.491820027442126e-06, + "loss": 0.1043, + "step": 3920 + }, + { + "epoch": 1.56629392971246, + "grad_norm": 1.1510480151229603, + "learning_rate": 5.487195026452619e-06, + "loss": 0.102, + "step": 3922 + }, + { + "epoch": 1.5670926517571884, + "grad_norm": 1.27959847253193, + "learning_rate": 5.482569604572577e-06, + "loss": 0.1021, + "step": 3924 + }, + { + "epoch": 1.567891373801917, + "grad_norm": 0.9918976439707066, + "learning_rate": 5.477943765797926e-06, + "loss": 0.1, + "step": 3926 + }, + { + "epoch": 1.5686900958466454, + "grad_norm": 1.080408951439636, + "learning_rate": 5.473317514124958e-06, + "loss": 0.1054, + "step": 3928 + }, + { + "epoch": 1.569488817891374, + "grad_norm": 1.081550022627091, + "learning_rate": 5.4686908535503135e-06, + "loss": 0.1019, + "step": 3930 + }, + { + "epoch": 1.5702875399361023, + "grad_norm": 1.0811985668125383, + "learning_rate": 5.464063788070996e-06, + "loss": 0.0936, + "step": 3932 + }, + { + "epoch": 1.5710862619808306, + "grad_norm": 0.9821534674944753, + "learning_rate": 5.459436321684348e-06, + "loss": 0.0933, + "step": 3934 + }, + { + "epoch": 1.571884984025559, + "grad_norm": 1.0914512560209013, + "learning_rate": 5.454808458388069e-06, + "loss": 0.1148, + "step": 3936 + }, + { + "epoch": 1.5726837060702876, + "grad_norm": 1.164283845889731, + "learning_rate": 5.4501802021801935e-06, + "loss": 0.1018, + "step": 3938 + }, + { + "epoch": 1.573482428115016, + "grad_norm": 0.970687102295011, + "learning_rate": 5.445551557059098e-06, + "loss": 0.0936, + "step": 3940 + }, + { + "epoch": 1.5742811501597445, + "grad_norm": 1.1042280103745048, + "learning_rate": 5.440922527023494e-06, + "loss": 0.093, + "step": 3942 + }, + { + "epoch": 1.5750798722044728, + "grad_norm": 1.2131440637336284, + "learning_rate": 5.436293116072431e-06, + "loss": 0.1136, + "step": 3944 + }, + { + "epoch": 1.5758785942492013, + "grad_norm": 1.055324845043691, + "learning_rate": 5.431663328205279e-06, + "loss": 0.1043, + "step": 3946 + }, + { + "epoch": 1.5766773162939298, + "grad_norm": 0.9811618715390004, + "learning_rate": 5.42703316742174e-06, + "loss": 0.093, + "step": 3948 + }, + { + "epoch": 1.5774760383386581, + "grad_norm": 1.2173508502121098, + "learning_rate": 5.4224026377218365e-06, + "loss": 0.1045, + "step": 3950 + }, + { + "epoch": 1.5782747603833864, + "grad_norm": 1.1661185719259703, + "learning_rate": 5.417771743105908e-06, + "loss": 0.1098, + "step": 3952 + }, + { + "epoch": 1.579073482428115, + "grad_norm": 1.0058471682242138, + "learning_rate": 5.413140487574608e-06, + "loss": 0.1106, + "step": 3954 + }, + { + "epoch": 1.5798722044728435, + "grad_norm": 1.0394406333548927, + "learning_rate": 5.408508875128911e-06, + "loss": 0.0929, + "step": 3956 + }, + { + "epoch": 1.580670926517572, + "grad_norm": 1.1297610914562695, + "learning_rate": 5.403876909770087e-06, + "loss": 0.101, + "step": 3958 + }, + { + "epoch": 1.5814696485623003, + "grad_norm": 1.1543975422216644, + "learning_rate": 5.399244595499721e-06, + "loss": 0.1121, + "step": 3960 + }, + { + "epoch": 1.5822683706070286, + "grad_norm": 1.1306410767900936, + "learning_rate": 5.394611936319692e-06, + "loss": 0.1016, + "step": 3962 + }, + { + "epoch": 1.5830670926517572, + "grad_norm": 0.9448150345192816, + "learning_rate": 5.389978936232185e-06, + "loss": 0.0893, + "step": 3964 + }, + { + "epoch": 1.5838658146964857, + "grad_norm": 0.9463251751033753, + "learning_rate": 5.385345599239669e-06, + "loss": 0.0946, + "step": 3966 + }, + { + "epoch": 1.5846645367412142, + "grad_norm": 1.0596847272661667, + "learning_rate": 5.380711929344915e-06, + "loss": 0.1151, + "step": 3968 + }, + { + "epoch": 1.5854632587859425, + "grad_norm": 1.0752597682928209, + "learning_rate": 5.376077930550973e-06, + "loss": 0.107, + "step": 3970 + }, + { + "epoch": 1.5862619808306708, + "grad_norm": 1.116596755502733, + "learning_rate": 5.371443606861186e-06, + "loss": 0.1019, + "step": 3972 + }, + { + "epoch": 1.5870607028753994, + "grad_norm": 1.0051885963957903, + "learning_rate": 5.366808962279166e-06, + "loss": 0.0962, + "step": 3974 + }, + { + "epoch": 1.5878594249201279, + "grad_norm": 1.027781803302719, + "learning_rate": 5.362174000808813e-06, + "loss": 0.0971, + "step": 3976 + }, + { + "epoch": 1.5886581469648562, + "grad_norm": 0.8827091346960793, + "learning_rate": 5.3575387264542934e-06, + "loss": 0.0855, + "step": 3978 + }, + { + "epoch": 1.5894568690095847, + "grad_norm": 1.072987187940347, + "learning_rate": 5.352903143220051e-06, + "loss": 0.0967, + "step": 3980 + }, + { + "epoch": 1.590255591054313, + "grad_norm": 1.0526867412563083, + "learning_rate": 5.348267255110787e-06, + "loss": 0.092, + "step": 3982 + }, + { + "epoch": 1.5910543130990416, + "grad_norm": 1.1489504347454111, + "learning_rate": 5.343631066131476e-06, + "loss": 0.1105, + "step": 3984 + }, + { + "epoch": 1.59185303514377, + "grad_norm": 1.1540840295187986, + "learning_rate": 5.338994580287345e-06, + "loss": 0.1011, + "step": 3986 + }, + { + "epoch": 1.5926517571884984, + "grad_norm": 1.1394245571406438, + "learning_rate": 5.334357801583882e-06, + "loss": 0.0996, + "step": 3988 + }, + { + "epoch": 1.5934504792332267, + "grad_norm": 1.0287583891620165, + "learning_rate": 5.329720734026824e-06, + "loss": 0.0957, + "step": 3990 + }, + { + "epoch": 1.5942492012779552, + "grad_norm": 1.0779004754661263, + "learning_rate": 5.325083381622165e-06, + "loss": 0.0979, + "step": 3992 + }, + { + "epoch": 1.5950479233226837, + "grad_norm": 1.200252910830259, + "learning_rate": 5.320445748376133e-06, + "loss": 0.0968, + "step": 3994 + }, + { + "epoch": 1.5958466453674123, + "grad_norm": 1.2695805654962276, + "learning_rate": 5.3158078382952095e-06, + "loss": 0.1137, + "step": 3996 + }, + { + "epoch": 1.5966453674121406, + "grad_norm": 1.0386371478246852, + "learning_rate": 5.311169655386112e-06, + "loss": 0.0981, + "step": 3998 + }, + { + "epoch": 1.5974440894568689, + "grad_norm": 1.1857255254365702, + "learning_rate": 5.30653120365579e-06, + "loss": 0.1205, + "step": 4000 + }, + { + "epoch": 1.5974440894568689, + "eval_loss": 0.15445688366889954, + "eval_runtime": 417.6616, + "eval_samples_per_second": 42.635, + "eval_steps_per_second": 5.33, + "step": 4000 + }, + { + "epoch": 1.5982428115015974, + "grad_norm": 1.1097474658690198, + "learning_rate": 5.301892487111431e-06, + "loss": 0.1025, + "step": 4002 + }, + { + "epoch": 1.599041533546326, + "grad_norm": 1.1919685218314882, + "learning_rate": 5.2972535097604474e-06, + "loss": 0.112, + "step": 4004 + }, + { + "epoch": 1.5998402555910545, + "grad_norm": 0.9947826284264967, + "learning_rate": 5.292614275610476e-06, + "loss": 0.1021, + "step": 4006 + }, + { + "epoch": 1.6006389776357828, + "grad_norm": 0.9427973792484328, + "learning_rate": 5.28797478866938e-06, + "loss": 0.091, + "step": 4008 + }, + { + "epoch": 1.601437699680511, + "grad_norm": 1.1411139408840176, + "learning_rate": 5.283335052945238e-06, + "loss": 0.0975, + "step": 4010 + }, + { + "epoch": 1.6022364217252396, + "grad_norm": 1.1012457970159555, + "learning_rate": 5.278695072446342e-06, + "loss": 0.0951, + "step": 4012 + }, + { + "epoch": 1.6030351437699681, + "grad_norm": 1.1626019144814381, + "learning_rate": 5.2740548511812e-06, + "loss": 0.0998, + "step": 4014 + }, + { + "epoch": 1.6038338658146964, + "grad_norm": 1.1201103649096615, + "learning_rate": 5.269414393158523e-06, + "loss": 0.0987, + "step": 4016 + }, + { + "epoch": 1.604632587859425, + "grad_norm": 1.1630697149501579, + "learning_rate": 5.264773702387232e-06, + "loss": 0.1022, + "step": 4018 + }, + { + "epoch": 1.6054313099041533, + "grad_norm": 1.2522557555471332, + "learning_rate": 5.2601327828764415e-06, + "loss": 0.1042, + "step": 4020 + }, + { + "epoch": 1.6062300319488818, + "grad_norm": 1.1002664728813976, + "learning_rate": 5.255491638635472e-06, + "loss": 0.1131, + "step": 4022 + }, + { + "epoch": 1.6070287539936103, + "grad_norm": 1.1476460289094692, + "learning_rate": 5.250850273673831e-06, + "loss": 0.1041, + "step": 4024 + }, + { + "epoch": 1.6078274760383386, + "grad_norm": 1.2997227467476715, + "learning_rate": 5.246208692001224e-06, + "loss": 0.109, + "step": 4026 + }, + { + "epoch": 1.608626198083067, + "grad_norm": 1.094935343872026, + "learning_rate": 5.241566897627536e-06, + "loss": 0.1007, + "step": 4028 + }, + { + "epoch": 1.6094249201277955, + "grad_norm": 1.083296893505705, + "learning_rate": 5.236924894562841e-06, + "loss": 0.102, + "step": 4030 + }, + { + "epoch": 1.610223642172524, + "grad_norm": 1.0723795202774216, + "learning_rate": 5.232282686817392e-06, + "loss": 0.0996, + "step": 4032 + }, + { + "epoch": 1.6110223642172525, + "grad_norm": 1.078393017722415, + "learning_rate": 5.227640278401616e-06, + "loss": 0.1047, + "step": 4034 + }, + { + "epoch": 1.6118210862619808, + "grad_norm": 1.1241219264829283, + "learning_rate": 5.222997673326118e-06, + "loss": 0.1125, + "step": 4036 + }, + { + "epoch": 1.6126198083067091, + "grad_norm": 1.1494086764740628, + "learning_rate": 5.218354875601672e-06, + "loss": 0.1077, + "step": 4038 + }, + { + "epoch": 1.6134185303514377, + "grad_norm": 1.0313683486241818, + "learning_rate": 5.213711889239214e-06, + "loss": 0.1056, + "step": 4040 + }, + { + "epoch": 1.6142172523961662, + "grad_norm": 1.0785913080866496, + "learning_rate": 5.209068718249849e-06, + "loss": 0.1026, + "step": 4042 + }, + { + "epoch": 1.6150159744408947, + "grad_norm": 1.1093921293966087, + "learning_rate": 5.2044253666448364e-06, + "loss": 0.1019, + "step": 4044 + }, + { + "epoch": 1.615814696485623, + "grad_norm": 1.048516005384363, + "learning_rate": 5.1997818384355945e-06, + "loss": 0.1198, + "step": 4046 + }, + { + "epoch": 1.6166134185303513, + "grad_norm": 1.0667431272462546, + "learning_rate": 5.195138137633695e-06, + "loss": 0.1, + "step": 4048 + }, + { + "epoch": 1.6174121405750799, + "grad_norm": 1.162801685632335, + "learning_rate": 5.190494268250856e-06, + "loss": 0.108, + "step": 4050 + }, + { + "epoch": 1.6182108626198084, + "grad_norm": 1.0168007347140544, + "learning_rate": 5.185850234298943e-06, + "loss": 0.0927, + "step": 4052 + }, + { + "epoch": 1.6190095846645367, + "grad_norm": 1.0037984889528644, + "learning_rate": 5.1812060397899624e-06, + "loss": 0.0876, + "step": 4054 + }, + { + "epoch": 1.619808306709265, + "grad_norm": 1.0382499080256122, + "learning_rate": 5.17656168873606e-06, + "loss": 0.107, + "step": 4056 + }, + { + "epoch": 1.6206070287539935, + "grad_norm": 1.1185835251899559, + "learning_rate": 5.171917185149518e-06, + "loss": 0.1044, + "step": 4058 + }, + { + "epoch": 1.621405750798722, + "grad_norm": 1.0657788685206113, + "learning_rate": 5.167272533042748e-06, + "loss": 0.097, + "step": 4060 + }, + { + "epoch": 1.6222044728434506, + "grad_norm": 1.0542383484498254, + "learning_rate": 5.162627736428293e-06, + "loss": 0.0946, + "step": 4062 + }, + { + "epoch": 1.623003194888179, + "grad_norm": 1.1196684728128978, + "learning_rate": 5.157982799318817e-06, + "loss": 0.1071, + "step": 4064 + }, + { + "epoch": 1.6238019169329072, + "grad_norm": 1.0973928801710329, + "learning_rate": 5.153337725727109e-06, + "loss": 0.0965, + "step": 4066 + }, + { + "epoch": 1.6246006389776357, + "grad_norm": 1.085899130068226, + "learning_rate": 5.148692519666072e-06, + "loss": 0.087, + "step": 4068 + }, + { + "epoch": 1.6253993610223643, + "grad_norm": 1.0337553281158798, + "learning_rate": 5.1440471851487286e-06, + "loss": 0.1031, + "step": 4070 + }, + { + "epoch": 1.6261980830670928, + "grad_norm": 1.0455524042989242, + "learning_rate": 5.139401726188208e-06, + "loss": 0.1012, + "step": 4072 + }, + { + "epoch": 1.626996805111821, + "grad_norm": 1.1232283088505088, + "learning_rate": 5.1347561467977495e-06, + "loss": 0.1048, + "step": 4074 + }, + { + "epoch": 1.6277955271565494, + "grad_norm": 1.1377273856980297, + "learning_rate": 5.130110450990694e-06, + "loss": 0.1015, + "step": 4076 + }, + { + "epoch": 1.628594249201278, + "grad_norm": 1.0376776381456139, + "learning_rate": 5.1254646427804855e-06, + "loss": 0.0892, + "step": 4078 + }, + { + "epoch": 1.6293929712460065, + "grad_norm": 1.0639780774252217, + "learning_rate": 5.120818726180662e-06, + "loss": 0.0909, + "step": 4080 + }, + { + "epoch": 1.630191693290735, + "grad_norm": 1.0527532913896227, + "learning_rate": 5.116172705204859e-06, + "loss": 0.092, + "step": 4082 + }, + { + "epoch": 1.6309904153354633, + "grad_norm": 1.129982374301854, + "learning_rate": 5.111526583866801e-06, + "loss": 0.1016, + "step": 4084 + }, + { + "epoch": 1.6317891373801916, + "grad_norm": 1.0125124329212987, + "learning_rate": 5.106880366180297e-06, + "loss": 0.0933, + "step": 4086 + }, + { + "epoch": 1.6325878594249201, + "grad_norm": 1.0905837841114607, + "learning_rate": 5.1022340561592396e-06, + "loss": 0.1028, + "step": 4088 + }, + { + "epoch": 1.6333865814696487, + "grad_norm": 1.058617488709683, + "learning_rate": 5.097587657817605e-06, + "loss": 0.0988, + "step": 4090 + }, + { + "epoch": 1.634185303514377, + "grad_norm": 1.1990334757384018, + "learning_rate": 5.09294117516944e-06, + "loss": 0.1045, + "step": 4092 + }, + { + "epoch": 1.6349840255591053, + "grad_norm": 1.1978345768548917, + "learning_rate": 5.08829461222887e-06, + "loss": 0.1134, + "step": 4094 + }, + { + "epoch": 1.6357827476038338, + "grad_norm": 1.082413217695199, + "learning_rate": 5.083647973010085e-06, + "loss": 0.1002, + "step": 4096 + }, + { + "epoch": 1.6365814696485623, + "grad_norm": 1.1386432805954145, + "learning_rate": 5.079001261527345e-06, + "loss": 0.1029, + "step": 4098 + }, + { + "epoch": 1.6373801916932909, + "grad_norm": 1.1557604620641984, + "learning_rate": 5.074354481794969e-06, + "loss": 0.0951, + "step": 4100 + }, + { + "epoch": 1.6381789137380192, + "grad_norm": 1.2243404366242359, + "learning_rate": 5.069707637827336e-06, + "loss": 0.1121, + "step": 4102 + }, + { + "epoch": 1.6389776357827475, + "grad_norm": 1.157977041885688, + "learning_rate": 5.065060733638878e-06, + "loss": 0.1104, + "step": 4104 + }, + { + "epoch": 1.639776357827476, + "grad_norm": 1.094920449521866, + "learning_rate": 5.0604137732440875e-06, + "loss": 0.0898, + "step": 4106 + }, + { + "epoch": 1.6405750798722045, + "grad_norm": 0.9795046872308669, + "learning_rate": 5.055766760657497e-06, + "loss": 0.0933, + "step": 4108 + }, + { + "epoch": 1.641373801916933, + "grad_norm": 1.2246816953116304, + "learning_rate": 5.051119699893686e-06, + "loss": 0.1047, + "step": 4110 + }, + { + "epoch": 1.6421725239616614, + "grad_norm": 0.9474742886147536, + "learning_rate": 5.046472594967279e-06, + "loss": 0.0825, + "step": 4112 + }, + { + "epoch": 1.6429712460063897, + "grad_norm": 1.050959142356909, + "learning_rate": 5.041825449892933e-06, + "loss": 0.102, + "step": 4114 + }, + { + "epoch": 1.6437699680511182, + "grad_norm": 1.0798115551123924, + "learning_rate": 5.037178268685345e-06, + "loss": 0.0943, + "step": 4116 + }, + { + "epoch": 1.6445686900958467, + "grad_norm": 0.9443315270439328, + "learning_rate": 5.032531055359241e-06, + "loss": 0.094, + "step": 4118 + }, + { + "epoch": 1.645367412140575, + "grad_norm": 1.0292007756942556, + "learning_rate": 5.027883813929374e-06, + "loss": 0.0981, + "step": 4120 + }, + { + "epoch": 1.6461661341853036, + "grad_norm": 1.036373592711972, + "learning_rate": 5.0232365484105235e-06, + "loss": 0.0955, + "step": 4122 + }, + { + "epoch": 1.6469648562300319, + "grad_norm": 1.0940288914430132, + "learning_rate": 5.018589262817488e-06, + "loss": 0.106, + "step": 4124 + }, + { + "epoch": 1.6477635782747604, + "grad_norm": 1.1951722585755589, + "learning_rate": 5.013941961165082e-06, + "loss": 0.1127, + "step": 4126 + }, + { + "epoch": 1.648562300319489, + "grad_norm": 1.0463562631972685, + "learning_rate": 5.009294647468137e-06, + "loss": 0.0902, + "step": 4128 + }, + { + "epoch": 1.6493610223642172, + "grad_norm": 1.1257302379959406, + "learning_rate": 5.004647325741495e-06, + "loss": 0.0911, + "step": 4130 + }, + { + "epoch": 1.6501597444089455, + "grad_norm": 0.9285397218586545, + "learning_rate": 5e-06, + "loss": 0.0821, + "step": 4132 + }, + { + "epoch": 1.650958466453674, + "grad_norm": 1.13221857976679, + "learning_rate": 4.9953526742585065e-06, + "loss": 0.093, + "step": 4134 + }, + { + "epoch": 1.6517571884984026, + "grad_norm": 1.119100390728916, + "learning_rate": 4.990705352531864e-06, + "loss": 0.096, + "step": 4136 + }, + { + "epoch": 1.6525559105431311, + "grad_norm": 1.0890152144340473, + "learning_rate": 4.9860580388349196e-06, + "loss": 0.0968, + "step": 4138 + }, + { + "epoch": 1.6533546325878594, + "grad_norm": 1.0415531324033633, + "learning_rate": 4.981410737182515e-06, + "loss": 0.1018, + "step": 4140 + }, + { + "epoch": 1.6541533546325877, + "grad_norm": 1.1279757348197161, + "learning_rate": 4.976763451589478e-06, + "loss": 0.0989, + "step": 4142 + }, + { + "epoch": 1.6549520766773163, + "grad_norm": 1.1174158609629532, + "learning_rate": 4.972116186070626e-06, + "loss": 0.0981, + "step": 4144 + }, + { + "epoch": 1.6557507987220448, + "grad_norm": 0.978788514984628, + "learning_rate": 4.96746894464076e-06, + "loss": 0.086, + "step": 4146 + }, + { + "epoch": 1.6565495207667733, + "grad_norm": 1.0391532776923562, + "learning_rate": 4.962821731314656e-06, + "loss": 0.0893, + "step": 4148 + }, + { + "epoch": 1.6573482428115016, + "grad_norm": 1.1408365847624822, + "learning_rate": 4.958174550107069e-06, + "loss": 0.0926, + "step": 4150 + }, + { + "epoch": 1.65814696485623, + "grad_norm": 1.0976480883499857, + "learning_rate": 4.953527405032723e-06, + "loss": 0.0969, + "step": 4152 + }, + { + "epoch": 1.6589456869009584, + "grad_norm": 1.099197170300517, + "learning_rate": 4.948880300106315e-06, + "loss": 0.0889, + "step": 4154 + }, + { + "epoch": 1.659744408945687, + "grad_norm": 0.9962207801035776, + "learning_rate": 4.944233239342505e-06, + "loss": 0.0828, + "step": 4156 + }, + { + "epoch": 1.6605431309904153, + "grad_norm": 1.164367259050149, + "learning_rate": 4.939586226755913e-06, + "loss": 0.0993, + "step": 4158 + }, + { + "epoch": 1.6613418530351438, + "grad_norm": 1.0915643883560848, + "learning_rate": 4.934939266361123e-06, + "loss": 0.0958, + "step": 4160 + }, + { + "epoch": 1.6621405750798721, + "grad_norm": 1.0384971354394401, + "learning_rate": 4.930292362172667e-06, + "loss": 0.0987, + "step": 4162 + }, + { + "epoch": 1.6629392971246006, + "grad_norm": 0.9714525373847863, + "learning_rate": 4.9256455182050345e-06, + "loss": 0.0985, + "step": 4164 + }, + { + "epoch": 1.6637380191693292, + "grad_norm": 1.0774781447464505, + "learning_rate": 4.920998738472657e-06, + "loss": 0.0956, + "step": 4166 + }, + { + "epoch": 1.6645367412140575, + "grad_norm": 1.1101370347081296, + "learning_rate": 4.916352026989914e-06, + "loss": 0.1085, + "step": 4168 + }, + { + "epoch": 1.6653354632587858, + "grad_norm": 1.1576145184604405, + "learning_rate": 4.911705387771131e-06, + "loss": 0.0998, + "step": 4170 + }, + { + "epoch": 1.6661341853035143, + "grad_norm": 1.0183758786516857, + "learning_rate": 4.90705882483056e-06, + "loss": 0.0856, + "step": 4172 + }, + { + "epoch": 1.6669329073482428, + "grad_norm": 1.110373708674366, + "learning_rate": 4.902412342182396e-06, + "loss": 0.0922, + "step": 4174 + }, + { + "epoch": 1.6677316293929714, + "grad_norm": 1.0395390430208176, + "learning_rate": 4.897765943840761e-06, + "loss": 0.0986, + "step": 4176 + }, + { + "epoch": 1.6685303514376997, + "grad_norm": 1.0700900909956106, + "learning_rate": 4.8931196338197045e-06, + "loss": 0.1094, + "step": 4178 + }, + { + "epoch": 1.669329073482428, + "grad_norm": 1.0744452627922665, + "learning_rate": 4.888473416133201e-06, + "loss": 0.0945, + "step": 4180 + }, + { + "epoch": 1.6701277955271565, + "grad_norm": 1.1251668177944376, + "learning_rate": 4.883827294795142e-06, + "loss": 0.0973, + "step": 4182 + }, + { + "epoch": 1.670926517571885, + "grad_norm": 0.9568523532004309, + "learning_rate": 4.87918127381934e-06, + "loss": 0.0924, + "step": 4184 + }, + { + "epoch": 1.6717252396166136, + "grad_norm": 1.0605161422920366, + "learning_rate": 4.874535357219517e-06, + "loss": 0.0969, + "step": 4186 + }, + { + "epoch": 1.6725239616613419, + "grad_norm": 1.0949585639802137, + "learning_rate": 4.869889549009309e-06, + "loss": 0.1023, + "step": 4188 + }, + { + "epoch": 1.6733226837060702, + "grad_norm": 1.0200170497583934, + "learning_rate": 4.8652438532022505e-06, + "loss": 0.0968, + "step": 4190 + }, + { + "epoch": 1.6741214057507987, + "grad_norm": 1.1240991571874712, + "learning_rate": 4.860598273811793e-06, + "loss": 0.1056, + "step": 4192 + }, + { + "epoch": 1.6749201277955272, + "grad_norm": 1.1355929086530765, + "learning_rate": 4.855952814851272e-06, + "loss": 0.096, + "step": 4194 + }, + { + "epoch": 1.6757188498402555, + "grad_norm": 0.9333875889896245, + "learning_rate": 4.851307480333929e-06, + "loss": 0.0939, + "step": 4196 + }, + { + "epoch": 1.676517571884984, + "grad_norm": 1.093435156403032, + "learning_rate": 4.846662274272893e-06, + "loss": 0.0945, + "step": 4198 + }, + { + "epoch": 1.6773162939297124, + "grad_norm": 1.0883095662785185, + "learning_rate": 4.842017200681185e-06, + "loss": 0.0916, + "step": 4200 + }, + { + "epoch": 1.678115015974441, + "grad_norm": 1.0711389939013298, + "learning_rate": 4.8373722635717095e-06, + "loss": 0.1045, + "step": 4202 + }, + { + "epoch": 1.6789137380191694, + "grad_norm": 0.9565453405933899, + "learning_rate": 4.832727466957254e-06, + "loss": 0.084, + "step": 4204 + }, + { + "epoch": 1.6797124600638977, + "grad_norm": 0.9827521143045396, + "learning_rate": 4.828082814850484e-06, + "loss": 0.0895, + "step": 4206 + }, + { + "epoch": 1.680511182108626, + "grad_norm": 1.0715525135749726, + "learning_rate": 4.823438311263943e-06, + "loss": 0.0879, + "step": 4208 + }, + { + "epoch": 1.6813099041533546, + "grad_norm": 1.1041262630985615, + "learning_rate": 4.81879396021004e-06, + "loss": 0.1067, + "step": 4210 + }, + { + "epoch": 1.682108626198083, + "grad_norm": 1.0586655240128, + "learning_rate": 4.814149765701059e-06, + "loss": 0.1022, + "step": 4212 + }, + { + "epoch": 1.6829073482428116, + "grad_norm": 1.186055020520042, + "learning_rate": 4.809505731749144e-06, + "loss": 0.1139, + "step": 4214 + }, + { + "epoch": 1.68370607028754, + "grad_norm": 1.1122466690968793, + "learning_rate": 4.804861862366306e-06, + "loss": 0.1051, + "step": 4216 + }, + { + "epoch": 1.6845047923322682, + "grad_norm": 1.0754569180945435, + "learning_rate": 4.8002181615644055e-06, + "loss": 0.0961, + "step": 4218 + }, + { + "epoch": 1.6853035143769968, + "grad_norm": 1.0878287200397418, + "learning_rate": 4.795574633355165e-06, + "loss": 0.0885, + "step": 4220 + }, + { + "epoch": 1.6861022364217253, + "grad_norm": 1.1364082104476718, + "learning_rate": 4.790931281750152e-06, + "loss": 0.1, + "step": 4222 + }, + { + "epoch": 1.6869009584664538, + "grad_norm": 1.0119111534228, + "learning_rate": 4.786288110760787e-06, + "loss": 0.09, + "step": 4224 + }, + { + "epoch": 1.6876996805111821, + "grad_norm": 1.072752741199254, + "learning_rate": 4.78164512439833e-06, + "loss": 0.1017, + "step": 4226 + }, + { + "epoch": 1.6884984025559104, + "grad_norm": 1.27263253817292, + "learning_rate": 4.777002326673884e-06, + "loss": 0.1046, + "step": 4228 + }, + { + "epoch": 1.689297124600639, + "grad_norm": 1.0255757488823853, + "learning_rate": 4.772359721598386e-06, + "loss": 0.1052, + "step": 4230 + }, + { + "epoch": 1.6900958466453675, + "grad_norm": 1.1943745213053019, + "learning_rate": 4.767717313182611e-06, + "loss": 0.0992, + "step": 4232 + }, + { + "epoch": 1.6908945686900958, + "grad_norm": 1.1636475166052027, + "learning_rate": 4.763075105437161e-06, + "loss": 0.0991, + "step": 4234 + }, + { + "epoch": 1.6916932907348243, + "grad_norm": 1.0844729963688502, + "learning_rate": 4.758433102372466e-06, + "loss": 0.1026, + "step": 4236 + }, + { + "epoch": 1.6924920127795526, + "grad_norm": 1.1114480349576357, + "learning_rate": 4.753791307998776e-06, + "loss": 0.093, + "step": 4238 + }, + { + "epoch": 1.6932907348242812, + "grad_norm": 0.9636290087080969, + "learning_rate": 4.74914972632617e-06, + "loss": 0.0865, + "step": 4240 + }, + { + "epoch": 1.6940894568690097, + "grad_norm": 1.0002038172553764, + "learning_rate": 4.744508361364529e-06, + "loss": 0.0907, + "step": 4242 + }, + { + "epoch": 1.694888178913738, + "grad_norm": 1.0275565003608014, + "learning_rate": 4.73986721712356e-06, + "loss": 0.0944, + "step": 4244 + }, + { + "epoch": 1.6956869009584663, + "grad_norm": 1.0458584389637045, + "learning_rate": 4.73522629761277e-06, + "loss": 0.0982, + "step": 4246 + }, + { + "epoch": 1.6964856230031948, + "grad_norm": 1.0999887308140721, + "learning_rate": 4.730585606841479e-06, + "loss": 0.0994, + "step": 4248 + }, + { + "epoch": 1.6972843450479234, + "grad_norm": 1.0805343621335457, + "learning_rate": 4.725945148818801e-06, + "loss": 0.0922, + "step": 4250 + }, + { + "epoch": 1.6980830670926519, + "grad_norm": 1.1009086534316133, + "learning_rate": 4.721304927553659e-06, + "loss": 0.0978, + "step": 4252 + }, + { + "epoch": 1.6988817891373802, + "grad_norm": 1.0686039798032159, + "learning_rate": 4.716664947054764e-06, + "loss": 0.0926, + "step": 4254 + }, + { + "epoch": 1.6996805111821085, + "grad_norm": 1.2264200698539793, + "learning_rate": 4.7120252113306216e-06, + "loss": 0.0997, + "step": 4256 + }, + { + "epoch": 1.700479233226837, + "grad_norm": 1.0512823258442452, + "learning_rate": 4.707385724389526e-06, + "loss": 0.1054, + "step": 4258 + }, + { + "epoch": 1.7012779552715656, + "grad_norm": 1.2065999780404362, + "learning_rate": 4.702746490239554e-06, + "loss": 0.1031, + "step": 4260 + }, + { + "epoch": 1.702076677316294, + "grad_norm": 1.190627424392971, + "learning_rate": 4.69810751288857e-06, + "loss": 0.0996, + "step": 4262 + }, + { + "epoch": 1.7028753993610224, + "grad_norm": 1.136415340105126, + "learning_rate": 4.693468796344211e-06, + "loss": 0.1046, + "step": 4264 + }, + { + "epoch": 1.7036741214057507, + "grad_norm": 1.056631284731312, + "learning_rate": 4.6888303446138895e-06, + "loss": 0.0916, + "step": 4266 + }, + { + "epoch": 1.7044728434504792, + "grad_norm": 1.0261838244170962, + "learning_rate": 4.684192161704792e-06, + "loss": 0.0876, + "step": 4268 + }, + { + "epoch": 1.7052715654952078, + "grad_norm": 0.9913120866257363, + "learning_rate": 4.679554251623869e-06, + "loss": 0.0975, + "step": 4270 + }, + { + "epoch": 1.706070287539936, + "grad_norm": 0.9909781303643188, + "learning_rate": 4.6749166183778375e-06, + "loss": 0.0899, + "step": 4272 + }, + { + "epoch": 1.7068690095846646, + "grad_norm": 1.0133453073875816, + "learning_rate": 4.670279265973177e-06, + "loss": 0.0944, + "step": 4274 + }, + { + "epoch": 1.707667731629393, + "grad_norm": 1.084241919717283, + "learning_rate": 4.665642198416119e-06, + "loss": 0.1014, + "step": 4276 + }, + { + "epoch": 1.7084664536741214, + "grad_norm": 1.1144409380935076, + "learning_rate": 4.661005419712657e-06, + "loss": 0.1041, + "step": 4278 + }, + { + "epoch": 1.70926517571885, + "grad_norm": 1.0934629080398204, + "learning_rate": 4.656368933868525e-06, + "loss": 0.0912, + "step": 4280 + }, + { + "epoch": 1.7100638977635783, + "grad_norm": 1.1463371146812387, + "learning_rate": 4.651732744889215e-06, + "loss": 0.1037, + "step": 4282 + }, + { + "epoch": 1.7108626198083066, + "grad_norm": 1.1729562022224107, + "learning_rate": 4.647096856779951e-06, + "loss": 0.0965, + "step": 4284 + }, + { + "epoch": 1.711661341853035, + "grad_norm": 1.1118256334824752, + "learning_rate": 4.642461273545707e-06, + "loss": 0.1034, + "step": 4286 + }, + { + "epoch": 1.7124600638977636, + "grad_norm": 1.132712399437204, + "learning_rate": 4.637825999191189e-06, + "loss": 0.0982, + "step": 4288 + }, + { + "epoch": 1.7132587859424921, + "grad_norm": 1.0848347886256517, + "learning_rate": 4.633191037720835e-06, + "loss": 0.1018, + "step": 4290 + }, + { + "epoch": 1.7140575079872205, + "grad_norm": 1.0658858217343385, + "learning_rate": 4.628556393138816e-06, + "loss": 0.097, + "step": 4292 + }, + { + "epoch": 1.7148562300319488, + "grad_norm": 1.0923087477574933, + "learning_rate": 4.623922069449028e-06, + "loss": 0.1016, + "step": 4294 + }, + { + "epoch": 1.7156549520766773, + "grad_norm": 1.161421995581851, + "learning_rate": 4.619288070655086e-06, + "loss": 0.0925, + "step": 4296 + }, + { + "epoch": 1.7164536741214058, + "grad_norm": 0.9470108447713622, + "learning_rate": 4.614654400760333e-06, + "loss": 0.0989, + "step": 4298 + }, + { + "epoch": 1.7172523961661343, + "grad_norm": 1.0516808788040757, + "learning_rate": 4.610021063767818e-06, + "loss": 0.1055, + "step": 4300 + }, + { + "epoch": 1.7180511182108626, + "grad_norm": 1.1453407398766986, + "learning_rate": 4.60538806368031e-06, + "loss": 0.0986, + "step": 4302 + }, + { + "epoch": 1.718849840255591, + "grad_norm": 0.9890278332037488, + "learning_rate": 4.600755404500281e-06, + "loss": 0.0956, + "step": 4304 + }, + { + "epoch": 1.7196485623003195, + "grad_norm": 1.0910031893484988, + "learning_rate": 4.596123090229913e-06, + "loss": 0.0959, + "step": 4306 + }, + { + "epoch": 1.720447284345048, + "grad_norm": 0.954845783952346, + "learning_rate": 4.59149112487109e-06, + "loss": 0.0866, + "step": 4308 + }, + { + "epoch": 1.7212460063897763, + "grad_norm": 1.0693628763645768, + "learning_rate": 4.5868595124253915e-06, + "loss": 0.0984, + "step": 4310 + }, + { + "epoch": 1.7220447284345048, + "grad_norm": 1.0048364707538442, + "learning_rate": 4.582228256894093e-06, + "loss": 0.0939, + "step": 4312 + }, + { + "epoch": 1.7228434504792332, + "grad_norm": 1.1603565075058264, + "learning_rate": 4.577597362278165e-06, + "loss": 0.1026, + "step": 4314 + }, + { + "epoch": 1.7236421725239617, + "grad_norm": 1.1312773137686531, + "learning_rate": 4.572966832578261e-06, + "loss": 0.1076, + "step": 4316 + }, + { + "epoch": 1.7244408945686902, + "grad_norm": 1.0962659717799779, + "learning_rate": 4.568336671794722e-06, + "loss": 0.0956, + "step": 4318 + }, + { + "epoch": 1.7252396166134185, + "grad_norm": 1.0220568178816403, + "learning_rate": 4.56370688392757e-06, + "loss": 0.0904, + "step": 4320 + }, + { + "epoch": 1.7260383386581468, + "grad_norm": 0.9834388109812503, + "learning_rate": 4.5590774729765076e-06, + "loss": 0.0939, + "step": 4322 + }, + { + "epoch": 1.7268370607028753, + "grad_norm": 1.2111951372748109, + "learning_rate": 4.554448442940905e-06, + "loss": 0.0968, + "step": 4324 + }, + { + "epoch": 1.7276357827476039, + "grad_norm": 1.1926580241309344, + "learning_rate": 4.549819797819809e-06, + "loss": 0.1017, + "step": 4326 + }, + { + "epoch": 1.7284345047923324, + "grad_norm": 1.0572382689425925, + "learning_rate": 4.545191541611933e-06, + "loss": 0.0871, + "step": 4328 + }, + { + "epoch": 1.7292332268370607, + "grad_norm": 1.0126727217605194, + "learning_rate": 4.540563678315652e-06, + "loss": 0.0895, + "step": 4330 + }, + { + "epoch": 1.730031948881789, + "grad_norm": 1.0365109166295632, + "learning_rate": 4.535936211929005e-06, + "loss": 0.0971, + "step": 4332 + }, + { + "epoch": 1.7308306709265175, + "grad_norm": 1.2600986522602922, + "learning_rate": 4.5313091464496865e-06, + "loss": 0.1067, + "step": 4334 + }, + { + "epoch": 1.731629392971246, + "grad_norm": 1.1349358517047685, + "learning_rate": 4.526682485875044e-06, + "loss": 0.0954, + "step": 4336 + }, + { + "epoch": 1.7324281150159746, + "grad_norm": 1.0404286721064964, + "learning_rate": 4.5220562342020755e-06, + "loss": 0.1014, + "step": 4338 + }, + { + "epoch": 1.733226837060703, + "grad_norm": 1.2958712155282752, + "learning_rate": 4.517430395427424e-06, + "loss": 0.1121, + "step": 4340 + }, + { + "epoch": 1.7340255591054312, + "grad_norm": 1.1652339074566052, + "learning_rate": 4.512804973547383e-06, + "loss": 0.0985, + "step": 4342 + }, + { + "epoch": 1.7348242811501597, + "grad_norm": 1.204403964641989, + "learning_rate": 4.508179972557875e-06, + "loss": 0.1071, + "step": 4344 + }, + { + "epoch": 1.7356230031948883, + "grad_norm": 1.0146957490804427, + "learning_rate": 4.503555396454468e-06, + "loss": 0.089, + "step": 4346 + }, + { + "epoch": 1.7364217252396166, + "grad_norm": 1.038473290463769, + "learning_rate": 4.498931249232357e-06, + "loss": 0.1022, + "step": 4348 + }, + { + "epoch": 1.7372204472843449, + "grad_norm": 1.0396114915923156, + "learning_rate": 4.49430753488637e-06, + "loss": 0.0958, + "step": 4350 + }, + { + "epoch": 1.7380191693290734, + "grad_norm": 0.9976185951916329, + "learning_rate": 4.489684257410959e-06, + "loss": 0.0872, + "step": 4352 + }, + { + "epoch": 1.738817891373802, + "grad_norm": 0.8940691455776141, + "learning_rate": 4.485061420800202e-06, + "loss": 0.0736, + "step": 4354 + }, + { + "epoch": 1.7396166134185305, + "grad_norm": 1.0562465504348986, + "learning_rate": 4.480439029047792e-06, + "loss": 0.0928, + "step": 4356 + }, + { + "epoch": 1.7404153354632588, + "grad_norm": 1.3220699498489616, + "learning_rate": 4.47581708614704e-06, + "loss": 0.1032, + "step": 4358 + }, + { + "epoch": 1.741214057507987, + "grad_norm": 1.044321957199521, + "learning_rate": 4.471195596090867e-06, + "loss": 0.0893, + "step": 4360 + }, + { + "epoch": 1.7420127795527156, + "grad_norm": 1.1907505200191142, + "learning_rate": 4.466574562871807e-06, + "loss": 0.0955, + "step": 4362 + }, + { + "epoch": 1.7428115015974441, + "grad_norm": 1.2092564945972575, + "learning_rate": 4.461953990481995e-06, + "loss": 0.1037, + "step": 4364 + }, + { + "epoch": 1.7436102236421727, + "grad_norm": 1.0977545829393272, + "learning_rate": 4.45733388291317e-06, + "loss": 0.0854, + "step": 4366 + }, + { + "epoch": 1.744408945686901, + "grad_norm": 1.0941242092439605, + "learning_rate": 4.452714244156667e-06, + "loss": 0.1029, + "step": 4368 + }, + { + "epoch": 1.7452076677316293, + "grad_norm": 1.1568260987190029, + "learning_rate": 4.448095078203421e-06, + "loss": 0.0954, + "step": 4370 + }, + { + "epoch": 1.7460063897763578, + "grad_norm": 1.021261678557611, + "learning_rate": 4.443476389043955e-06, + "loss": 0.1055, + "step": 4372 + }, + { + "epoch": 1.7468051118210863, + "grad_norm": 1.1554032258458062, + "learning_rate": 4.438858180668379e-06, + "loss": 0.0907, + "step": 4374 + }, + { + "epoch": 1.7476038338658149, + "grad_norm": 0.9759468899388195, + "learning_rate": 4.434240457066388e-06, + "loss": 0.0893, + "step": 4376 + }, + { + "epoch": 1.7484025559105432, + "grad_norm": 1.1342860014921987, + "learning_rate": 4.429623222227265e-06, + "loss": 0.0965, + "step": 4378 + }, + { + "epoch": 1.7492012779552715, + "grad_norm": 1.0507089455768055, + "learning_rate": 4.425006480139861e-06, + "loss": 0.0925, + "step": 4380 + }, + { + "epoch": 1.75, + "grad_norm": 1.044726162269031, + "learning_rate": 4.420390234792608e-06, + "loss": 0.1003, + "step": 4382 + }, + { + "epoch": 1.7507987220447285, + "grad_norm": 1.1039342031586883, + "learning_rate": 4.415774490173504e-06, + "loss": 0.0893, + "step": 4384 + }, + { + "epoch": 1.7515974440894568, + "grad_norm": 1.075578365679244, + "learning_rate": 4.411159250270119e-06, + "loss": 0.0948, + "step": 4386 + }, + { + "epoch": 1.7523961661341851, + "grad_norm": 1.0755507149516828, + "learning_rate": 4.406544519069582e-06, + "loss": 0.0943, + "step": 4388 + }, + { + "epoch": 1.7531948881789137, + "grad_norm": 1.2422536273434488, + "learning_rate": 4.401930300558588e-06, + "loss": 0.1042, + "step": 4390 + }, + { + "epoch": 1.7539936102236422, + "grad_norm": 1.125409271455376, + "learning_rate": 4.397316598723385e-06, + "loss": 0.1008, + "step": 4392 + }, + { + "epoch": 1.7547923322683707, + "grad_norm": 0.9912325306555418, + "learning_rate": 4.392703417549777e-06, + "loss": 0.0893, + "step": 4394 + }, + { + "epoch": 1.755591054313099, + "grad_norm": 1.1298243639433942, + "learning_rate": 4.388090761023118e-06, + "loss": 0.0968, + "step": 4396 + }, + { + "epoch": 1.7563897763578273, + "grad_norm": 1.03503405875718, + "learning_rate": 4.3834786331283055e-06, + "loss": 0.0925, + "step": 4398 + }, + { + "epoch": 1.7571884984025559, + "grad_norm": 1.1423391405624483, + "learning_rate": 4.3788670378497836e-06, + "loss": 0.0989, + "step": 4400 + }, + { + "epoch": 1.7579872204472844, + "grad_norm": 1.0654118903753726, + "learning_rate": 4.374255979171538e-06, + "loss": 0.0993, + "step": 4402 + }, + { + "epoch": 1.758785942492013, + "grad_norm": 1.1141658088401454, + "learning_rate": 4.369645461077085e-06, + "loss": 0.1005, + "step": 4404 + }, + { + "epoch": 1.7595846645367412, + "grad_norm": 1.0554546089720755, + "learning_rate": 4.365035487549481e-06, + "loss": 0.0894, + "step": 4406 + }, + { + "epoch": 1.7603833865814695, + "grad_norm": 1.0768929412416388, + "learning_rate": 4.360426062571303e-06, + "loss": 0.0984, + "step": 4408 + }, + { + "epoch": 1.761182108626198, + "grad_norm": 1.0579464025983196, + "learning_rate": 4.3558171901246635e-06, + "loss": 0.0989, + "step": 4410 + }, + { + "epoch": 1.7619808306709266, + "grad_norm": 1.0630018800627314, + "learning_rate": 4.351208874191192e-06, + "loss": 0.1076, + "step": 4412 + }, + { + "epoch": 1.7627795527156551, + "grad_norm": 1.0546218890926364, + "learning_rate": 4.346601118752039e-06, + "loss": 0.0952, + "step": 4414 + }, + { + "epoch": 1.7635782747603834, + "grad_norm": 0.9956147311105759, + "learning_rate": 4.341993927787871e-06, + "loss": 0.0947, + "step": 4416 + }, + { + "epoch": 1.7643769968051117, + "grad_norm": 1.0747196506285701, + "learning_rate": 4.337387305278864e-06, + "loss": 0.0928, + "step": 4418 + }, + { + "epoch": 1.7651757188498403, + "grad_norm": 1.121082140094148, + "learning_rate": 4.332781255204708e-06, + "loss": 0.0919, + "step": 4420 + }, + { + "epoch": 1.7659744408945688, + "grad_norm": 1.1615527137887067, + "learning_rate": 4.328175781544593e-06, + "loss": 0.108, + "step": 4422 + }, + { + "epoch": 1.766773162939297, + "grad_norm": 0.9564313983946037, + "learning_rate": 4.323570888277215e-06, + "loss": 0.0932, + "step": 4424 + }, + { + "epoch": 1.7675718849840254, + "grad_norm": 1.0463601569604801, + "learning_rate": 4.318966579380768e-06, + "loss": 0.0895, + "step": 4426 + }, + { + "epoch": 1.768370607028754, + "grad_norm": 1.163800342765404, + "learning_rate": 4.31436285883294e-06, + "loss": 0.1033, + "step": 4428 + }, + { + "epoch": 1.7691693290734825, + "grad_norm": 1.039700123732381, + "learning_rate": 4.3097597306109115e-06, + "loss": 0.0958, + "step": 4430 + }, + { + "epoch": 1.769968051118211, + "grad_norm": 1.1932778000690207, + "learning_rate": 4.305157198691351e-06, + "loss": 0.0972, + "step": 4432 + }, + { + "epoch": 1.7707667731629393, + "grad_norm": 1.0406189324731365, + "learning_rate": 4.30055526705041e-06, + "loss": 0.0978, + "step": 4434 + }, + { + "epoch": 1.7715654952076676, + "grad_norm": 1.1896973942703604, + "learning_rate": 4.2959539396637265e-06, + "loss": 0.1038, + "step": 4436 + }, + { + "epoch": 1.7723642172523961, + "grad_norm": 1.1009904452623798, + "learning_rate": 4.291353220506409e-06, + "loss": 0.0933, + "step": 4438 + }, + { + "epoch": 1.7731629392971247, + "grad_norm": 1.0740791345413632, + "learning_rate": 4.286753113553049e-06, + "loss": 0.0901, + "step": 4440 + }, + { + "epoch": 1.7739616613418532, + "grad_norm": 1.0663374997120347, + "learning_rate": 4.2821536227777016e-06, + "loss": 0.0899, + "step": 4442 + }, + { + "epoch": 1.7747603833865815, + "grad_norm": 0.9560869355538776, + "learning_rate": 4.277554752153895e-06, + "loss": 0.0862, + "step": 4444 + }, + { + "epoch": 1.7755591054313098, + "grad_norm": 1.1931319093376327, + "learning_rate": 4.272956505654616e-06, + "loss": 0.1013, + "step": 4446 + }, + { + "epoch": 1.7763578274760383, + "grad_norm": 1.1029308559048692, + "learning_rate": 4.268358887252322e-06, + "loss": 0.0971, + "step": 4448 + }, + { + "epoch": 1.7771565495207668, + "grad_norm": 1.0214787393703604, + "learning_rate": 4.263761900918916e-06, + "loss": 0.0896, + "step": 4450 + }, + { + "epoch": 1.7779552715654952, + "grad_norm": 1.1538006513738512, + "learning_rate": 4.259165550625765e-06, + "loss": 0.1008, + "step": 4452 + }, + { + "epoch": 1.7787539936102237, + "grad_norm": 1.1069143300956876, + "learning_rate": 4.254569840343677e-06, + "loss": 0.0904, + "step": 4454 + }, + { + "epoch": 1.779552715654952, + "grad_norm": 1.2455527249765532, + "learning_rate": 4.249974774042915e-06, + "loss": 0.109, + "step": 4456 + }, + { + "epoch": 1.7803514376996805, + "grad_norm": 1.0281866104050703, + "learning_rate": 4.245380355693183e-06, + "loss": 0.0902, + "step": 4458 + }, + { + "epoch": 1.781150159744409, + "grad_norm": 1.0141761099764164, + "learning_rate": 4.240786589263623e-06, + "loss": 0.0941, + "step": 4460 + }, + { + "epoch": 1.7819488817891374, + "grad_norm": 1.0607373591565843, + "learning_rate": 4.236193478722818e-06, + "loss": 0.0991, + "step": 4462 + }, + { + "epoch": 1.7827476038338657, + "grad_norm": 0.9907340835532562, + "learning_rate": 4.231601028038781e-06, + "loss": 0.0797, + "step": 4464 + }, + { + "epoch": 1.7835463258785942, + "grad_norm": 1.112591753336717, + "learning_rate": 4.2270092411789555e-06, + "loss": 0.0888, + "step": 4466 + }, + { + "epoch": 1.7843450479233227, + "grad_norm": 1.2313881450954993, + "learning_rate": 4.222418122110212e-06, + "loss": 0.1055, + "step": 4468 + }, + { + "epoch": 1.7851437699680512, + "grad_norm": 1.0862948713868201, + "learning_rate": 4.217827674798845e-06, + "loss": 0.098, + "step": 4470 + }, + { + "epoch": 1.7859424920127795, + "grad_norm": 1.060155034044239, + "learning_rate": 4.2132379032105695e-06, + "loss": 0.0996, + "step": 4472 + }, + { + "epoch": 1.7867412140575079, + "grad_norm": 1.1021893535744396, + "learning_rate": 4.208648811310513e-06, + "loss": 0.1053, + "step": 4474 + }, + { + "epoch": 1.7875399361022364, + "grad_norm": 1.188590840385766, + "learning_rate": 4.204060403063218e-06, + "loss": 0.1067, + "step": 4476 + }, + { + "epoch": 1.788338658146965, + "grad_norm": 1.1601988700851704, + "learning_rate": 4.199472682432637e-06, + "loss": 0.1025, + "step": 4478 + }, + { + "epoch": 1.7891373801916934, + "grad_norm": 1.2008156142165938, + "learning_rate": 4.194885653382128e-06, + "loss": 0.1031, + "step": 4480 + }, + { + "epoch": 1.7899361022364217, + "grad_norm": 1.1590922201903628, + "learning_rate": 4.190299319874449e-06, + "loss": 0.0935, + "step": 4482 + }, + { + "epoch": 1.79073482428115, + "grad_norm": 1.1080943430270318, + "learning_rate": 4.185713685871763e-06, + "loss": 0.1121, + "step": 4484 + }, + { + "epoch": 1.7915335463258786, + "grad_norm": 1.0680082395002684, + "learning_rate": 4.1811287553356214e-06, + "loss": 0.0997, + "step": 4486 + }, + { + "epoch": 1.792332268370607, + "grad_norm": 1.169219691370761, + "learning_rate": 4.176544532226974e-06, + "loss": 0.106, + "step": 4488 + }, + { + "epoch": 1.7931309904153354, + "grad_norm": 1.0745747283017117, + "learning_rate": 4.171961020506158e-06, + "loss": 0.0981, + "step": 4490 + }, + { + "epoch": 1.793929712460064, + "grad_norm": 1.3455597906264747, + "learning_rate": 4.167378224132891e-06, + "loss": 0.0982, + "step": 4492 + }, + { + "epoch": 1.7947284345047922, + "grad_norm": 1.1972510456665748, + "learning_rate": 4.162796147066279e-06, + "loss": 0.0978, + "step": 4494 + }, + { + "epoch": 1.7955271565495208, + "grad_norm": 1.1197171631008844, + "learning_rate": 4.158214793264808e-06, + "loss": 0.0837, + "step": 4496 + }, + { + "epoch": 1.7963258785942493, + "grad_norm": 0.9880917577387588, + "learning_rate": 4.15363416668633e-06, + "loss": 0.0884, + "step": 4498 + }, + { + "epoch": 1.7971246006389776, + "grad_norm": 1.080269566035693, + "learning_rate": 4.149054271288076e-06, + "loss": 0.102, + "step": 4500 + }, + { + "epoch": 1.7971246006389776, + "eval_loss": 0.14962869882583618, + "eval_runtime": 418.7656, + "eval_samples_per_second": 42.523, + "eval_steps_per_second": 5.316, + "step": 4500 + }, + { + "epoch": 1.797923322683706, + "grad_norm": 1.067144176619292, + "learning_rate": 4.144475111026643e-06, + "loss": 0.104, + "step": 4502 + }, + { + "epoch": 1.7987220447284344, + "grad_norm": 1.060342189299496, + "learning_rate": 4.139896689857995e-06, + "loss": 0.0939, + "step": 4504 + }, + { + "epoch": 1.799520766773163, + "grad_norm": 1.059142250496446, + "learning_rate": 4.1353190117374545e-06, + "loss": 0.0987, + "step": 4506 + }, + { + "epoch": 1.8003194888178915, + "grad_norm": 1.0930753497438312, + "learning_rate": 4.130742080619704e-06, + "loss": 0.0907, + "step": 4508 + }, + { + "epoch": 1.8011182108626198, + "grad_norm": 1.0270924676421964, + "learning_rate": 4.126165900458781e-06, + "loss": 0.09, + "step": 4510 + }, + { + "epoch": 1.8019169329073481, + "grad_norm": 1.1110244428747031, + "learning_rate": 4.121590475208071e-06, + "loss": 0.1013, + "step": 4512 + }, + { + "epoch": 1.8027156549520766, + "grad_norm": 1.0236243613153262, + "learning_rate": 4.11701580882031e-06, + "loss": 0.0976, + "step": 4514 + }, + { + "epoch": 1.8035143769968052, + "grad_norm": 1.06524813010489, + "learning_rate": 4.1124419052475815e-06, + "loss": 0.1, + "step": 4516 + }, + { + "epoch": 1.8043130990415337, + "grad_norm": 1.1330875679173593, + "learning_rate": 4.107868768441304e-06, + "loss": 0.1166, + "step": 4518 + }, + { + "epoch": 1.805111821086262, + "grad_norm": 1.4531489997318303, + "learning_rate": 4.1032964023522366e-06, + "loss": 0.0858, + "step": 4520 + }, + { + "epoch": 1.8059105431309903, + "grad_norm": 1.1107724952996303, + "learning_rate": 4.098724810930472e-06, + "loss": 0.1035, + "step": 4522 + }, + { + "epoch": 1.8067092651757188, + "grad_norm": 1.0926573298123923, + "learning_rate": 4.0941539981254345e-06, + "loss": 0.1026, + "step": 4524 + }, + { + "epoch": 1.8075079872204474, + "grad_norm": 0.9721623108703124, + "learning_rate": 4.089583967885874e-06, + "loss": 0.1025, + "step": 4526 + }, + { + "epoch": 1.8083067092651757, + "grad_norm": 1.0370754238710584, + "learning_rate": 4.085014724159866e-06, + "loss": 0.0942, + "step": 4528 + }, + { + "epoch": 1.8091054313099042, + "grad_norm": 1.0940664318814297, + "learning_rate": 4.0804462708948076e-06, + "loss": 0.0988, + "step": 4530 + }, + { + "epoch": 1.8099041533546325, + "grad_norm": 1.0244857172089716, + "learning_rate": 4.075878612037408e-06, + "loss": 0.089, + "step": 4532 + }, + { + "epoch": 1.810702875399361, + "grad_norm": 1.0706540589086802, + "learning_rate": 4.071311751533696e-06, + "loss": 0.0996, + "step": 4534 + }, + { + "epoch": 1.8115015974440896, + "grad_norm": 1.1352354117693362, + "learning_rate": 4.066745693329008e-06, + "loss": 0.0924, + "step": 4536 + }, + { + "epoch": 1.8123003194888179, + "grad_norm": 1.0659040735499372, + "learning_rate": 4.062180441367985e-06, + "loss": 0.0939, + "step": 4538 + }, + { + "epoch": 1.8130990415335462, + "grad_norm": 1.1129307554195655, + "learning_rate": 4.057615999594578e-06, + "loss": 0.1015, + "step": 4540 + }, + { + "epoch": 1.8138977635782747, + "grad_norm": 1.0090716344766517, + "learning_rate": 4.053052371952032e-06, + "loss": 0.1119, + "step": 4542 + }, + { + "epoch": 1.8146964856230032, + "grad_norm": 1.1454677517831415, + "learning_rate": 4.0484895623828906e-06, + "loss": 0.0984, + "step": 4544 + }, + { + "epoch": 1.8154952076677318, + "grad_norm": 1.1153321341980642, + "learning_rate": 4.04392757482899e-06, + "loss": 0.0977, + "step": 4546 + }, + { + "epoch": 1.81629392971246, + "grad_norm": 1.0654841000292614, + "learning_rate": 4.039366413231458e-06, + "loss": 0.0746, + "step": 4548 + }, + { + "epoch": 1.8170926517571884, + "grad_norm": 1.0311164679367486, + "learning_rate": 4.034806081530709e-06, + "loss": 0.0932, + "step": 4550 + }, + { + "epoch": 1.817891373801917, + "grad_norm": 1.044082427416449, + "learning_rate": 4.030246583666437e-06, + "loss": 0.0967, + "step": 4552 + }, + { + "epoch": 1.8186900958466454, + "grad_norm": 1.1455338692396, + "learning_rate": 4.0256879235776195e-06, + "loss": 0.097, + "step": 4554 + }, + { + "epoch": 1.819488817891374, + "grad_norm": 1.1193697932818554, + "learning_rate": 4.0211301052025075e-06, + "loss": 0.1049, + "step": 4556 + }, + { + "epoch": 1.8202875399361023, + "grad_norm": 0.9888093223647104, + "learning_rate": 4.016573132478628e-06, + "loss": 0.0877, + "step": 4558 + }, + { + "epoch": 1.8210862619808306, + "grad_norm": 0.9046844324915236, + "learning_rate": 4.012017009342773e-06, + "loss": 0.0953, + "step": 4560 + }, + { + "epoch": 1.821884984025559, + "grad_norm": 0.8970891044543646, + "learning_rate": 4.007461739731003e-06, + "loss": 0.0782, + "step": 4562 + }, + { + "epoch": 1.8226837060702876, + "grad_norm": 1.0504566189159226, + "learning_rate": 4.002907327578644e-06, + "loss": 0.096, + "step": 4564 + }, + { + "epoch": 1.823482428115016, + "grad_norm": 1.0396603169543621, + "learning_rate": 3.9983537768202775e-06, + "loss": 0.0966, + "step": 4566 + }, + { + "epoch": 1.8242811501597445, + "grad_norm": 1.0690123100147384, + "learning_rate": 3.99380109138974e-06, + "loss": 0.0781, + "step": 4568 + }, + { + "epoch": 1.8250798722044728, + "grad_norm": 1.0129487553480776, + "learning_rate": 3.989249275220124e-06, + "loss": 0.0884, + "step": 4570 + }, + { + "epoch": 1.8258785942492013, + "grad_norm": 1.2215632281871855, + "learning_rate": 3.984698332243767e-06, + "loss": 0.1092, + "step": 4572 + }, + { + "epoch": 1.8266773162939298, + "grad_norm": 1.0991702997365314, + "learning_rate": 3.980148266392257e-06, + "loss": 0.0998, + "step": 4574 + }, + { + "epoch": 1.8274760383386581, + "grad_norm": 1.0590928610197108, + "learning_rate": 3.97559908159642e-06, + "loss": 0.0884, + "step": 4576 + }, + { + "epoch": 1.8282747603833864, + "grad_norm": 1.1608407892475257, + "learning_rate": 3.971050781786323e-06, + "loss": 0.0974, + "step": 4578 + }, + { + "epoch": 1.829073482428115, + "grad_norm": 1.0916397944105056, + "learning_rate": 3.966503370891266e-06, + "loss": 0.0906, + "step": 4580 + }, + { + "epoch": 1.8298722044728435, + "grad_norm": 1.0395162456922145, + "learning_rate": 3.961956852839787e-06, + "loss": 0.0923, + "step": 4582 + }, + { + "epoch": 1.830670926517572, + "grad_norm": 1.068914817127223, + "learning_rate": 3.9574112315596425e-06, + "loss": 0.1025, + "step": 4584 + }, + { + "epoch": 1.8314696485623003, + "grad_norm": 1.0785020160593977, + "learning_rate": 3.952866510977827e-06, + "loss": 0.0863, + "step": 4586 + }, + { + "epoch": 1.8322683706070286, + "grad_norm": 1.1371133744493924, + "learning_rate": 3.948322695020546e-06, + "loss": 0.0911, + "step": 4588 + }, + { + "epoch": 1.8330670926517572, + "grad_norm": 0.9829285073485866, + "learning_rate": 3.943779787613231e-06, + "loss": 0.0857, + "step": 4590 + }, + { + "epoch": 1.8338658146964857, + "grad_norm": 1.5286458187672483, + "learning_rate": 3.9392377926805226e-06, + "loss": 0.0919, + "step": 4592 + }, + { + "epoch": 1.8346645367412142, + "grad_norm": 1.0894667065959676, + "learning_rate": 3.934696714146277e-06, + "loss": 0.0965, + "step": 4594 + }, + { + "epoch": 1.8354632587859425, + "grad_norm": 1.147015209954055, + "learning_rate": 3.930156555933557e-06, + "loss": 0.0959, + "step": 4596 + }, + { + "epoch": 1.8362619808306708, + "grad_norm": 1.0015372010863606, + "learning_rate": 3.925617321964632e-06, + "loss": 0.0912, + "step": 4598 + }, + { + "epoch": 1.8370607028753994, + "grad_norm": 1.0413881872588078, + "learning_rate": 3.92107901616097e-06, + "loss": 0.1001, + "step": 4600 + }, + { + "epoch": 1.8378594249201279, + "grad_norm": 1.1162230686005519, + "learning_rate": 3.916541642443242e-06, + "loss": 0.0963, + "step": 4602 + }, + { + "epoch": 1.8386581469648562, + "grad_norm": 1.0840497131856408, + "learning_rate": 3.912005204731307e-06, + "loss": 0.1011, + "step": 4604 + }, + { + "epoch": 1.8394568690095847, + "grad_norm": 1.0114188708406382, + "learning_rate": 3.907469706944222e-06, + "loss": 0.0894, + "step": 4606 + }, + { + "epoch": 1.840255591054313, + "grad_norm": 1.0694376480162955, + "learning_rate": 3.9029351530002264e-06, + "loss": 0.086, + "step": 4608 + }, + { + "epoch": 1.8410543130990416, + "grad_norm": 1.3957274955440564, + "learning_rate": 3.898401546816752e-06, + "loss": 0.0927, + "step": 4610 + }, + { + "epoch": 1.84185303514377, + "grad_norm": 0.9916548350535773, + "learning_rate": 3.8938688923104015e-06, + "loss": 0.0877, + "step": 4612 + }, + { + "epoch": 1.8426517571884984, + "grad_norm": 1.0958198194521465, + "learning_rate": 3.8893371933969644e-06, + "loss": 0.0921, + "step": 4614 + }, + { + "epoch": 1.8434504792332267, + "grad_norm": 0.9890953898234616, + "learning_rate": 3.884806453991399e-06, + "loss": 0.0898, + "step": 4616 + }, + { + "epoch": 1.8442492012779552, + "grad_norm": 1.099760617918534, + "learning_rate": 3.880276678007838e-06, + "loss": 0.0967, + "step": 4618 + }, + { + "epoch": 1.8450479233226837, + "grad_norm": 1.1341946755721188, + "learning_rate": 3.875747869359578e-06, + "loss": 0.0911, + "step": 4620 + }, + { + "epoch": 1.8458466453674123, + "grad_norm": 1.0924073653891035, + "learning_rate": 3.871220031959085e-06, + "loss": 0.0991, + "step": 4622 + }, + { + "epoch": 1.8466453674121406, + "grad_norm": 1.1994386671625121, + "learning_rate": 3.866693169717982e-06, + "loss": 0.1012, + "step": 4624 + }, + { + "epoch": 1.8474440894568689, + "grad_norm": 1.1087200452695056, + "learning_rate": 3.8621672865470505e-06, + "loss": 0.0901, + "step": 4626 + }, + { + "epoch": 1.8482428115015974, + "grad_norm": 1.1622238598835266, + "learning_rate": 3.8576423863562285e-06, + "loss": 0.0949, + "step": 4628 + }, + { + "epoch": 1.849041533546326, + "grad_norm": 1.031896254949219, + "learning_rate": 3.8531184730546e-06, + "loss": 0.0986, + "step": 4630 + }, + { + "epoch": 1.8498402555910545, + "grad_norm": 1.1267405061140343, + "learning_rate": 3.848595550550401e-06, + "loss": 0.0963, + "step": 4632 + }, + { + "epoch": 1.8506389776357828, + "grad_norm": 1.091392917402568, + "learning_rate": 3.84407362275101e-06, + "loss": 0.0952, + "step": 4634 + }, + { + "epoch": 1.851437699680511, + "grad_norm": 1.07201414860687, + "learning_rate": 3.839552693562946e-06, + "loss": 0.1002, + "step": 4636 + }, + { + "epoch": 1.8522364217252396, + "grad_norm": 1.0286376655218412, + "learning_rate": 3.835032766891865e-06, + "loss": 0.0948, + "step": 4638 + }, + { + "epoch": 1.8530351437699681, + "grad_norm": 1.0789695524482392, + "learning_rate": 3.830513846642556e-06, + "loss": 0.0967, + "step": 4640 + }, + { + "epoch": 1.8538338658146964, + "grad_norm": 1.1844906017376726, + "learning_rate": 3.825995936718942e-06, + "loss": 0.0951, + "step": 4642 + }, + { + "epoch": 1.854632587859425, + "grad_norm": 0.9957944115590945, + "learning_rate": 3.821479041024069e-06, + "loss": 0.0902, + "step": 4644 + }, + { + "epoch": 1.8554313099041533, + "grad_norm": 1.0579821850330315, + "learning_rate": 3.816963163460109e-06, + "loss": 0.1, + "step": 4646 + }, + { + "epoch": 1.8562300319488818, + "grad_norm": 1.080547600595392, + "learning_rate": 3.8124483079283546e-06, + "loss": 0.1039, + "step": 4648 + }, + { + "epoch": 1.8570287539936103, + "grad_norm": 1.0913158139520798, + "learning_rate": 3.8079344783292145e-06, + "loss": 0.1021, + "step": 4650 + }, + { + "epoch": 1.8578274760383386, + "grad_norm": 1.1039317269281381, + "learning_rate": 3.803421678562213e-06, + "loss": 0.096, + "step": 4652 + }, + { + "epoch": 1.858626198083067, + "grad_norm": 1.0837414701686472, + "learning_rate": 3.79890991252598e-06, + "loss": 0.0931, + "step": 4654 + }, + { + "epoch": 1.8594249201277955, + "grad_norm": 0.9817886186922399, + "learning_rate": 3.7943991841182586e-06, + "loss": 0.0918, + "step": 4656 + }, + { + "epoch": 1.860223642172524, + "grad_norm": 1.2753358547680838, + "learning_rate": 3.7898894972358934e-06, + "loss": 0.0879, + "step": 4658 + }, + { + "epoch": 1.8610223642172525, + "grad_norm": 1.0132549906162278, + "learning_rate": 3.7853808557748263e-06, + "loss": 0.0982, + "step": 4660 + }, + { + "epoch": 1.8618210862619808, + "grad_norm": 1.0604493491043667, + "learning_rate": 3.7808732636300987e-06, + "loss": 0.0909, + "step": 4662 + }, + { + "epoch": 1.8626198083067091, + "grad_norm": 1.0110270014624545, + "learning_rate": 3.7763667246958447e-06, + "loss": 0.0913, + "step": 4664 + }, + { + "epoch": 1.8634185303514377, + "grad_norm": 1.1816528766134167, + "learning_rate": 3.771861242865288e-06, + "loss": 0.0987, + "step": 4666 + }, + { + "epoch": 1.8642172523961662, + "grad_norm": 1.1276543836075537, + "learning_rate": 3.767356822030742e-06, + "loss": 0.1006, + "step": 4668 + }, + { + "epoch": 1.8650159744408947, + "grad_norm": 1.0029943146012643, + "learning_rate": 3.7628534660835996e-06, + "loss": 0.0925, + "step": 4670 + }, + { + "epoch": 1.865814696485623, + "grad_norm": 1.1636739447782507, + "learning_rate": 3.758351178914336e-06, + "loss": 0.0946, + "step": 4672 + }, + { + "epoch": 1.8666134185303513, + "grad_norm": 1.0912114279896796, + "learning_rate": 3.753849964412502e-06, + "loss": 0.09, + "step": 4674 + }, + { + "epoch": 1.8674121405750799, + "grad_norm": 1.2020359973073858, + "learning_rate": 3.749349826466724e-06, + "loss": 0.1037, + "step": 4676 + }, + { + "epoch": 1.8682108626198084, + "grad_norm": 0.9661948745934541, + "learning_rate": 3.744850768964692e-06, + "loss": 0.0862, + "step": 4678 + }, + { + "epoch": 1.8690095846645367, + "grad_norm": 1.0588016583262083, + "learning_rate": 3.7403527957931716e-06, + "loss": 0.0929, + "step": 4680 + }, + { + "epoch": 1.869808306709265, + "grad_norm": 1.095002175972765, + "learning_rate": 3.7358559108379867e-06, + "loss": 0.0982, + "step": 4682 + }, + { + "epoch": 1.8706070287539935, + "grad_norm": 1.0767223792716407, + "learning_rate": 3.731360117984022e-06, + "loss": 0.0911, + "step": 4684 + }, + { + "epoch": 1.871405750798722, + "grad_norm": 1.0769547783245055, + "learning_rate": 3.7268654211152156e-06, + "loss": 0.0931, + "step": 4686 + }, + { + "epoch": 1.8722044728434506, + "grad_norm": 1.1230982099429845, + "learning_rate": 3.7223718241145646e-06, + "loss": 0.0946, + "step": 4688 + }, + { + "epoch": 1.873003194888179, + "grad_norm": 1.0232620319916697, + "learning_rate": 3.71787933086411e-06, + "loss": 0.0863, + "step": 4690 + }, + { + "epoch": 1.8738019169329072, + "grad_norm": 1.1155033054935346, + "learning_rate": 3.713387945244945e-06, + "loss": 0.1052, + "step": 4692 + }, + { + "epoch": 1.8746006389776357, + "grad_norm": 1.086463915995651, + "learning_rate": 3.7088976711372006e-06, + "loss": 0.0894, + "step": 4694 + }, + { + "epoch": 1.8753993610223643, + "grad_norm": 1.1158786507370728, + "learning_rate": 3.7044085124200517e-06, + "loss": 0.0897, + "step": 4696 + }, + { + "epoch": 1.8761980830670928, + "grad_norm": 1.1044666274555124, + "learning_rate": 3.6999204729717057e-06, + "loss": 0.0935, + "step": 4698 + }, + { + "epoch": 1.876996805111821, + "grad_norm": 1.0364347122571194, + "learning_rate": 3.695433556669406e-06, + "loss": 0.0881, + "step": 4700 + }, + { + "epoch": 1.8777955271565494, + "grad_norm": 1.179238289944297, + "learning_rate": 3.690947767389426e-06, + "loss": 0.0903, + "step": 4702 + }, + { + "epoch": 1.878594249201278, + "grad_norm": 1.2284443916952599, + "learning_rate": 3.6864631090070656e-06, + "loss": 0.1027, + "step": 4704 + }, + { + "epoch": 1.8793929712460065, + "grad_norm": 1.1165613439165785, + "learning_rate": 3.6819795853966435e-06, + "loss": 0.0908, + "step": 4706 + }, + { + "epoch": 1.880191693290735, + "grad_norm": 1.1221181152137276, + "learning_rate": 3.6774972004315035e-06, + "loss": 0.0994, + "step": 4708 + }, + { + "epoch": 1.8809904153354633, + "grad_norm": 1.120777637697723, + "learning_rate": 3.6730159579840007e-06, + "loss": 0.1016, + "step": 4710 + }, + { + "epoch": 1.8817891373801916, + "grad_norm": 1.0499328599769773, + "learning_rate": 3.668535861925509e-06, + "loss": 0.1007, + "step": 4712 + }, + { + "epoch": 1.8825878594249201, + "grad_norm": 1.0952348015619684, + "learning_rate": 3.6640569161264055e-06, + "loss": 0.1008, + "step": 4714 + }, + { + "epoch": 1.8833865814696487, + "grad_norm": 1.0556886162538723, + "learning_rate": 3.6595791244560795e-06, + "loss": 0.0909, + "step": 4716 + }, + { + "epoch": 1.884185303514377, + "grad_norm": 1.0779044498229438, + "learning_rate": 3.655102490782918e-06, + "loss": 0.1072, + "step": 4718 + }, + { + "epoch": 1.8849840255591053, + "grad_norm": 1.0237827262029766, + "learning_rate": 3.650627018974312e-06, + "loss": 0.0964, + "step": 4720 + }, + { + "epoch": 1.8857827476038338, + "grad_norm": 0.9712789192282018, + "learning_rate": 3.6461527128966457e-06, + "loss": 0.0942, + "step": 4722 + }, + { + "epoch": 1.8865814696485623, + "grad_norm": 1.1258507713014763, + "learning_rate": 3.6416795764152967e-06, + "loss": 0.104, + "step": 4724 + }, + { + "epoch": 1.8873801916932909, + "grad_norm": 0.9821860855241218, + "learning_rate": 3.6372076133946353e-06, + "loss": 0.087, + "step": 4726 + }, + { + "epoch": 1.8881789137380192, + "grad_norm": 1.0700005208247971, + "learning_rate": 3.632736827698015e-06, + "loss": 0.0883, + "step": 4728 + }, + { + "epoch": 1.8889776357827475, + "grad_norm": 0.9731935760176809, + "learning_rate": 3.6282672231877714e-06, + "loss": 0.0883, + "step": 4730 + }, + { + "epoch": 1.889776357827476, + "grad_norm": 1.045015911757986, + "learning_rate": 3.623798803725223e-06, + "loss": 0.097, + "step": 4732 + }, + { + "epoch": 1.8905750798722045, + "grad_norm": 1.218717767375124, + "learning_rate": 3.619331573170661e-06, + "loss": 0.0921, + "step": 4734 + }, + { + "epoch": 1.891373801916933, + "grad_norm": 1.058626369915082, + "learning_rate": 3.6148655353833518e-06, + "loss": 0.0856, + "step": 4736 + }, + { + "epoch": 1.8921725239616614, + "grad_norm": 1.0773766259028001, + "learning_rate": 3.6104006942215296e-06, + "loss": 0.0909, + "step": 4738 + }, + { + "epoch": 1.8929712460063897, + "grad_norm": 1.0619367090404177, + "learning_rate": 3.605937053542398e-06, + "loss": 0.0854, + "step": 4740 + }, + { + "epoch": 1.8937699680511182, + "grad_norm": 1.0878885631245991, + "learning_rate": 3.6014746172021197e-06, + "loss": 0.096, + "step": 4742 + }, + { + "epoch": 1.8945686900958467, + "grad_norm": 1.0235847877959412, + "learning_rate": 3.5970133890558184e-06, + "loss": 0.0826, + "step": 4744 + }, + { + "epoch": 1.895367412140575, + "grad_norm": 1.1515886076002215, + "learning_rate": 3.5925533729575745e-06, + "loss": 0.0891, + "step": 4746 + }, + { + "epoch": 1.8961661341853036, + "grad_norm": 1.0787838461269919, + "learning_rate": 3.588094572760423e-06, + "loss": 0.0947, + "step": 4748 + }, + { + "epoch": 1.8969648562300319, + "grad_norm": 1.1230844413759828, + "learning_rate": 3.583636992316345e-06, + "loss": 0.0902, + "step": 4750 + }, + { + "epoch": 1.8977635782747604, + "grad_norm": 1.0936152470611447, + "learning_rate": 3.5791806354762702e-06, + "loss": 0.0998, + "step": 4752 + }, + { + "epoch": 1.898562300319489, + "grad_norm": 1.2224698612193565, + "learning_rate": 3.5747255060900687e-06, + "loss": 0.1032, + "step": 4754 + }, + { + "epoch": 1.8993610223642172, + "grad_norm": 1.0690769327967091, + "learning_rate": 3.5702716080065546e-06, + "loss": 0.0856, + "step": 4756 + }, + { + "epoch": 1.9001597444089455, + "grad_norm": 1.0519872936846355, + "learning_rate": 3.5658189450734727e-06, + "loss": 0.088, + "step": 4758 + }, + { + "epoch": 1.900958466453674, + "grad_norm": 1.1511736996930644, + "learning_rate": 3.5613675211375066e-06, + "loss": 0.093, + "step": 4760 + }, + { + "epoch": 1.9017571884984026, + "grad_norm": 1.1254396775195248, + "learning_rate": 3.5569173400442634e-06, + "loss": 0.086, + "step": 4762 + }, + { + "epoch": 1.9025559105431311, + "grad_norm": 1.2048144094026239, + "learning_rate": 3.5524684056382824e-06, + "loss": 0.096, + "step": 4764 + }, + { + "epoch": 1.9033546325878594, + "grad_norm": 1.0379587437323166, + "learning_rate": 3.5480207217630224e-06, + "loss": 0.0929, + "step": 4766 + }, + { + "epoch": 1.9041533546325877, + "grad_norm": 1.0861096082948125, + "learning_rate": 3.5435742922608618e-06, + "loss": 0.0843, + "step": 4768 + }, + { + "epoch": 1.9049520766773163, + "grad_norm": 1.094770765692993, + "learning_rate": 3.539129120973095e-06, + "loss": 0.0913, + "step": 4770 + }, + { + "epoch": 1.9057507987220448, + "grad_norm": 1.0808303484556492, + "learning_rate": 3.534685211739935e-06, + "loss": 0.084, + "step": 4772 + }, + { + "epoch": 1.9065495207667733, + "grad_norm": 1.0531941918931411, + "learning_rate": 3.5302425684004957e-06, + "loss": 0.0837, + "step": 4774 + }, + { + "epoch": 1.9073482428115016, + "grad_norm": 1.0972839581319511, + "learning_rate": 3.525801194792805e-06, + "loss": 0.0969, + "step": 4776 + }, + { + "epoch": 1.90814696485623, + "grad_norm": 1.1088742467231305, + "learning_rate": 3.521361094753788e-06, + "loss": 0.09, + "step": 4778 + }, + { + "epoch": 1.9089456869009584, + "grad_norm": 1.062941595145544, + "learning_rate": 3.516922272119274e-06, + "loss": 0.0826, + "step": 4780 + }, + { + "epoch": 1.909744408945687, + "grad_norm": 1.01692209679147, + "learning_rate": 3.5124847307239863e-06, + "loss": 0.0857, + "step": 4782 + }, + { + "epoch": 1.9105431309904153, + "grad_norm": 1.2055133548353663, + "learning_rate": 3.508048474401541e-06, + "loss": 0.096, + "step": 4784 + }, + { + "epoch": 1.9113418530351438, + "grad_norm": 1.001313631088606, + "learning_rate": 3.503613506984447e-06, + "loss": 0.0884, + "step": 4786 + }, + { + "epoch": 1.9121405750798721, + "grad_norm": 1.1432556607777271, + "learning_rate": 3.499179832304096e-06, + "loss": 0.0909, + "step": 4788 + }, + { + "epoch": 1.9129392971246006, + "grad_norm": 1.1768617881519154, + "learning_rate": 3.4947474541907655e-06, + "loss": 0.0895, + "step": 4790 + }, + { + "epoch": 1.9137380191693292, + "grad_norm": 1.054267796873096, + "learning_rate": 3.4903163764736104e-06, + "loss": 0.0944, + "step": 4792 + }, + { + "epoch": 1.9145367412140575, + "grad_norm": 1.1032448450072194, + "learning_rate": 3.4858866029806658e-06, + "loss": 0.0956, + "step": 4794 + }, + { + "epoch": 1.9153354632587858, + "grad_norm": 0.9183906781681136, + "learning_rate": 3.4814581375388384e-06, + "loss": 0.0838, + "step": 4796 + }, + { + "epoch": 1.9161341853035143, + "grad_norm": 1.1483868020594459, + "learning_rate": 3.4770309839739026e-06, + "loss": 0.0913, + "step": 4798 + }, + { + "epoch": 1.9169329073482428, + "grad_norm": 1.0510791282744207, + "learning_rate": 3.4726051461105016e-06, + "loss": 0.0895, + "step": 4800 + }, + { + "epoch": 1.9177316293929714, + "grad_norm": 1.0681148740204216, + "learning_rate": 3.468180627772144e-06, + "loss": 0.0901, + "step": 4802 + }, + { + "epoch": 1.9185303514376997, + "grad_norm": 1.056725620741662, + "learning_rate": 3.4637574327811934e-06, + "loss": 0.087, + "step": 4804 + }, + { + "epoch": 1.919329073482428, + "grad_norm": 1.1165517293384117, + "learning_rate": 3.459335564958875e-06, + "loss": 0.0949, + "step": 4806 + }, + { + "epoch": 1.9201277955271565, + "grad_norm": 1.1714069822717896, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.1003, + "step": 4808 + }, + { + "epoch": 1.920926517571885, + "grad_norm": 1.0857128594280534, + "learning_rate": 3.4504958260992877e-06, + "loss": 0.0917, + "step": 4810 + }, + { + "epoch": 1.9217252396166136, + "grad_norm": 0.9514303291715913, + "learning_rate": 3.4460779626987186e-06, + "loss": 0.081, + "step": 4812 + }, + { + "epoch": 1.9225239616613419, + "grad_norm": 1.1178021513245053, + "learning_rate": 3.441661441740176e-06, + "loss": 0.0945, + "step": 4814 + }, + { + "epoch": 1.9233226837060702, + "grad_norm": 1.0436166022295423, + "learning_rate": 3.437246267039115e-06, + "loss": 0.0939, + "step": 4816 + }, + { + "epoch": 1.9241214057507987, + "grad_norm": 0.9558565581069985, + "learning_rate": 3.4328324424098315e-06, + "loss": 0.0866, + "step": 4818 + }, + { + "epoch": 1.9249201277955272, + "grad_norm": 1.0690618672851848, + "learning_rate": 3.4284199716654526e-06, + "loss": 0.0971, + "step": 4820 + }, + { + "epoch": 1.9257188498402555, + "grad_norm": 1.0330544019396488, + "learning_rate": 3.424008858617939e-06, + "loss": 0.0877, + "step": 4822 + }, + { + "epoch": 1.926517571884984, + "grad_norm": 1.090972476610282, + "learning_rate": 3.419599107078073e-06, + "loss": 0.0933, + "step": 4824 + }, + { + "epoch": 1.9273162939297124, + "grad_norm": 1.1389205515436118, + "learning_rate": 3.4151907208554657e-06, + "loss": 0.0914, + "step": 4826 + }, + { + "epoch": 1.928115015974441, + "grad_norm": 1.0521512673067093, + "learning_rate": 3.4107837037585463e-06, + "loss": 0.0956, + "step": 4828 + }, + { + "epoch": 1.9289137380191694, + "grad_norm": 0.9943276734942584, + "learning_rate": 3.4063780595945627e-06, + "loss": 0.0971, + "step": 4830 + }, + { + "epoch": 1.9297124600638977, + "grad_norm": 1.1979073104114557, + "learning_rate": 3.401973792169574e-06, + "loss": 0.0929, + "step": 4832 + }, + { + "epoch": 1.930511182108626, + "grad_norm": 0.9579355850140132, + "learning_rate": 3.397570905288453e-06, + "loss": 0.0895, + "step": 4834 + }, + { + "epoch": 1.9313099041533546, + "grad_norm": 1.0976423710464434, + "learning_rate": 3.393169402754878e-06, + "loss": 0.0844, + "step": 4836 + }, + { + "epoch": 1.932108626198083, + "grad_norm": 1.1417293656435412, + "learning_rate": 3.388769288371333e-06, + "loss": 0.0954, + "step": 4838 + }, + { + "epoch": 1.9329073482428116, + "grad_norm": 0.9657207657700554, + "learning_rate": 3.384370565939098e-06, + "loss": 0.08, + "step": 4840 + }, + { + "epoch": 1.93370607028754, + "grad_norm": 1.122410966064945, + "learning_rate": 3.3799732392582598e-06, + "loss": 0.1073, + "step": 4842 + }, + { + "epoch": 1.9345047923322682, + "grad_norm": 0.9499390486261202, + "learning_rate": 3.375577312127689e-06, + "loss": 0.081, + "step": 4844 + }, + { + "epoch": 1.9353035143769968, + "grad_norm": 1.089004347076352, + "learning_rate": 3.3711827883450552e-06, + "loss": 0.0916, + "step": 4846 + }, + { + "epoch": 1.9361022364217253, + "grad_norm": 1.121477411735401, + "learning_rate": 3.3667896717068105e-06, + "loss": 0.0944, + "step": 4848 + }, + { + "epoch": 1.9369009584664538, + "grad_norm": 1.137024861471035, + "learning_rate": 3.3623979660081944e-06, + "loss": 0.0898, + "step": 4850 + }, + { + "epoch": 1.9376996805111821, + "grad_norm": 1.036107942325039, + "learning_rate": 3.3580076750432244e-06, + "loss": 0.1024, + "step": 4852 + }, + { + "epoch": 1.9384984025559104, + "grad_norm": 1.0079391443696057, + "learning_rate": 3.3536188026047e-06, + "loss": 0.0806, + "step": 4854 + }, + { + "epoch": 1.939297124600639, + "grad_norm": 1.0869459285245229, + "learning_rate": 3.34923135248419e-06, + "loss": 0.0918, + "step": 4856 + }, + { + "epoch": 1.9400958466453675, + "grad_norm": 1.1976937501317675, + "learning_rate": 3.3448453284720407e-06, + "loss": 0.0992, + "step": 4858 + }, + { + "epoch": 1.9408945686900958, + "grad_norm": 1.1088825356732732, + "learning_rate": 3.340460734357359e-06, + "loss": 0.0986, + "step": 4860 + }, + { + "epoch": 1.9416932907348243, + "grad_norm": 1.03132569544229, + "learning_rate": 3.336077573928023e-06, + "loss": 0.0864, + "step": 4862 + }, + { + "epoch": 1.9424920127795526, + "grad_norm": 1.029156495025623, + "learning_rate": 3.33169585097067e-06, + "loss": 0.0865, + "step": 4864 + }, + { + "epoch": 1.9432907348242812, + "grad_norm": 1.1169233865438082, + "learning_rate": 3.3273155692706956e-06, + "loss": 0.087, + "step": 4866 + }, + { + "epoch": 1.9440894568690097, + "grad_norm": 1.0237220064497436, + "learning_rate": 3.3229367326122475e-06, + "loss": 0.088, + "step": 4868 + }, + { + "epoch": 1.944888178913738, + "grad_norm": 1.0716607652719201, + "learning_rate": 3.318559344778231e-06, + "loss": 0.094, + "step": 4870 + }, + { + "epoch": 1.9456869009584663, + "grad_norm": 0.9965570911044683, + "learning_rate": 3.314183409550293e-06, + "loss": 0.0913, + "step": 4872 + }, + { + "epoch": 1.9464856230031948, + "grad_norm": 1.0605235496788143, + "learning_rate": 3.3098089307088307e-06, + "loss": 0.088, + "step": 4874 + }, + { + "epoch": 1.9472843450479234, + "grad_norm": 1.0509076016295873, + "learning_rate": 3.3054359120329788e-06, + "loss": 0.0967, + "step": 4876 + }, + { + "epoch": 1.9480830670926519, + "grad_norm": 1.0048257649363173, + "learning_rate": 3.301064357300615e-06, + "loss": 0.0924, + "step": 4878 + }, + { + "epoch": 1.9488817891373802, + "grad_norm": 1.1009166155154577, + "learning_rate": 3.2966942702883494e-06, + "loss": 0.0979, + "step": 4880 + }, + { + "epoch": 1.9496805111821085, + "grad_norm": 1.123335138251925, + "learning_rate": 3.2923256547715245e-06, + "loss": 0.0952, + "step": 4882 + }, + { + "epoch": 1.950479233226837, + "grad_norm": 1.0429249357254708, + "learning_rate": 3.287958514524212e-06, + "loss": 0.0868, + "step": 4884 + }, + { + "epoch": 1.9512779552715656, + "grad_norm": 1.0420461671724415, + "learning_rate": 3.2835928533192086e-06, + "loss": 0.091, + "step": 4886 + }, + { + "epoch": 1.952076677316294, + "grad_norm": 1.066480483177895, + "learning_rate": 3.279228674928035e-06, + "loss": 0.0911, + "step": 4888 + }, + { + "epoch": 1.9528753993610224, + "grad_norm": 1.034614202531424, + "learning_rate": 3.2748659831209293e-06, + "loss": 0.0925, + "step": 4890 + }, + { + "epoch": 1.9536741214057507, + "grad_norm": 1.010806358143463, + "learning_rate": 3.270504781666845e-06, + "loss": 0.0948, + "step": 4892 + }, + { + "epoch": 1.9544728434504792, + "grad_norm": 0.9676259211028123, + "learning_rate": 3.2661450743334495e-06, + "loss": 0.0838, + "step": 4894 + }, + { + "epoch": 1.9552715654952078, + "grad_norm": 1.9272872858361931, + "learning_rate": 3.261786864887117e-06, + "loss": 0.0941, + "step": 4896 + }, + { + "epoch": 1.956070287539936, + "grad_norm": 1.1661864224993708, + "learning_rate": 3.2574301570929313e-06, + "loss": 0.0946, + "step": 4898 + }, + { + "epoch": 1.9568690095846646, + "grad_norm": 1.0658970850145277, + "learning_rate": 3.2530749547146745e-06, + "loss": 0.0846, + "step": 4900 + }, + { + "epoch": 1.957667731629393, + "grad_norm": 1.1441635938072476, + "learning_rate": 3.2487212615148316e-06, + "loss": 0.0957, + "step": 4902 + }, + { + "epoch": 1.9584664536741214, + "grad_norm": 1.0929232430584095, + "learning_rate": 3.244369081254585e-06, + "loss": 0.0881, + "step": 4904 + }, + { + "epoch": 1.95926517571885, + "grad_norm": 0.9707906667424272, + "learning_rate": 3.240018417693803e-06, + "loss": 0.0854, + "step": 4906 + }, + { + "epoch": 1.9600638977635783, + "grad_norm": 1.166565929217826, + "learning_rate": 3.235669274591051e-06, + "loss": 0.0952, + "step": 4908 + }, + { + "epoch": 1.9608626198083066, + "grad_norm": 0.9958821823082478, + "learning_rate": 3.231321655703581e-06, + "loss": 0.0896, + "step": 4910 + }, + { + "epoch": 1.961661341853035, + "grad_norm": 1.16345333204102, + "learning_rate": 3.226975564787322e-06, + "loss": 0.0967, + "step": 4912 + }, + { + "epoch": 1.9624600638977636, + "grad_norm": 1.0194684910483922, + "learning_rate": 3.222631005596888e-06, + "loss": 0.0792, + "step": 4914 + }, + { + "epoch": 1.9632587859424921, + "grad_norm": 1.1421002598232535, + "learning_rate": 3.218287981885567e-06, + "loss": 0.0998, + "step": 4916 + }, + { + "epoch": 1.9640575079872205, + "grad_norm": 1.0778986286246897, + "learning_rate": 3.2139464974053225e-06, + "loss": 0.0979, + "step": 4918 + }, + { + "epoch": 1.9648562300319488, + "grad_norm": 1.071349946349977, + "learning_rate": 3.209606555906788e-06, + "loss": 0.0792, + "step": 4920 + }, + { + "epoch": 1.9656549520766773, + "grad_norm": 0.9608643299126582, + "learning_rate": 3.2052681611392616e-06, + "loss": 0.0946, + "step": 4922 + }, + { + "epoch": 1.9664536741214058, + "grad_norm": 1.0944937155116632, + "learning_rate": 3.20093131685071e-06, + "loss": 0.094, + "step": 4924 + }, + { + "epoch": 1.9672523961661343, + "grad_norm": 1.045526176848772, + "learning_rate": 3.1965960267877544e-06, + "loss": 0.0986, + "step": 4926 + }, + { + "epoch": 1.9680511182108626, + "grad_norm": 1.2126133137340172, + "learning_rate": 3.192262294695679e-06, + "loss": 0.1038, + "step": 4928 + }, + { + "epoch": 1.968849840255591, + "grad_norm": 1.1192306783949446, + "learning_rate": 3.187930124318417e-06, + "loss": 0.0928, + "step": 4930 + }, + { + "epoch": 1.9696485623003195, + "grad_norm": 1.1586123177119687, + "learning_rate": 3.1835995193985548e-06, + "loss": 0.0973, + "step": 4932 + }, + { + "epoch": 1.970447284345048, + "grad_norm": 1.1103102164856757, + "learning_rate": 3.1792704836773303e-06, + "loss": 0.0953, + "step": 4934 + }, + { + "epoch": 1.9712460063897763, + "grad_norm": 1.165044371843411, + "learning_rate": 3.174943020894618e-06, + "loss": 0.0933, + "step": 4936 + }, + { + "epoch": 1.9720447284345048, + "grad_norm": 0.9945637263764828, + "learning_rate": 3.170617134788939e-06, + "loss": 0.0801, + "step": 4938 + }, + { + "epoch": 1.9728434504792332, + "grad_norm": 1.0071008319354997, + "learning_rate": 3.1662928290974514e-06, + "loss": 0.0916, + "step": 4940 + }, + { + "epoch": 1.9736421725239617, + "grad_norm": 1.1375941744992564, + "learning_rate": 3.161970107555945e-06, + "loss": 0.1005, + "step": 4942 + }, + { + "epoch": 1.9744408945686902, + "grad_norm": 1.0807791951129158, + "learning_rate": 3.1576489738988457e-06, + "loss": 0.0919, + "step": 4944 + }, + { + "epoch": 1.9752396166134185, + "grad_norm": 1.0609866630309928, + "learning_rate": 3.153329431859204e-06, + "loss": 0.0915, + "step": 4946 + }, + { + "epoch": 1.9760383386581468, + "grad_norm": 1.1417443819482096, + "learning_rate": 3.1490114851686984e-06, + "loss": 0.0931, + "step": 4948 + }, + { + "epoch": 1.9768370607028753, + "grad_norm": 0.9100647284833052, + "learning_rate": 3.144695137557624e-06, + "loss": 0.0772, + "step": 4950 + }, + { + "epoch": 1.9776357827476039, + "grad_norm": 0.9703032312651547, + "learning_rate": 3.140380392754901e-06, + "loss": 0.098, + "step": 4952 + }, + { + "epoch": 1.9784345047923324, + "grad_norm": 1.0592857088516412, + "learning_rate": 3.1360672544880586e-06, + "loss": 0.0962, + "step": 4954 + }, + { + "epoch": 1.9792332268370607, + "grad_norm": 1.0580116375862416, + "learning_rate": 3.1317557264832454e-06, + "loss": 0.0841, + "step": 4956 + }, + { + "epoch": 1.980031948881789, + "grad_norm": 1.0265861095673052, + "learning_rate": 3.1274458124652117e-06, + "loss": 0.0901, + "step": 4958 + }, + { + "epoch": 1.9808306709265175, + "grad_norm": 1.0372575193954332, + "learning_rate": 3.12313751615732e-06, + "loss": 0.0736, + "step": 4960 + }, + { + "epoch": 1.981629392971246, + "grad_norm": 0.9602359888946597, + "learning_rate": 3.1188308412815276e-06, + "loss": 0.087, + "step": 4962 + }, + { + "epoch": 1.9824281150159746, + "grad_norm": 1.1199736388647585, + "learning_rate": 3.114525791558398e-06, + "loss": 0.0867, + "step": 4964 + }, + { + "epoch": 1.983226837060703, + "grad_norm": 1.0792261618908379, + "learning_rate": 3.1102223707070865e-06, + "loss": 0.0887, + "step": 4966 + }, + { + "epoch": 1.9840255591054312, + "grad_norm": 1.1181889010486792, + "learning_rate": 3.1059205824453446e-06, + "loss": 0.0929, + "step": 4968 + }, + { + "epoch": 1.9848242811501597, + "grad_norm": 1.0142291902192897, + "learning_rate": 3.101620430489509e-06, + "loss": 0.0876, + "step": 4970 + }, + { + "epoch": 1.9856230031948883, + "grad_norm": 1.2214112942901962, + "learning_rate": 3.0973219185545077e-06, + "loss": 0.1051, + "step": 4972 + }, + { + "epoch": 1.9864217252396166, + "grad_norm": 1.1881886135246487, + "learning_rate": 3.093025050353847e-06, + "loss": 0.0861, + "step": 4974 + }, + { + "epoch": 1.9872204472843449, + "grad_norm": 1.0284769527949214, + "learning_rate": 3.0887298295996183e-06, + "loss": 0.0848, + "step": 4976 + }, + { + "epoch": 1.9880191693290734, + "grad_norm": 1.0067298405880323, + "learning_rate": 3.0844362600024813e-06, + "loss": 0.0859, + "step": 4978 + }, + { + "epoch": 1.988817891373802, + "grad_norm": 1.0990861865733983, + "learning_rate": 3.0801443452716835e-06, + "loss": 0.0938, + "step": 4980 + }, + { + "epoch": 1.9896166134185305, + "grad_norm": 0.914893405123526, + "learning_rate": 3.0758540891150286e-06, + "loss": 0.0785, + "step": 4982 + }, + { + "epoch": 1.9904153354632588, + "grad_norm": 1.04304535262742, + "learning_rate": 3.0715654952388957e-06, + "loss": 0.0926, + "step": 4984 + }, + { + "epoch": 1.991214057507987, + "grad_norm": 1.1088676583366113, + "learning_rate": 3.067278567348223e-06, + "loss": 0.0885, + "step": 4986 + }, + { + "epoch": 1.9920127795527156, + "grad_norm": 1.0903427682959805, + "learning_rate": 3.062993309146514e-06, + "loss": 0.083, + "step": 4988 + }, + { + "epoch": 1.9928115015974441, + "grad_norm": 1.1545810549870772, + "learning_rate": 3.0587097243358254e-06, + "loss": 0.0835, + "step": 4990 + }, + { + "epoch": 1.9936102236421727, + "grad_norm": 1.080925053565247, + "learning_rate": 3.054427816616773e-06, + "loss": 0.0955, + "step": 4992 + }, + { + "epoch": 1.994408945686901, + "grad_norm": 1.0715872751652953, + "learning_rate": 3.0501475896885175e-06, + "loss": 0.0842, + "step": 4994 + }, + { + "epoch": 1.9952076677316293, + "grad_norm": 1.1378173558732894, + "learning_rate": 3.045869047248774e-06, + "loss": 0.0974, + "step": 4996 + }, + { + "epoch": 1.9960063897763578, + "grad_norm": 1.110967676470196, + "learning_rate": 3.041592192993798e-06, + "loss": 0.0928, + "step": 4998 + }, + { + "epoch": 1.9968051118210863, + "grad_norm": 0.9911386613472772, + "learning_rate": 3.0373170306183885e-06, + "loss": 0.0737, + "step": 5000 + }, + { + "epoch": 1.9968051118210863, + "eval_loss": 0.1455243080854416, + "eval_runtime": 417.8708, + "eval_samples_per_second": 42.614, + "eval_steps_per_second": 5.327, + "step": 5000 + }, + { + "epoch": 1.9976038338658149, + "grad_norm": 1.124610364572997, + "learning_rate": 3.0330435638158805e-06, + "loss": 0.0935, + "step": 5002 + }, + { + "epoch": 1.9984025559105432, + "grad_norm": 1.053117611083023, + "learning_rate": 3.028771796278151e-06, + "loss": 0.1017, + "step": 5004 + }, + { + "epoch": 1.9992012779552715, + "grad_norm": 1.151547544509955, + "learning_rate": 3.0245017316956e-06, + "loss": 0.0902, + "step": 5006 + }, + { + "epoch": 2.0, + "grad_norm": 1.1612757379061158, + "learning_rate": 3.020233373757162e-06, + "loss": 0.0987, + "step": 5008 + }, + { + "epoch": 2.0007987220447285, + "grad_norm": 0.6298174764155958, + "learning_rate": 3.0159667261502944e-06, + "loss": 0.0395, + "step": 5010 + }, + { + "epoch": 2.001597444089457, + "grad_norm": 0.625623431879438, + "learning_rate": 3.0117017925609802e-06, + "loss": 0.0417, + "step": 5012 + }, + { + "epoch": 2.002396166134185, + "grad_norm": 0.6897871271662769, + "learning_rate": 3.007438576673717e-06, + "loss": 0.0421, + "step": 5014 + }, + { + "epoch": 2.0031948881789137, + "grad_norm": 0.6744101045802077, + "learning_rate": 3.0031770821715233e-06, + "loss": 0.0378, + "step": 5016 + }, + { + "epoch": 2.003993610223642, + "grad_norm": 0.7756444837427113, + "learning_rate": 2.9989173127359267e-06, + "loss": 0.04, + "step": 5018 + }, + { + "epoch": 2.0047923322683707, + "grad_norm": 0.7859449397270831, + "learning_rate": 2.9946592720469662e-06, + "loss": 0.037, + "step": 5020 + }, + { + "epoch": 2.0055910543130993, + "grad_norm": 0.6704787926555024, + "learning_rate": 2.9904029637831887e-06, + "loss": 0.0347, + "step": 5022 + }, + { + "epoch": 2.0063897763578273, + "grad_norm": 0.751662927080756, + "learning_rate": 2.9861483916216404e-06, + "loss": 0.0389, + "step": 5024 + }, + { + "epoch": 2.007188498402556, + "grad_norm": 0.9221469194496961, + "learning_rate": 2.981895559237873e-06, + "loss": 0.0447, + "step": 5026 + }, + { + "epoch": 2.0079872204472844, + "grad_norm": 0.80221078116781, + "learning_rate": 2.9776444703059316e-06, + "loss": 0.0345, + "step": 5028 + }, + { + "epoch": 2.008785942492013, + "grad_norm": 0.8059814098019393, + "learning_rate": 2.9733951284983555e-06, + "loss": 0.0373, + "step": 5030 + }, + { + "epoch": 2.009584664536741, + "grad_norm": 0.9358045993876437, + "learning_rate": 2.969147537486175e-06, + "loss": 0.0408, + "step": 5032 + }, + { + "epoch": 2.0103833865814695, + "grad_norm": 0.8885370742724994, + "learning_rate": 2.9649017009389077e-06, + "loss": 0.0483, + "step": 5034 + }, + { + "epoch": 2.011182108626198, + "grad_norm": 1.057708534412327, + "learning_rate": 2.9606576225245566e-06, + "loss": 0.0379, + "step": 5036 + }, + { + "epoch": 2.0119808306709266, + "grad_norm": 0.8731366761911997, + "learning_rate": 2.9564153059096047e-06, + "loss": 0.0375, + "step": 5038 + }, + { + "epoch": 2.012779552715655, + "grad_norm": 0.8694849953429314, + "learning_rate": 2.952174754759012e-06, + "loss": 0.0359, + "step": 5040 + }, + { + "epoch": 2.013578274760383, + "grad_norm": 0.8842840721862356, + "learning_rate": 2.947935972736217e-06, + "loss": 0.0351, + "step": 5042 + }, + { + "epoch": 2.0143769968051117, + "grad_norm": 0.8362117175566706, + "learning_rate": 2.9436989635031253e-06, + "loss": 0.0353, + "step": 5044 + }, + { + "epoch": 2.0151757188498403, + "grad_norm": 0.8610101010994339, + "learning_rate": 2.9394637307201156e-06, + "loss": 0.0365, + "step": 5046 + }, + { + "epoch": 2.015974440894569, + "grad_norm": 0.8646570046386132, + "learning_rate": 2.935230278046025e-06, + "loss": 0.0376, + "step": 5048 + }, + { + "epoch": 2.0167731629392973, + "grad_norm": 0.8838043984613749, + "learning_rate": 2.9309986091381616e-06, + "loss": 0.0331, + "step": 5050 + }, + { + "epoch": 2.0175718849840254, + "grad_norm": 0.8681957347092829, + "learning_rate": 2.9267687276522876e-06, + "loss": 0.0355, + "step": 5052 + }, + { + "epoch": 2.018370607028754, + "grad_norm": 0.9197550042905789, + "learning_rate": 2.922540637242619e-06, + "loss": 0.0366, + "step": 5054 + }, + { + "epoch": 2.0191693290734825, + "grad_norm": 0.9123850396028426, + "learning_rate": 2.9183143415618297e-06, + "loss": 0.035, + "step": 5056 + }, + { + "epoch": 2.019968051118211, + "grad_norm": 0.8949092568287165, + "learning_rate": 2.9140898442610375e-06, + "loss": 0.0383, + "step": 5058 + }, + { + "epoch": 2.0207667731629395, + "grad_norm": 1.0371718273633521, + "learning_rate": 2.909867148989812e-06, + "loss": 0.0365, + "step": 5060 + }, + { + "epoch": 2.0215654952076676, + "grad_norm": 0.7653356631917099, + "learning_rate": 2.905646259396162e-06, + "loss": 0.0358, + "step": 5062 + }, + { + "epoch": 2.022364217252396, + "grad_norm": 0.7314418159084973, + "learning_rate": 2.9014271791265403e-06, + "loss": 0.031, + "step": 5064 + }, + { + "epoch": 2.0231629392971247, + "grad_norm": 0.8465101892289698, + "learning_rate": 2.8972099118258305e-06, + "loss": 0.0313, + "step": 5066 + }, + { + "epoch": 2.023961661341853, + "grad_norm": 0.8580294083980158, + "learning_rate": 2.8929944611373555e-06, + "loss": 0.0349, + "step": 5068 + }, + { + "epoch": 2.0247603833865813, + "grad_norm": 0.820776207639085, + "learning_rate": 2.888780830702867e-06, + "loss": 0.0337, + "step": 5070 + }, + { + "epoch": 2.02555910543131, + "grad_norm": 0.9528932671606859, + "learning_rate": 2.8845690241625437e-06, + "loss": 0.0396, + "step": 5072 + }, + { + "epoch": 2.0263578274760383, + "grad_norm": 1.0061064855225768, + "learning_rate": 2.88035904515499e-06, + "loss": 0.0397, + "step": 5074 + }, + { + "epoch": 2.027156549520767, + "grad_norm": 0.8452096897085726, + "learning_rate": 2.8761508973172293e-06, + "loss": 0.0346, + "step": 5076 + }, + { + "epoch": 2.0279552715654954, + "grad_norm": 0.7634443097067318, + "learning_rate": 2.871944584284705e-06, + "loss": 0.0322, + "step": 5078 + }, + { + "epoch": 2.0287539936102235, + "grad_norm": 1.0425744751303125, + "learning_rate": 2.867740109691277e-06, + "loss": 0.0355, + "step": 5080 + }, + { + "epoch": 2.029552715654952, + "grad_norm": 1.0565258664896207, + "learning_rate": 2.86353747716921e-06, + "loss": 0.0378, + "step": 5082 + }, + { + "epoch": 2.0303514376996805, + "grad_norm": 1.0498591946749731, + "learning_rate": 2.859336690349185e-06, + "loss": 0.0407, + "step": 5084 + }, + { + "epoch": 2.031150159744409, + "grad_norm": 0.7704667558370644, + "learning_rate": 2.8551377528602836e-06, + "loss": 0.0325, + "step": 5086 + }, + { + "epoch": 2.0319488817891376, + "grad_norm": 0.9433544175624843, + "learning_rate": 2.850940668329996e-06, + "loss": 0.034, + "step": 5088 + }, + { + "epoch": 2.0327476038338657, + "grad_norm": 0.8469629469335119, + "learning_rate": 2.8467454403842005e-06, + "loss": 0.0341, + "step": 5090 + }, + { + "epoch": 2.033546325878594, + "grad_norm": 0.7628386290158324, + "learning_rate": 2.842552072647182e-06, + "loss": 0.0306, + "step": 5092 + }, + { + "epoch": 2.0343450479233227, + "grad_norm": 0.9647679056914126, + "learning_rate": 2.838360568741613e-06, + "loss": 0.0419, + "step": 5094 + }, + { + "epoch": 2.0351437699680512, + "grad_norm": 0.8818509646802964, + "learning_rate": 2.8341709322885624e-06, + "loss": 0.0322, + "step": 5096 + }, + { + "epoch": 2.0359424920127798, + "grad_norm": 0.9443959085201566, + "learning_rate": 2.8299831669074744e-06, + "loss": 0.04, + "step": 5098 + }, + { + "epoch": 2.036741214057508, + "grad_norm": 1.0294482005419765, + "learning_rate": 2.8257972762161865e-06, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 2.0375399361022364, + "grad_norm": 1.0211095104000223, + "learning_rate": 2.8216132638309124e-06, + "loss": 0.0365, + "step": 5102 + }, + { + "epoch": 2.038338658146965, + "grad_norm": 1.9713879821461935, + "learning_rate": 2.817431133366246e-06, + "loss": 0.0437, + "step": 5104 + }, + { + "epoch": 2.0391373801916934, + "grad_norm": 0.9823140675823749, + "learning_rate": 2.8132508884351504e-06, + "loss": 0.0379, + "step": 5106 + }, + { + "epoch": 2.0399361022364215, + "grad_norm": 0.8967790877119935, + "learning_rate": 2.809072532648963e-06, + "loss": 0.0335, + "step": 5108 + }, + { + "epoch": 2.04073482428115, + "grad_norm": 0.9950222932994464, + "learning_rate": 2.804896069617391e-06, + "loss": 0.0414, + "step": 5110 + }, + { + "epoch": 2.0415335463258786, + "grad_norm": 1.0503584997850632, + "learning_rate": 2.800721502948506e-06, + "loss": 0.0371, + "step": 5112 + }, + { + "epoch": 2.042332268370607, + "grad_norm": 0.8863498646029845, + "learning_rate": 2.7965488362487337e-06, + "loss": 0.037, + "step": 5114 + }, + { + "epoch": 2.0431309904153356, + "grad_norm": 0.9129952865118284, + "learning_rate": 2.7923780731228665e-06, + "loss": 0.0379, + "step": 5116 + }, + { + "epoch": 2.0439297124600637, + "grad_norm": 0.9360258175259091, + "learning_rate": 2.7882092171740544e-06, + "loss": 0.0386, + "step": 5118 + }, + { + "epoch": 2.0447284345047922, + "grad_norm": 0.9038391882536216, + "learning_rate": 2.7840422720037943e-06, + "loss": 0.0336, + "step": 5120 + }, + { + "epoch": 2.0455271565495208, + "grad_norm": 0.861340429852411, + "learning_rate": 2.77987724121193e-06, + "loss": 0.0396, + "step": 5122 + }, + { + "epoch": 2.0463258785942493, + "grad_norm": 0.9598176808433139, + "learning_rate": 2.775714128396658e-06, + "loss": 0.0353, + "step": 5124 + }, + { + "epoch": 2.047124600638978, + "grad_norm": 0.8776533817537266, + "learning_rate": 2.7715529371545138e-06, + "loss": 0.0357, + "step": 5126 + }, + { + "epoch": 2.047923322683706, + "grad_norm": 0.9993457119073764, + "learning_rate": 2.767393671080376e-06, + "loss": 0.0357, + "step": 5128 + }, + { + "epoch": 2.0487220447284344, + "grad_norm": 0.9982845161626676, + "learning_rate": 2.763236333767455e-06, + "loss": 0.0329, + "step": 5130 + }, + { + "epoch": 2.049520766773163, + "grad_norm": 0.9890197900955416, + "learning_rate": 2.7590809288073e-06, + "loss": 0.0334, + "step": 5132 + }, + { + "epoch": 2.0503194888178915, + "grad_norm": 0.838107029970905, + "learning_rate": 2.7549274597897878e-06, + "loss": 0.0349, + "step": 5134 + }, + { + "epoch": 2.0511182108626196, + "grad_norm": 0.8727714025662038, + "learning_rate": 2.7507759303031257e-06, + "loss": 0.0358, + "step": 5136 + }, + { + "epoch": 2.051916932907348, + "grad_norm": 0.9757184296946524, + "learning_rate": 2.7466263439338424e-06, + "loss": 0.0388, + "step": 5138 + }, + { + "epoch": 2.0527156549520766, + "grad_norm": 0.9770465103402337, + "learning_rate": 2.7424787042667856e-06, + "loss": 0.0423, + "step": 5140 + }, + { + "epoch": 2.053514376996805, + "grad_norm": 0.8990138413398377, + "learning_rate": 2.7383330148851293e-06, + "loss": 0.0315, + "step": 5142 + }, + { + "epoch": 2.0543130990415337, + "grad_norm": 0.8588787728847247, + "learning_rate": 2.7341892793703594e-06, + "loss": 0.0315, + "step": 5144 + }, + { + "epoch": 2.055111821086262, + "grad_norm": 0.9632194247423946, + "learning_rate": 2.7300475013022666e-06, + "loss": 0.0335, + "step": 5146 + }, + { + "epoch": 2.0559105431309903, + "grad_norm": 0.730091684485515, + "learning_rate": 2.7259076842589595e-06, + "loss": 0.0316, + "step": 5148 + }, + { + "epoch": 2.056709265175719, + "grad_norm": 0.8722727704441476, + "learning_rate": 2.721769831816849e-06, + "loss": 0.0385, + "step": 5150 + }, + { + "epoch": 2.0575079872204474, + "grad_norm": 0.9033151795461563, + "learning_rate": 2.7176339475506515e-06, + "loss": 0.0369, + "step": 5152 + }, + { + "epoch": 2.058306709265176, + "grad_norm": 0.9092639705567416, + "learning_rate": 2.7135000350333762e-06, + "loss": 0.0371, + "step": 5154 + }, + { + "epoch": 2.059105431309904, + "grad_norm": 1.0234924384548971, + "learning_rate": 2.7093680978363367e-06, + "loss": 0.0379, + "step": 5156 + }, + { + "epoch": 2.0599041533546325, + "grad_norm": 0.9677538639597134, + "learning_rate": 2.7052381395291355e-06, + "loss": 0.0308, + "step": 5158 + }, + { + "epoch": 2.060702875399361, + "grad_norm": 1.0727950843621823, + "learning_rate": 2.7011101636796677e-06, + "loss": 0.0454, + "step": 5160 + }, + { + "epoch": 2.0615015974440896, + "grad_norm": 1.02920375202306, + "learning_rate": 2.6969841738541165e-06, + "loss": 0.0421, + "step": 5162 + }, + { + "epoch": 2.062300319488818, + "grad_norm": 1.1885181849696531, + "learning_rate": 2.6928601736169423e-06, + "loss": 0.0444, + "step": 5164 + }, + { + "epoch": 2.063099041533546, + "grad_norm": 1.0717318540862604, + "learning_rate": 2.6887381665308977e-06, + "loss": 0.0385, + "step": 5166 + }, + { + "epoch": 2.0638977635782747, + "grad_norm": 0.7332635219694293, + "learning_rate": 2.6846181561570085e-06, + "loss": 0.0322, + "step": 5168 + }, + { + "epoch": 2.0646964856230032, + "grad_norm": 0.854254385375307, + "learning_rate": 2.68050014605457e-06, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 2.0654952076677318, + "grad_norm": 0.8081983560893325, + "learning_rate": 2.6763841397811576e-06, + "loss": 0.034, + "step": 5172 + }, + { + "epoch": 2.06629392971246, + "grad_norm": 0.9741434482130531, + "learning_rate": 2.6722701408926117e-06, + "loss": 0.0356, + "step": 5174 + }, + { + "epoch": 2.0670926517571884, + "grad_norm": 0.8836539172435507, + "learning_rate": 2.668158152943039e-06, + "loss": 0.0402, + "step": 5176 + }, + { + "epoch": 2.067891373801917, + "grad_norm": 0.9171861393472643, + "learning_rate": 2.664048179484812e-06, + "loss": 0.0354, + "step": 5178 + }, + { + "epoch": 2.0686900958466454, + "grad_norm": 0.9101919637114287, + "learning_rate": 2.6599402240685546e-06, + "loss": 0.0354, + "step": 5180 + }, + { + "epoch": 2.069488817891374, + "grad_norm": 0.9747609472900693, + "learning_rate": 2.6558342902431553e-06, + "loss": 0.0365, + "step": 5182 + }, + { + "epoch": 2.070287539936102, + "grad_norm": 0.8324045055842462, + "learning_rate": 2.651730381555754e-06, + "loss": 0.0386, + "step": 5184 + }, + { + "epoch": 2.0710862619808306, + "grad_norm": 0.7931217955187458, + "learning_rate": 2.64762850155174e-06, + "loss": 0.0329, + "step": 5186 + }, + { + "epoch": 2.071884984025559, + "grad_norm": 0.797923260607255, + "learning_rate": 2.6435286537747512e-06, + "loss": 0.0336, + "step": 5188 + }, + { + "epoch": 2.0726837060702876, + "grad_norm": 0.958305531859498, + "learning_rate": 2.6394308417666686e-06, + "loss": 0.0376, + "step": 5190 + }, + { + "epoch": 2.073482428115016, + "grad_norm": 0.9482044419754099, + "learning_rate": 2.635335069067617e-06, + "loss": 0.0393, + "step": 5192 + }, + { + "epoch": 2.0742811501597442, + "grad_norm": 1.0868323089043634, + "learning_rate": 2.6312413392159553e-06, + "loss": 0.0379, + "step": 5194 + }, + { + "epoch": 2.0750798722044728, + "grad_norm": 0.8207739576375747, + "learning_rate": 2.6271496557482795e-06, + "loss": 0.0343, + "step": 5196 + }, + { + "epoch": 2.0758785942492013, + "grad_norm": 0.8539413020355823, + "learning_rate": 2.6230600221994195e-06, + "loss": 0.0397, + "step": 5198 + }, + { + "epoch": 2.07667731629393, + "grad_norm": 0.8965919210232974, + "learning_rate": 2.618972442102432e-06, + "loss": 0.0381, + "step": 5200 + }, + { + "epoch": 2.0774760383386583, + "grad_norm": 0.8973056412608038, + "learning_rate": 2.614886918988604e-06, + "loss": 0.0342, + "step": 5202 + }, + { + "epoch": 2.0782747603833864, + "grad_norm": 0.9313693805488207, + "learning_rate": 2.610803456387436e-06, + "loss": 0.0358, + "step": 5204 + }, + { + "epoch": 2.079073482428115, + "grad_norm": 0.7917376380998945, + "learning_rate": 2.6067220578266574e-06, + "loss": 0.0329, + "step": 5206 + }, + { + "epoch": 2.0798722044728435, + "grad_norm": 0.961444379029632, + "learning_rate": 2.602642726832212e-06, + "loss": 0.0361, + "step": 5208 + }, + { + "epoch": 2.080670926517572, + "grad_norm": 0.8429511197266044, + "learning_rate": 2.5985654669282556e-06, + "loss": 0.0334, + "step": 5210 + }, + { + "epoch": 2.0814696485623, + "grad_norm": 0.9355752641552857, + "learning_rate": 2.5944902816371573e-06, + "loss": 0.0395, + "step": 5212 + }, + { + "epoch": 2.0822683706070286, + "grad_norm": 0.8965794716087095, + "learning_rate": 2.5904171744794927e-06, + "loss": 0.031, + "step": 5214 + }, + { + "epoch": 2.083067092651757, + "grad_norm": 0.906394944166487, + "learning_rate": 2.5863461489740403e-06, + "loss": 0.0362, + "step": 5216 + }, + { + "epoch": 2.0838658146964857, + "grad_norm": 0.916184630737243, + "learning_rate": 2.5822772086377863e-06, + "loss": 0.035, + "step": 5218 + }, + { + "epoch": 2.084664536741214, + "grad_norm": 0.9777881509465214, + "learning_rate": 2.5782103569859057e-06, + "loss": 0.0388, + "step": 5220 + }, + { + "epoch": 2.0854632587859423, + "grad_norm": 1.0639934309744312, + "learning_rate": 2.5741455975317776e-06, + "loss": 0.0379, + "step": 5222 + }, + { + "epoch": 2.086261980830671, + "grad_norm": 0.8855617385185786, + "learning_rate": 2.57008293378697e-06, + "loss": 0.0366, + "step": 5224 + }, + { + "epoch": 2.0870607028753994, + "grad_norm": 0.9693916259214078, + "learning_rate": 2.566022369261243e-06, + "loss": 0.0384, + "step": 5226 + }, + { + "epoch": 2.087859424920128, + "grad_norm": 0.9541503121738648, + "learning_rate": 2.5619639074625374e-06, + "loss": 0.0367, + "step": 5228 + }, + { + "epoch": 2.0886581469648564, + "grad_norm": 1.0451416066763384, + "learning_rate": 2.557907551896984e-06, + "loss": 0.0352, + "step": 5230 + }, + { + "epoch": 2.0894568690095845, + "grad_norm": 0.9406486155465448, + "learning_rate": 2.553853306068888e-06, + "loss": 0.0326, + "step": 5232 + }, + { + "epoch": 2.090255591054313, + "grad_norm": 0.97097723247182, + "learning_rate": 2.549801173480742e-06, + "loss": 0.0348, + "step": 5234 + }, + { + "epoch": 2.0910543130990416, + "grad_norm": 1.0076436948344818, + "learning_rate": 2.5457511576332008e-06, + "loss": 0.0423, + "step": 5236 + }, + { + "epoch": 2.09185303514377, + "grad_norm": 1.1036965373373038, + "learning_rate": 2.5417032620250962e-06, + "loss": 0.0392, + "step": 5238 + }, + { + "epoch": 2.0926517571884986, + "grad_norm": 0.9846203021426592, + "learning_rate": 2.5376574901534303e-06, + "loss": 0.0382, + "step": 5240 + }, + { + "epoch": 2.0934504792332267, + "grad_norm": 1.346629151831746, + "learning_rate": 2.5336138455133684e-06, + "loss": 0.0379, + "step": 5242 + }, + { + "epoch": 2.094249201277955, + "grad_norm": 0.822209398062977, + "learning_rate": 2.5295723315982344e-06, + "loss": 0.0308, + "step": 5244 + }, + { + "epoch": 2.0950479233226837, + "grad_norm": 0.912394753377447, + "learning_rate": 2.5255329518995185e-06, + "loss": 0.0325, + "step": 5246 + }, + { + "epoch": 2.0958466453674123, + "grad_norm": 0.9775498681194755, + "learning_rate": 2.5214957099068613e-06, + "loss": 0.0384, + "step": 5248 + }, + { + "epoch": 2.0966453674121404, + "grad_norm": 0.810870250386815, + "learning_rate": 2.517460609108063e-06, + "loss": 0.0362, + "step": 5250 + }, + { + "epoch": 2.097444089456869, + "grad_norm": 1.3581064330087438, + "learning_rate": 2.5134276529890646e-06, + "loss": 0.0388, + "step": 5252 + }, + { + "epoch": 2.0982428115015974, + "grad_norm": 0.9668476880810652, + "learning_rate": 2.509396845033962e-06, + "loss": 0.036, + "step": 5254 + }, + { + "epoch": 2.099041533546326, + "grad_norm": 0.9552646055277381, + "learning_rate": 2.5053681887249916e-06, + "loss": 0.0354, + "step": 5256 + }, + { + "epoch": 2.0998402555910545, + "grad_norm": 0.962167773860268, + "learning_rate": 2.501341687542538e-06, + "loss": 0.0378, + "step": 5258 + }, + { + "epoch": 2.1006389776357826, + "grad_norm": 0.8997107031650471, + "learning_rate": 2.497317344965111e-06, + "loss": 0.0378, + "step": 5260 + }, + { + "epoch": 2.101437699680511, + "grad_norm": 0.9369128058048221, + "learning_rate": 2.493295164469367e-06, + "loss": 0.0352, + "step": 5262 + }, + { + "epoch": 2.1022364217252396, + "grad_norm": 0.8376692502141213, + "learning_rate": 2.4892751495300893e-06, + "loss": 0.0319, + "step": 5264 + }, + { + "epoch": 2.103035143769968, + "grad_norm": 0.9548206607669597, + "learning_rate": 2.4852573036201937e-06, + "loss": 0.0395, + "step": 5266 + }, + { + "epoch": 2.1038338658146967, + "grad_norm": 0.9387516661598057, + "learning_rate": 2.481241630210716e-06, + "loss": 0.0349, + "step": 5268 + }, + { + "epoch": 2.1046325878594248, + "grad_norm": 0.874167713863454, + "learning_rate": 2.4772281327708213e-06, + "loss": 0.0335, + "step": 5270 + }, + { + "epoch": 2.1054313099041533, + "grad_norm": 1.0185097250661654, + "learning_rate": 2.4732168147677927e-06, + "loss": 0.0357, + "step": 5272 + }, + { + "epoch": 2.106230031948882, + "grad_norm": 1.043882328934825, + "learning_rate": 2.46920767966703e-06, + "loss": 0.0347, + "step": 5274 + }, + { + "epoch": 2.1070287539936103, + "grad_norm": 1.0232740029306155, + "learning_rate": 2.4652007309320497e-06, + "loss": 0.0412, + "step": 5276 + }, + { + "epoch": 2.107827476038339, + "grad_norm": 0.884509400992361, + "learning_rate": 2.461195972024472e-06, + "loss": 0.0345, + "step": 5278 + }, + { + "epoch": 2.108626198083067, + "grad_norm": 0.9621954527543679, + "learning_rate": 2.4571934064040364e-06, + "loss": 0.0392, + "step": 5280 + }, + { + "epoch": 2.1094249201277955, + "grad_norm": 0.9325329293080564, + "learning_rate": 2.453193037528582e-06, + "loss": 0.0372, + "step": 5282 + }, + { + "epoch": 2.110223642172524, + "grad_norm": 0.77944405137395, + "learning_rate": 2.449194868854046e-06, + "loss": 0.0326, + "step": 5284 + }, + { + "epoch": 2.1110223642172525, + "grad_norm": 0.8761868440249553, + "learning_rate": 2.4451989038344713e-06, + "loss": 0.0341, + "step": 5286 + }, + { + "epoch": 2.1118210862619806, + "grad_norm": 0.8178367213212951, + "learning_rate": 2.4412051459219945e-06, + "loss": 0.0306, + "step": 5288 + }, + { + "epoch": 2.112619808306709, + "grad_norm": 0.8640047451958446, + "learning_rate": 2.4372135985668473e-06, + "loss": 0.0332, + "step": 5290 + }, + { + "epoch": 2.1134185303514377, + "grad_norm": 1.0879914849221164, + "learning_rate": 2.433224265217346e-06, + "loss": 0.0374, + "step": 5292 + }, + { + "epoch": 2.114217252396166, + "grad_norm": 0.9388437125454063, + "learning_rate": 2.4292371493199e-06, + "loss": 0.039, + "step": 5294 + }, + { + "epoch": 2.1150159744408947, + "grad_norm": 0.9016588347052378, + "learning_rate": 2.425252254319002e-06, + "loss": 0.0298, + "step": 5296 + }, + { + "epoch": 2.115814696485623, + "grad_norm": 0.9448880094558859, + "learning_rate": 2.4212695836572255e-06, + "loss": 0.0348, + "step": 5298 + }, + { + "epoch": 2.1166134185303513, + "grad_norm": 1.058137286496039, + "learning_rate": 2.4172891407752225e-06, + "loss": 0.0407, + "step": 5300 + }, + { + "epoch": 2.11741214057508, + "grad_norm": 1.0908676673345152, + "learning_rate": 2.4133109291117156e-06, + "loss": 0.0337, + "step": 5302 + }, + { + "epoch": 2.1182108626198084, + "grad_norm": 1.0100591708294226, + "learning_rate": 2.4093349521035105e-06, + "loss": 0.0357, + "step": 5304 + }, + { + "epoch": 2.119009584664537, + "grad_norm": 0.9113240938208252, + "learning_rate": 2.405361213185475e-06, + "loss": 0.0395, + "step": 5306 + }, + { + "epoch": 2.119808306709265, + "grad_norm": 1.0238016688467406, + "learning_rate": 2.4013897157905414e-06, + "loss": 0.0414, + "step": 5308 + }, + { + "epoch": 2.1206070287539935, + "grad_norm": 0.9249144986993447, + "learning_rate": 2.39742046334971e-06, + "loss": 0.038, + "step": 5310 + }, + { + "epoch": 2.121405750798722, + "grad_norm": 0.8853057009535151, + "learning_rate": 2.3934534592920416e-06, + "loss": 0.0381, + "step": 5312 + }, + { + "epoch": 2.1222044728434506, + "grad_norm": 0.9382408504614896, + "learning_rate": 2.3894887070446526e-06, + "loss": 0.0327, + "step": 5314 + }, + { + "epoch": 2.123003194888179, + "grad_norm": 0.9682097923317222, + "learning_rate": 2.385526210032717e-06, + "loss": 0.0345, + "step": 5316 + }, + { + "epoch": 2.123801916932907, + "grad_norm": 1.007959850484497, + "learning_rate": 2.3815659716794544e-06, + "loss": 0.0362, + "step": 5318 + }, + { + "epoch": 2.1246006389776357, + "grad_norm": 0.87498608229541, + "learning_rate": 2.3776079954061385e-06, + "loss": 0.0314, + "step": 5320 + }, + { + "epoch": 2.1253993610223643, + "grad_norm": 0.8230237293988656, + "learning_rate": 2.3736522846320894e-06, + "loss": 0.0329, + "step": 5322 + }, + { + "epoch": 2.126198083067093, + "grad_norm": 0.893770656802708, + "learning_rate": 2.369698842774667e-06, + "loss": 0.0379, + "step": 5324 + }, + { + "epoch": 2.126996805111821, + "grad_norm": 0.9516447751453979, + "learning_rate": 2.365747673249268e-06, + "loss": 0.0398, + "step": 5326 + }, + { + "epoch": 2.1277955271565494, + "grad_norm": 0.8303186005151683, + "learning_rate": 2.3617987794693358e-06, + "loss": 0.0291, + "step": 5328 + }, + { + "epoch": 2.128594249201278, + "grad_norm": 0.9512354460837069, + "learning_rate": 2.3578521648463414e-06, + "loss": 0.0326, + "step": 5330 + }, + { + "epoch": 2.1293929712460065, + "grad_norm": 0.9842083956952964, + "learning_rate": 2.3539078327897846e-06, + "loss": 0.036, + "step": 5332 + }, + { + "epoch": 2.130191693290735, + "grad_norm": 0.8378931235387028, + "learning_rate": 2.3499657867071973e-06, + "loss": 0.0289, + "step": 5334 + }, + { + "epoch": 2.130990415335463, + "grad_norm": 0.9684963805154867, + "learning_rate": 2.3460260300041355e-06, + "loss": 0.0319, + "step": 5336 + }, + { + "epoch": 2.1317891373801916, + "grad_norm": 0.9200136986577663, + "learning_rate": 2.342088566084177e-06, + "loss": 0.0345, + "step": 5338 + }, + { + "epoch": 2.13258785942492, + "grad_norm": 0.9761484143353245, + "learning_rate": 2.3381533983489213e-06, + "loss": 0.0296, + "step": 5340 + }, + { + "epoch": 2.1333865814696487, + "grad_norm": 0.8882033903773601, + "learning_rate": 2.334220530197979e-06, + "loss": 0.0305, + "step": 5342 + }, + { + "epoch": 2.134185303514377, + "grad_norm": 0.8114312897596324, + "learning_rate": 2.3302899650289773e-06, + "loss": 0.0317, + "step": 5344 + }, + { + "epoch": 2.1349840255591053, + "grad_norm": 0.9453542646442153, + "learning_rate": 2.3263617062375556e-06, + "loss": 0.0381, + "step": 5346 + }, + { + "epoch": 2.135782747603834, + "grad_norm": 1.0338865546370732, + "learning_rate": 2.322435757217357e-06, + "loss": 0.0361, + "step": 5348 + }, + { + "epoch": 2.1365814696485623, + "grad_norm": 0.9901385988788762, + "learning_rate": 2.3185121213600328e-06, + "loss": 0.0391, + "step": 5350 + }, + { + "epoch": 2.137380191693291, + "grad_norm": 0.9091912381004705, + "learning_rate": 2.314590802055232e-06, + "loss": 0.0316, + "step": 5352 + }, + { + "epoch": 2.1381789137380194, + "grad_norm": 0.9808804213459485, + "learning_rate": 2.3106718026906073e-06, + "loss": 0.0383, + "step": 5354 + }, + { + "epoch": 2.1389776357827475, + "grad_norm": 0.8455611288247308, + "learning_rate": 2.306755126651804e-06, + "loss": 0.0345, + "step": 5356 + }, + { + "epoch": 2.139776357827476, + "grad_norm": 0.8955449128754519, + "learning_rate": 2.3028407773224576e-06, + "loss": 0.0349, + "step": 5358 + }, + { + "epoch": 2.1405750798722045, + "grad_norm": 0.9436017969146995, + "learning_rate": 2.2989287580841985e-06, + "loss": 0.0399, + "step": 5360 + }, + { + "epoch": 2.141373801916933, + "grad_norm": 0.980164453106363, + "learning_rate": 2.2950190723166427e-06, + "loss": 0.0412, + "step": 5362 + }, + { + "epoch": 2.142172523961661, + "grad_norm": 0.8957004905788968, + "learning_rate": 2.291111723397391e-06, + "loss": 0.0327, + "step": 5364 + }, + { + "epoch": 2.1429712460063897, + "grad_norm": 0.8853186556177322, + "learning_rate": 2.2872067147020204e-06, + "loss": 0.0368, + "step": 5366 + }, + { + "epoch": 2.143769968051118, + "grad_norm": 0.9295930702988132, + "learning_rate": 2.2833040496040925e-06, + "loss": 0.0375, + "step": 5368 + }, + { + "epoch": 2.1445686900958467, + "grad_norm": 0.8269626905882466, + "learning_rate": 2.2794037314751412e-06, + "loss": 0.0377, + "step": 5370 + }, + { + "epoch": 2.1453674121405752, + "grad_norm": 0.9022026161705567, + "learning_rate": 2.275505763684674e-06, + "loss": 0.0377, + "step": 5372 + }, + { + "epoch": 2.1461661341853033, + "grad_norm": 0.867448073493281, + "learning_rate": 2.2716101496001663e-06, + "loss": 0.0385, + "step": 5374 + }, + { + "epoch": 2.146964856230032, + "grad_norm": 0.8312737054421381, + "learning_rate": 2.267716892587062e-06, + "loss": 0.0307, + "step": 5376 + }, + { + "epoch": 2.1477635782747604, + "grad_norm": 0.881602657629581, + "learning_rate": 2.2638259960087665e-06, + "loss": 0.0307, + "step": 5378 + }, + { + "epoch": 2.148562300319489, + "grad_norm": 0.9967389956242162, + "learning_rate": 2.2599374632266514e-06, + "loss": 0.0356, + "step": 5380 + }, + { + "epoch": 2.1493610223642174, + "grad_norm": 0.8461757104444423, + "learning_rate": 2.2560512976000366e-06, + "loss": 0.0347, + "step": 5382 + }, + { + "epoch": 2.1501597444089455, + "grad_norm": 1.0004876530323552, + "learning_rate": 2.252167502486205e-06, + "loss": 0.0353, + "step": 5384 + }, + { + "epoch": 2.150958466453674, + "grad_norm": 0.9652946295733036, + "learning_rate": 2.2482860812403887e-06, + "loss": 0.0322, + "step": 5386 + }, + { + "epoch": 2.1517571884984026, + "grad_norm": 0.8292517773055327, + "learning_rate": 2.2444070372157724e-06, + "loss": 0.0339, + "step": 5388 + }, + { + "epoch": 2.152555910543131, + "grad_norm": 0.8955686140212332, + "learning_rate": 2.2405303737634794e-06, + "loss": 0.0339, + "step": 5390 + }, + { + "epoch": 2.1533546325878596, + "grad_norm": 0.8707856960045441, + "learning_rate": 2.2366560942325833e-06, + "loss": 0.0378, + "step": 5392 + }, + { + "epoch": 2.1541533546325877, + "grad_norm": 0.9343304509307997, + "learning_rate": 2.232784201970094e-06, + "loss": 0.0329, + "step": 5394 + }, + { + "epoch": 2.1549520766773163, + "grad_norm": 0.8050623246924522, + "learning_rate": 2.228914700320967e-06, + "loss": 0.0316, + "step": 5396 + }, + { + "epoch": 2.155750798722045, + "grad_norm": 0.9979211632823265, + "learning_rate": 2.2250475926280814e-06, + "loss": 0.039, + "step": 5398 + }, + { + "epoch": 2.1565495207667733, + "grad_norm": 1.1270330722228508, + "learning_rate": 2.2211828822322547e-06, + "loss": 0.0391, + "step": 5400 + }, + { + "epoch": 2.1573482428115014, + "grad_norm": 1.1140624044951437, + "learning_rate": 2.217320572472232e-06, + "loss": 0.0357, + "step": 5402 + }, + { + "epoch": 2.15814696485623, + "grad_norm": 1.0128344684936457, + "learning_rate": 2.2134606666846863e-06, + "loss": 0.0402, + "step": 5404 + }, + { + "epoch": 2.1589456869009584, + "grad_norm": 0.8869317328324887, + "learning_rate": 2.209603168204209e-06, + "loss": 0.0301, + "step": 5406 + }, + { + "epoch": 2.159744408945687, + "grad_norm": 0.8719598409358753, + "learning_rate": 2.205748080363316e-06, + "loss": 0.0327, + "step": 5408 + }, + { + "epoch": 2.1605431309904155, + "grad_norm": 0.8425272637471012, + "learning_rate": 2.2018954064924392e-06, + "loss": 0.0317, + "step": 5410 + }, + { + "epoch": 2.1613418530351436, + "grad_norm": 0.9849794944821841, + "learning_rate": 2.1980451499199262e-06, + "loss": 0.0352, + "step": 5412 + }, + { + "epoch": 2.162140575079872, + "grad_norm": 1.0062068145536687, + "learning_rate": 2.1941973139720368e-06, + "loss": 0.034, + "step": 5414 + }, + { + "epoch": 2.1629392971246006, + "grad_norm": 1.025042092696479, + "learning_rate": 2.190351901972935e-06, + "loss": 0.0364, + "step": 5416 + }, + { + "epoch": 2.163738019169329, + "grad_norm": 0.9038868061338211, + "learning_rate": 2.1865089172446928e-06, + "loss": 0.0342, + "step": 5418 + }, + { + "epoch": 2.1645367412140577, + "grad_norm": 0.9811870087519612, + "learning_rate": 2.1826683631072932e-06, + "loss": 0.0354, + "step": 5420 + }, + { + "epoch": 2.165335463258786, + "grad_norm": 0.9159807996481562, + "learning_rate": 2.1788302428786057e-06, + "loss": 0.0343, + "step": 5422 + }, + { + "epoch": 2.1661341853035143, + "grad_norm": 1.065699305014696, + "learning_rate": 2.1749945598744076e-06, + "loss": 0.0384, + "step": 5424 + }, + { + "epoch": 2.166932907348243, + "grad_norm": 0.9974570265355018, + "learning_rate": 2.171161317408366e-06, + "loss": 0.0333, + "step": 5426 + }, + { + "epoch": 2.1677316293929714, + "grad_norm": 1.0782436820025358, + "learning_rate": 2.1673305187920422e-06, + "loss": 0.036, + "step": 5428 + }, + { + "epoch": 2.1685303514377, + "grad_norm": 1.0684005095258051, + "learning_rate": 2.163502167334882e-06, + "loss": 0.0406, + "step": 5430 + }, + { + "epoch": 2.169329073482428, + "grad_norm": 0.933155992724429, + "learning_rate": 2.159676266344222e-06, + "loss": 0.0385, + "step": 5432 + }, + { + "epoch": 2.1701277955271565, + "grad_norm": 0.9059016726528943, + "learning_rate": 2.155852819125278e-06, + "loss": 0.0304, + "step": 5434 + }, + { + "epoch": 2.170926517571885, + "grad_norm": 0.8433132037155296, + "learning_rate": 2.1520318289811493e-06, + "loss": 0.0311, + "step": 5436 + }, + { + "epoch": 2.1717252396166136, + "grad_norm": 0.8527052161750599, + "learning_rate": 2.1482132992128125e-06, + "loss": 0.0325, + "step": 5438 + }, + { + "epoch": 2.1725239616613417, + "grad_norm": 0.9111474137493859, + "learning_rate": 2.144397233119112e-06, + "loss": 0.0326, + "step": 5440 + }, + { + "epoch": 2.17332268370607, + "grad_norm": 0.8278663259091216, + "learning_rate": 2.1405836339967707e-06, + "loss": 0.0316, + "step": 5442 + }, + { + "epoch": 2.1741214057507987, + "grad_norm": 0.8126851634140201, + "learning_rate": 2.136772505140382e-06, + "loss": 0.0297, + "step": 5444 + }, + { + "epoch": 2.1749201277955272, + "grad_norm": 0.8533464184624501, + "learning_rate": 2.1329638498423978e-06, + "loss": 0.0311, + "step": 5446 + }, + { + "epoch": 2.1757188498402558, + "grad_norm": 0.853939722117267, + "learning_rate": 2.1291576713931382e-06, + "loss": 0.0367, + "step": 5448 + }, + { + "epoch": 2.176517571884984, + "grad_norm": 1.1560454061905685, + "learning_rate": 2.125353973080782e-06, + "loss": 0.0383, + "step": 5450 + }, + { + "epoch": 2.1773162939297124, + "grad_norm": 1.013070043627522, + "learning_rate": 2.121552758191366e-06, + "loss": 0.0323, + "step": 5452 + }, + { + "epoch": 2.178115015974441, + "grad_norm": 1.011960788493202, + "learning_rate": 2.117754030008783e-06, + "loss": 0.0364, + "step": 5454 + }, + { + "epoch": 2.1789137380191694, + "grad_norm": 0.8655045405461433, + "learning_rate": 2.1139577918147715e-06, + "loss": 0.0333, + "step": 5456 + }, + { + "epoch": 2.179712460063898, + "grad_norm": 0.8272557585170817, + "learning_rate": 2.1101640468889255e-06, + "loss": 0.0354, + "step": 5458 + }, + { + "epoch": 2.180511182108626, + "grad_norm": 1.0574986102956363, + "learning_rate": 2.1063727985086827e-06, + "loss": 0.0333, + "step": 5460 + }, + { + "epoch": 2.1813099041533546, + "grad_norm": 1.0764551576276693, + "learning_rate": 2.102584049949326e-06, + "loss": 0.0307, + "step": 5462 + }, + { + "epoch": 2.182108626198083, + "grad_norm": 0.9248131636959693, + "learning_rate": 2.0987978044839707e-06, + "loss": 0.0331, + "step": 5464 + }, + { + "epoch": 2.1829073482428116, + "grad_norm": 0.9555477735825861, + "learning_rate": 2.0950140653835814e-06, + "loss": 0.032, + "step": 5466 + }, + { + "epoch": 2.18370607028754, + "grad_norm": 1.0306732674470207, + "learning_rate": 2.0912328359169498e-06, + "loss": 0.0356, + "step": 5468 + }, + { + "epoch": 2.1845047923322682, + "grad_norm": 1.0189431478737734, + "learning_rate": 2.087454119350703e-06, + "loss": 0.0317, + "step": 5470 + }, + { + "epoch": 2.1853035143769968, + "grad_norm": 0.8526696540231634, + "learning_rate": 2.0836779189492925e-06, + "loss": 0.034, + "step": 5472 + }, + { + "epoch": 2.1861022364217253, + "grad_norm": 0.8741979901728469, + "learning_rate": 2.079904237975e-06, + "loss": 0.0313, + "step": 5474 + }, + { + "epoch": 2.186900958466454, + "grad_norm": 0.9921256355991658, + "learning_rate": 2.0761330796879307e-06, + "loss": 0.0356, + "step": 5476 + }, + { + "epoch": 2.187699680511182, + "grad_norm": 1.0651481599446444, + "learning_rate": 2.0723644473460114e-06, + "loss": 0.0376, + "step": 5478 + }, + { + "epoch": 2.1884984025559104, + "grad_norm": 0.9968183069089577, + "learning_rate": 2.068598344204981e-06, + "loss": 0.0317, + "step": 5480 + }, + { + "epoch": 2.189297124600639, + "grad_norm": 1.0274581864731451, + "learning_rate": 2.064834773518399e-06, + "loss": 0.0384, + "step": 5482 + }, + { + "epoch": 2.1900958466453675, + "grad_norm": 0.7933724201731249, + "learning_rate": 2.061073738537635e-06, + "loss": 0.0335, + "step": 5484 + }, + { + "epoch": 2.190894568690096, + "grad_norm": 0.8535934658038107, + "learning_rate": 2.0573152425118703e-06, + "loss": 0.0366, + "step": 5486 + }, + { + "epoch": 2.191693290734824, + "grad_norm": 0.8643572127104457, + "learning_rate": 2.053559288688086e-06, + "loss": 0.0308, + "step": 5488 + }, + { + "epoch": 2.1924920127795526, + "grad_norm": 0.9310105796168394, + "learning_rate": 2.0498058803110775e-06, + "loss": 0.0327, + "step": 5490 + }, + { + "epoch": 2.193290734824281, + "grad_norm": 0.9425151428347762, + "learning_rate": 2.0460550206234324e-06, + "loss": 0.0367, + "step": 5492 + }, + { + "epoch": 2.1940894568690097, + "grad_norm": 0.8903710371632394, + "learning_rate": 2.042306712865543e-06, + "loss": 0.0332, + "step": 5494 + }, + { + "epoch": 2.194888178913738, + "grad_norm": 0.9619998941867759, + "learning_rate": 2.0385609602755878e-06, + "loss": 0.0342, + "step": 5496 + }, + { + "epoch": 2.1956869009584663, + "grad_norm": 0.8331065075000925, + "learning_rate": 2.0348177660895473e-06, + "loss": 0.0268, + "step": 5498 + }, + { + "epoch": 2.196485623003195, + "grad_norm": 0.7812360489536925, + "learning_rate": 2.031077133541188e-06, + "loss": 0.0261, + "step": 5500 + }, + { + "epoch": 2.196485623003195, + "eval_loss": 0.17987604439258575, + "eval_runtime": 417.2456, + "eval_samples_per_second": 42.678, + "eval_steps_per_second": 5.335, + "step": 5500 + }, + { + "epoch": 2.1972843450479234, + "grad_norm": 1.0053912019916695, + "learning_rate": 2.027339065862064e-06, + "loss": 0.0331, + "step": 5502 + }, + { + "epoch": 2.198083067092652, + "grad_norm": 0.9961342952209272, + "learning_rate": 2.02360356628151e-06, + "loss": 0.0321, + "step": 5504 + }, + { + "epoch": 2.1988817891373804, + "grad_norm": 1.1371974183191713, + "learning_rate": 2.019870638026648e-06, + "loss": 0.0357, + "step": 5506 + }, + { + "epoch": 2.1996805111821085, + "grad_norm": 1.0401230360921, + "learning_rate": 2.016140284322375e-06, + "loss": 0.0346, + "step": 5508 + }, + { + "epoch": 2.200479233226837, + "grad_norm": 0.8835551794696535, + "learning_rate": 2.0124125083913636e-06, + "loss": 0.0334, + "step": 5510 + }, + { + "epoch": 2.2012779552715656, + "grad_norm": 0.8819429072389985, + "learning_rate": 2.0086873134540626e-06, + "loss": 0.0336, + "step": 5512 + }, + { + "epoch": 2.202076677316294, + "grad_norm": 0.87779251719051, + "learning_rate": 2.004964702728688e-06, + "loss": 0.0328, + "step": 5514 + }, + { + "epoch": 2.202875399361022, + "grad_norm": 0.8894119758104073, + "learning_rate": 2.0012446794312236e-06, + "loss": 0.0284, + "step": 5516 + }, + { + "epoch": 2.2036741214057507, + "grad_norm": 1.1793198313761428, + "learning_rate": 1.997527246775421e-06, + "loss": 0.0375, + "step": 5518 + }, + { + "epoch": 2.2044728434504792, + "grad_norm": 0.9932553166480073, + "learning_rate": 1.9938124079727874e-06, + "loss": 0.0345, + "step": 5520 + }, + { + "epoch": 2.2052715654952078, + "grad_norm": 1.0489913796282613, + "learning_rate": 1.9901001662325946e-06, + "loss": 0.0339, + "step": 5522 + }, + { + "epoch": 2.2060702875399363, + "grad_norm": 0.9036884868208198, + "learning_rate": 1.9863905247618702e-06, + "loss": 0.0318, + "step": 5524 + }, + { + "epoch": 2.2068690095846644, + "grad_norm": 1.0039658882019877, + "learning_rate": 1.9826834867653956e-06, + "loss": 0.0378, + "step": 5526 + }, + { + "epoch": 2.207667731629393, + "grad_norm": 0.9584370299885461, + "learning_rate": 1.9789790554456977e-06, + "loss": 0.0351, + "step": 5528 + }, + { + "epoch": 2.2084664536741214, + "grad_norm": 1.0693519590795688, + "learning_rate": 1.9752772340030584e-06, + "loss": 0.033, + "step": 5530 + }, + { + "epoch": 2.20926517571885, + "grad_norm": 0.9311182383118676, + "learning_rate": 1.9715780256355014e-06, + "loss": 0.0315, + "step": 5532 + }, + { + "epoch": 2.2100638977635785, + "grad_norm": 0.886315430031371, + "learning_rate": 1.967881433538795e-06, + "loss": 0.0321, + "step": 5534 + }, + { + "epoch": 2.2108626198083066, + "grad_norm": 0.9667837860549522, + "learning_rate": 1.9641874609064443e-06, + "loss": 0.0298, + "step": 5536 + }, + { + "epoch": 2.211661341853035, + "grad_norm": 0.9696824443915598, + "learning_rate": 1.960496110929694e-06, + "loss": 0.0348, + "step": 5538 + }, + { + "epoch": 2.2124600638977636, + "grad_norm": 0.9405160870011481, + "learning_rate": 1.9568073867975217e-06, + "loss": 0.0307, + "step": 5540 + }, + { + "epoch": 2.213258785942492, + "grad_norm": 0.9078061671374033, + "learning_rate": 1.9531212916966395e-06, + "loss": 0.0337, + "step": 5542 + }, + { + "epoch": 2.2140575079872207, + "grad_norm": 0.8927963016211171, + "learning_rate": 1.9494378288114816e-06, + "loss": 0.0318, + "step": 5544 + }, + { + "epoch": 2.2148562300319488, + "grad_norm": 0.8652473614478016, + "learning_rate": 1.945757001324215e-06, + "loss": 0.029, + "step": 5546 + }, + { + "epoch": 2.2156549520766773, + "grad_norm": 1.0041756596183897, + "learning_rate": 1.9420788124147266e-06, + "loss": 0.0353, + "step": 5548 + }, + { + "epoch": 2.216453674121406, + "grad_norm": 0.8814650902754025, + "learning_rate": 1.938403265260625e-06, + "loss": 0.0325, + "step": 5550 + }, + { + "epoch": 2.2172523961661343, + "grad_norm": 0.9335249441231224, + "learning_rate": 1.9347303630372373e-06, + "loss": 0.0349, + "step": 5552 + }, + { + "epoch": 2.2180511182108624, + "grad_norm": 0.889201823881265, + "learning_rate": 1.931060108917601e-06, + "loss": 0.0344, + "step": 5554 + }, + { + "epoch": 2.218849840255591, + "grad_norm": 0.8902317631970373, + "learning_rate": 1.92739250607247e-06, + "loss": 0.0342, + "step": 5556 + }, + { + "epoch": 2.2196485623003195, + "grad_norm": 0.8614037453548934, + "learning_rate": 1.9237275576703125e-06, + "loss": 0.0346, + "step": 5558 + }, + { + "epoch": 2.220447284345048, + "grad_norm": 1.121859600487758, + "learning_rate": 1.9200652668772924e-06, + "loss": 0.0357, + "step": 5560 + }, + { + "epoch": 2.2212460063897765, + "grad_norm": 0.9204383985383575, + "learning_rate": 1.9164056368572847e-06, + "loss": 0.0327, + "step": 5562 + }, + { + "epoch": 2.2220447284345046, + "grad_norm": 1.069244881184746, + "learning_rate": 1.912748670771865e-06, + "loss": 0.035, + "step": 5564 + }, + { + "epoch": 2.222843450479233, + "grad_norm": 0.9412541774316218, + "learning_rate": 1.909094371780309e-06, + "loss": 0.034, + "step": 5566 + }, + { + "epoch": 2.2236421725239617, + "grad_norm": 1.1270763500131213, + "learning_rate": 1.9054427430395828e-06, + "loss": 0.0376, + "step": 5568 + }, + { + "epoch": 2.22444089456869, + "grad_norm": 1.0152263236453896, + "learning_rate": 1.9017937877043496e-06, + "loss": 0.035, + "step": 5570 + }, + { + "epoch": 2.2252396166134187, + "grad_norm": 1.0576930750635203, + "learning_rate": 1.8981475089269641e-06, + "loss": 0.0348, + "step": 5572 + }, + { + "epoch": 2.226038338658147, + "grad_norm": 0.9331195909365617, + "learning_rate": 1.8945039098574658e-06, + "loss": 0.0322, + "step": 5574 + }, + { + "epoch": 2.2268370607028753, + "grad_norm": 0.9101266572569631, + "learning_rate": 1.890862993643583e-06, + "loss": 0.0367, + "step": 5576 + }, + { + "epoch": 2.227635782747604, + "grad_norm": 0.958445615782898, + "learning_rate": 1.8872247634307205e-06, + "loss": 0.0357, + "step": 5578 + }, + { + "epoch": 2.2284345047923324, + "grad_norm": 0.9925711413975752, + "learning_rate": 1.883589222361965e-06, + "loss": 0.0369, + "step": 5580 + }, + { + "epoch": 2.229233226837061, + "grad_norm": 1.0786281449106017, + "learning_rate": 1.8799563735780873e-06, + "loss": 0.0329, + "step": 5582 + }, + { + "epoch": 2.230031948881789, + "grad_norm": 0.924349636618338, + "learning_rate": 1.8763262202175204e-06, + "loss": 0.031, + "step": 5584 + }, + { + "epoch": 2.2308306709265175, + "grad_norm": 0.8786179873798488, + "learning_rate": 1.8726987654163753e-06, + "loss": 0.0284, + "step": 5586 + }, + { + "epoch": 2.231629392971246, + "grad_norm": 0.8699008504990049, + "learning_rate": 1.8690740123084316e-06, + "loss": 0.0329, + "step": 5588 + }, + { + "epoch": 2.2324281150159746, + "grad_norm": 0.9317629329233429, + "learning_rate": 1.8654519640251334e-06, + "loss": 0.0334, + "step": 5590 + }, + { + "epoch": 2.2332268370607027, + "grad_norm": 0.858865932937765, + "learning_rate": 1.8618326236955908e-06, + "loss": 0.032, + "step": 5592 + }, + { + "epoch": 2.234025559105431, + "grad_norm": 0.9990448382210804, + "learning_rate": 1.858215994446569e-06, + "loss": 0.0358, + "step": 5594 + }, + { + "epoch": 2.2348242811501597, + "grad_norm": 0.9366419998033949, + "learning_rate": 1.8546020794024955e-06, + "loss": 0.032, + "step": 5596 + }, + { + "epoch": 2.2356230031948883, + "grad_norm": 0.9425282547585793, + "learning_rate": 1.8509908816854527e-06, + "loss": 0.0348, + "step": 5598 + }, + { + "epoch": 2.236421725239617, + "grad_norm": 0.9345193697622752, + "learning_rate": 1.8473824044151762e-06, + "loss": 0.0348, + "step": 5600 + }, + { + "epoch": 2.237220447284345, + "grad_norm": 0.9048364378406388, + "learning_rate": 1.843776650709046e-06, + "loss": 0.0292, + "step": 5602 + }, + { + "epoch": 2.2380191693290734, + "grad_norm": 1.0024652573185826, + "learning_rate": 1.8401736236820933e-06, + "loss": 0.0287, + "step": 5604 + }, + { + "epoch": 2.238817891373802, + "grad_norm": 1.141968574912131, + "learning_rate": 1.836573326446997e-06, + "loss": 0.0359, + "step": 5606 + }, + { + "epoch": 2.2396166134185305, + "grad_norm": 1.00331976516427, + "learning_rate": 1.8329757621140748e-06, + "loss": 0.0341, + "step": 5608 + }, + { + "epoch": 2.2404153354632586, + "grad_norm": 0.9747936880200038, + "learning_rate": 1.8293809337912789e-06, + "loss": 0.0379, + "step": 5610 + }, + { + "epoch": 2.241214057507987, + "grad_norm": 0.9903639773982147, + "learning_rate": 1.8257888445842026e-06, + "loss": 0.0335, + "step": 5612 + }, + { + "epoch": 2.2420127795527156, + "grad_norm": 0.9391122747126337, + "learning_rate": 1.8221994975960739e-06, + "loss": 0.028, + "step": 5614 + }, + { + "epoch": 2.242811501597444, + "grad_norm": 0.9322142611011831, + "learning_rate": 1.81861289592775e-06, + "loss": 0.0332, + "step": 5616 + }, + { + "epoch": 2.2436102236421727, + "grad_norm": 0.926413396097988, + "learning_rate": 1.815029042677714e-06, + "loss": 0.0363, + "step": 5618 + }, + { + "epoch": 2.244408945686901, + "grad_norm": 0.9401248489200065, + "learning_rate": 1.8114479409420783e-06, + "loss": 0.0349, + "step": 5620 + }, + { + "epoch": 2.2452076677316293, + "grad_norm": 0.9208704620021309, + "learning_rate": 1.8078695938145768e-06, + "loss": 0.0315, + "step": 5622 + }, + { + "epoch": 2.246006389776358, + "grad_norm": 0.9916844141557866, + "learning_rate": 1.8042940043865658e-06, + "loss": 0.0314, + "step": 5624 + }, + { + "epoch": 2.2468051118210863, + "grad_norm": 1.0110707106239305, + "learning_rate": 1.8007211757470117e-06, + "loss": 0.0371, + "step": 5626 + }, + { + "epoch": 2.247603833865815, + "grad_norm": 1.1436090733823392, + "learning_rate": 1.7971511109825064e-06, + "loss": 0.0347, + "step": 5628 + }, + { + "epoch": 2.248402555910543, + "grad_norm": 0.9259112088416946, + "learning_rate": 1.7935838131772481e-06, + "loss": 0.0338, + "step": 5630 + }, + { + "epoch": 2.2492012779552715, + "grad_norm": 0.9880736506122585, + "learning_rate": 1.7900192854130465e-06, + "loss": 0.035, + "step": 5632 + }, + { + "epoch": 2.25, + "grad_norm": 0.8024555556579349, + "learning_rate": 1.786457530769314e-06, + "loss": 0.0328, + "step": 5634 + }, + { + "epoch": 2.2507987220447285, + "grad_norm": 0.8899527470220918, + "learning_rate": 1.7828985523230725e-06, + "loss": 0.0334, + "step": 5636 + }, + { + "epoch": 2.251597444089457, + "grad_norm": 0.9363903078372263, + "learning_rate": 1.779342353148943e-06, + "loss": 0.0348, + "step": 5638 + }, + { + "epoch": 2.252396166134185, + "grad_norm": 0.9060811105083338, + "learning_rate": 1.7757889363191484e-06, + "loss": 0.034, + "step": 5640 + }, + { + "epoch": 2.2531948881789137, + "grad_norm": 1.025351909976564, + "learning_rate": 1.7722383049035019e-06, + "loss": 0.0391, + "step": 5642 + }, + { + "epoch": 2.253993610223642, + "grad_norm": 1.0426242945439066, + "learning_rate": 1.7686904619694156e-06, + "loss": 0.0356, + "step": 5644 + }, + { + "epoch": 2.2547923322683707, + "grad_norm": 0.8919813927820728, + "learning_rate": 1.7651454105818915e-06, + "loss": 0.0306, + "step": 5646 + }, + { + "epoch": 2.255591054313099, + "grad_norm": 0.956484596890347, + "learning_rate": 1.7616031538035189e-06, + "loss": 0.0354, + "step": 5648 + }, + { + "epoch": 2.2563897763578273, + "grad_norm": 0.90756103740582, + "learning_rate": 1.758063694694474e-06, + "loss": 0.0336, + "step": 5650 + }, + { + "epoch": 2.257188498402556, + "grad_norm": 0.8099489111484716, + "learning_rate": 1.7545270363125155e-06, + "loss": 0.029, + "step": 5652 + }, + { + "epoch": 2.2579872204472844, + "grad_norm": 1.0364198349070215, + "learning_rate": 1.7509931817129821e-06, + "loss": 0.0343, + "step": 5654 + }, + { + "epoch": 2.258785942492013, + "grad_norm": 0.87676918992303, + "learning_rate": 1.7474621339487925e-06, + "loss": 0.032, + "step": 5656 + }, + { + "epoch": 2.2595846645367414, + "grad_norm": 0.968068180478562, + "learning_rate": 1.7439338960704355e-06, + "loss": 0.0318, + "step": 5658 + }, + { + "epoch": 2.2603833865814695, + "grad_norm": 0.9143786360310998, + "learning_rate": 1.7404084711259777e-06, + "loss": 0.0306, + "step": 5660 + }, + { + "epoch": 2.261182108626198, + "grad_norm": 0.983712060636582, + "learning_rate": 1.736885862161054e-06, + "loss": 0.0334, + "step": 5662 + }, + { + "epoch": 2.2619808306709266, + "grad_norm": 0.9626625864043248, + "learning_rate": 1.7333660722188667e-06, + "loss": 0.034, + "step": 5664 + }, + { + "epoch": 2.262779552715655, + "grad_norm": 0.9609884877774749, + "learning_rate": 1.7298491043401794e-06, + "loss": 0.0297, + "step": 5666 + }, + { + "epoch": 2.263578274760383, + "grad_norm": 0.9396750240874573, + "learning_rate": 1.7263349615633228e-06, + "loss": 0.0332, + "step": 5668 + }, + { + "epoch": 2.2643769968051117, + "grad_norm": 0.9290418992667302, + "learning_rate": 1.7228236469241837e-06, + "loss": 0.0298, + "step": 5670 + }, + { + "epoch": 2.2651757188498403, + "grad_norm": 0.9095375069866244, + "learning_rate": 1.7193151634562071e-06, + "loss": 0.0304, + "step": 5672 + }, + { + "epoch": 2.265974440894569, + "grad_norm": 1.010681342107459, + "learning_rate": 1.715809514190392e-06, + "loss": 0.0307, + "step": 5674 + }, + { + "epoch": 2.2667731629392973, + "grad_norm": 1.0154601770035847, + "learning_rate": 1.712306702155288e-06, + "loss": 0.0306, + "step": 5676 + }, + { + "epoch": 2.2675718849840254, + "grad_norm": 1.0108750470875814, + "learning_rate": 1.7088067303769946e-06, + "loss": 0.0325, + "step": 5678 + }, + { + "epoch": 2.268370607028754, + "grad_norm": 1.0636729615862845, + "learning_rate": 1.7053096018791588e-06, + "loss": 0.0409, + "step": 5680 + }, + { + "epoch": 2.2691693290734825, + "grad_norm": 0.9202813920561608, + "learning_rate": 1.7018153196829662e-06, + "loss": 0.0332, + "step": 5682 + }, + { + "epoch": 2.269968051118211, + "grad_norm": 1.214698931969398, + "learning_rate": 1.6983238868071489e-06, + "loss": 0.0338, + "step": 5684 + }, + { + "epoch": 2.270766773162939, + "grad_norm": 0.8519680135270461, + "learning_rate": 1.6948353062679752e-06, + "loss": 0.0309, + "step": 5686 + }, + { + "epoch": 2.2715654952076676, + "grad_norm": 0.8609963780131156, + "learning_rate": 1.691349581079249e-06, + "loss": 0.0316, + "step": 5688 + }, + { + "epoch": 2.272364217252396, + "grad_norm": 1.03764079744846, + "learning_rate": 1.687866714252311e-06, + "loss": 0.035, + "step": 5690 + }, + { + "epoch": 2.2731629392971247, + "grad_norm": 0.9903945590247691, + "learning_rate": 1.6843867087960252e-06, + "loss": 0.0338, + "step": 5692 + }, + { + "epoch": 2.273961661341853, + "grad_norm": 0.9835600544969014, + "learning_rate": 1.6809095677167897e-06, + "loss": 0.033, + "step": 5694 + }, + { + "epoch": 2.2747603833865817, + "grad_norm": 0.9107299972526909, + "learning_rate": 1.6774352940185269e-06, + "loss": 0.0329, + "step": 5696 + }, + { + "epoch": 2.27555910543131, + "grad_norm": 1.0179931131158924, + "learning_rate": 1.6739638907026806e-06, + "loss": 0.0324, + "step": 5698 + }, + { + "epoch": 2.2763578274760383, + "grad_norm": 0.8564341030978901, + "learning_rate": 1.6704953607682161e-06, + "loss": 0.0371, + "step": 5700 + }, + { + "epoch": 2.277156549520767, + "grad_norm": 0.9606560664042086, + "learning_rate": 1.6670297072116165e-06, + "loss": 0.0357, + "step": 5702 + }, + { + "epoch": 2.2779552715654954, + "grad_norm": 0.970530456375879, + "learning_rate": 1.663566933026879e-06, + "loss": 0.0368, + "step": 5704 + }, + { + "epoch": 2.2787539936102235, + "grad_norm": 0.823503415647808, + "learning_rate": 1.6601070412055154e-06, + "loss": 0.0308, + "step": 5706 + }, + { + "epoch": 2.279552715654952, + "grad_norm": 0.8849782867422896, + "learning_rate": 1.6566500347365421e-06, + "loss": 0.0339, + "step": 5708 + }, + { + "epoch": 2.2803514376996805, + "grad_norm": 0.9443290922040906, + "learning_rate": 1.6531959166064893e-06, + "loss": 0.0382, + "step": 5710 + }, + { + "epoch": 2.281150159744409, + "grad_norm": 0.9814437843516525, + "learning_rate": 1.6497446897993885e-06, + "loss": 0.0314, + "step": 5712 + }, + { + "epoch": 2.2819488817891376, + "grad_norm": 0.9443631962649643, + "learning_rate": 1.6462963572967756e-06, + "loss": 0.0329, + "step": 5714 + }, + { + "epoch": 2.2827476038338657, + "grad_norm": 0.9206008196409816, + "learning_rate": 1.6428509220776812e-06, + "loss": 0.0327, + "step": 5716 + }, + { + "epoch": 2.283546325878594, + "grad_norm": 0.9303152959167551, + "learning_rate": 1.6394083871186362e-06, + "loss": 0.0326, + "step": 5718 + }, + { + "epoch": 2.2843450479233227, + "grad_norm": 0.9131910544498667, + "learning_rate": 1.6359687553936714e-06, + "loss": 0.0326, + "step": 5720 + }, + { + "epoch": 2.2851437699680512, + "grad_norm": 1.0967870027836903, + "learning_rate": 1.6325320298742986e-06, + "loss": 0.0395, + "step": 5722 + }, + { + "epoch": 2.2859424920127793, + "grad_norm": 0.8427528083071858, + "learning_rate": 1.6290982135295269e-06, + "loss": 0.0331, + "step": 5724 + }, + { + "epoch": 2.286741214057508, + "grad_norm": 0.8644733263443983, + "learning_rate": 1.6256673093258485e-06, + "loss": 0.0345, + "step": 5726 + }, + { + "epoch": 2.2875399361022364, + "grad_norm": 0.9392972386925895, + "learning_rate": 1.6222393202272414e-06, + "loss": 0.0304, + "step": 5728 + }, + { + "epoch": 2.288338658146965, + "grad_norm": 0.9017990006181082, + "learning_rate": 1.618814249195167e-06, + "loss": 0.0314, + "step": 5730 + }, + { + "epoch": 2.2891373801916934, + "grad_norm": 0.7993416720434479, + "learning_rate": 1.6153920991885591e-06, + "loss": 0.0286, + "step": 5732 + }, + { + "epoch": 2.289936102236422, + "grad_norm": 0.9176139011413571, + "learning_rate": 1.6119728731638345e-06, + "loss": 0.0369, + "step": 5734 + }, + { + "epoch": 2.29073482428115, + "grad_norm": 0.874001701788666, + "learning_rate": 1.6085565740748825e-06, + "loss": 0.033, + "step": 5736 + }, + { + "epoch": 2.2915335463258786, + "grad_norm": 0.9359227443696032, + "learning_rate": 1.605143204873064e-06, + "loss": 0.0377, + "step": 5738 + }, + { + "epoch": 2.292332268370607, + "grad_norm": 0.8690038620673686, + "learning_rate": 1.6017327685072047e-06, + "loss": 0.031, + "step": 5740 + }, + { + "epoch": 2.2931309904153356, + "grad_norm": 1.055903086127581, + "learning_rate": 1.5983252679236006e-06, + "loss": 0.0355, + "step": 5742 + }, + { + "epoch": 2.2939297124600637, + "grad_norm": 1.1058038934128054, + "learning_rate": 1.5949207060660138e-06, + "loss": 0.0328, + "step": 5744 + }, + { + "epoch": 2.2947284345047922, + "grad_norm": 0.9784683752210179, + "learning_rate": 1.5915190858756635e-06, + "loss": 0.0334, + "step": 5746 + }, + { + "epoch": 2.2955271565495208, + "grad_norm": 0.9248930707330709, + "learning_rate": 1.5881204102912262e-06, + "loss": 0.0334, + "step": 5748 + }, + { + "epoch": 2.2963258785942493, + "grad_norm": 1.028126750218338, + "learning_rate": 1.5847246822488388e-06, + "loss": 0.0344, + "step": 5750 + }, + { + "epoch": 2.297124600638978, + "grad_norm": 0.8999392063400926, + "learning_rate": 1.581331904682089e-06, + "loss": 0.0293, + "step": 5752 + }, + { + "epoch": 2.297923322683706, + "grad_norm": 0.96073417050456, + "learning_rate": 1.5779420805220185e-06, + "loss": 0.0332, + "step": 5754 + }, + { + "epoch": 2.2987220447284344, + "grad_norm": 0.9826008607081279, + "learning_rate": 1.574555212697113e-06, + "loss": 0.0338, + "step": 5756 + }, + { + "epoch": 2.299520766773163, + "grad_norm": 0.9259548407344154, + "learning_rate": 1.5711713041333077e-06, + "loss": 0.0355, + "step": 5758 + }, + { + "epoch": 2.3003194888178915, + "grad_norm": 0.999628943497661, + "learning_rate": 1.5677903577539806e-06, + "loss": 0.0353, + "step": 5760 + }, + { + "epoch": 2.3011182108626196, + "grad_norm": 1.004302579075343, + "learning_rate": 1.5644123764799517e-06, + "loss": 0.0289, + "step": 5762 + }, + { + "epoch": 2.301916932907348, + "grad_norm": 1.0927223195156304, + "learning_rate": 1.561037363229475e-06, + "loss": 0.0378, + "step": 5764 + }, + { + "epoch": 2.3027156549520766, + "grad_norm": 0.9410698542998797, + "learning_rate": 1.5576653209182436e-06, + "loss": 0.0331, + "step": 5766 + }, + { + "epoch": 2.303514376996805, + "grad_norm": 0.8643954286211598, + "learning_rate": 1.5542962524593869e-06, + "loss": 0.0317, + "step": 5768 + }, + { + "epoch": 2.3043130990415337, + "grad_norm": 1.0178268360718439, + "learning_rate": 1.550930160763462e-06, + "loss": 0.0333, + "step": 5770 + }, + { + "epoch": 2.3051118210862622, + "grad_norm": 1.0794526119781374, + "learning_rate": 1.547567048738452e-06, + "loss": 0.037, + "step": 5772 + }, + { + "epoch": 2.3059105431309903, + "grad_norm": 0.9306102687878339, + "learning_rate": 1.5442069192897695e-06, + "loss": 0.0319, + "step": 5774 + }, + { + "epoch": 2.306709265175719, + "grad_norm": 0.9959647755317999, + "learning_rate": 1.54084977532025e-06, + "loss": 0.0304, + "step": 5776 + }, + { + "epoch": 2.3075079872204474, + "grad_norm": 1.178192165622762, + "learning_rate": 1.5374956197301494e-06, + "loss": 0.0359, + "step": 5778 + }, + { + "epoch": 2.308306709265176, + "grad_norm": 0.970629316470673, + "learning_rate": 1.5341444554171397e-06, + "loss": 0.0323, + "step": 5780 + }, + { + "epoch": 2.309105431309904, + "grad_norm": 0.9995642863840597, + "learning_rate": 1.5307962852763115e-06, + "loss": 0.0314, + "step": 5782 + }, + { + "epoch": 2.3099041533546325, + "grad_norm": 0.8635977267044507, + "learning_rate": 1.5274511122001684e-06, + "loss": 0.0292, + "step": 5784 + }, + { + "epoch": 2.310702875399361, + "grad_norm": 0.941234774919912, + "learning_rate": 1.524108939078624e-06, + "loss": 0.0335, + "step": 5786 + }, + { + "epoch": 2.3115015974440896, + "grad_norm": 0.9670030899801637, + "learning_rate": 1.5207697687990004e-06, + "loss": 0.0355, + "step": 5788 + }, + { + "epoch": 2.312300319488818, + "grad_norm": 1.0863404667557413, + "learning_rate": 1.5174336042460264e-06, + "loss": 0.0359, + "step": 5790 + }, + { + "epoch": 2.313099041533546, + "grad_norm": 0.9487702530128579, + "learning_rate": 1.5141004483018323e-06, + "loss": 0.0303, + "step": 5792 + }, + { + "epoch": 2.3138977635782747, + "grad_norm": 0.8581962375915709, + "learning_rate": 1.5107703038459531e-06, + "loss": 0.0329, + "step": 5794 + }, + { + "epoch": 2.3146964856230032, + "grad_norm": 0.9463896237746494, + "learning_rate": 1.5074431737553158e-06, + "loss": 0.0344, + "step": 5796 + }, + { + "epoch": 2.3154952076677318, + "grad_norm": 1.0247492412011727, + "learning_rate": 1.5041190609042477e-06, + "loss": 0.0322, + "step": 5798 + }, + { + "epoch": 2.31629392971246, + "grad_norm": 0.8881026553630956, + "learning_rate": 1.5007979681644696e-06, + "loss": 0.033, + "step": 5800 + }, + { + "epoch": 2.3170926517571884, + "grad_norm": 0.9632852088358282, + "learning_rate": 1.4974798984050941e-06, + "loss": 0.0348, + "step": 5802 + }, + { + "epoch": 2.317891373801917, + "grad_norm": 0.8335404581898861, + "learning_rate": 1.4941648544926164e-06, + "loss": 0.0312, + "step": 5804 + }, + { + "epoch": 2.3186900958466454, + "grad_norm": 0.9511476448808138, + "learning_rate": 1.4908528392909233e-06, + "loss": 0.0343, + "step": 5806 + }, + { + "epoch": 2.319488817891374, + "grad_norm": 0.9453004464927225, + "learning_rate": 1.4875438556612836e-06, + "loss": 0.0305, + "step": 5808 + }, + { + "epoch": 2.3202875399361025, + "grad_norm": 0.9770824243294399, + "learning_rate": 1.4842379064623474e-06, + "loss": 0.0316, + "step": 5810 + }, + { + "epoch": 2.3210862619808306, + "grad_norm": 0.9354726575321289, + "learning_rate": 1.4809349945501422e-06, + "loss": 0.03, + "step": 5812 + }, + { + "epoch": 2.321884984025559, + "grad_norm": 0.9809930908311559, + "learning_rate": 1.4776351227780732e-06, + "loss": 0.0327, + "step": 5814 + }, + { + "epoch": 2.3226837060702876, + "grad_norm": 0.9812165421073059, + "learning_rate": 1.474338293996917e-06, + "loss": 0.0291, + "step": 5816 + }, + { + "epoch": 2.323482428115016, + "grad_norm": 1.0065291780758312, + "learning_rate": 1.4710445110548255e-06, + "loss": 0.0349, + "step": 5818 + }, + { + "epoch": 2.3242811501597442, + "grad_norm": 1.0293319955302287, + "learning_rate": 1.467753776797312e-06, + "loss": 0.0331, + "step": 5820 + }, + { + "epoch": 2.3250798722044728, + "grad_norm": 0.9835892465385753, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.0328, + "step": 5822 + }, + { + "epoch": 2.3258785942492013, + "grad_norm": 0.8998383284814865, + "learning_rate": 1.4611814657049257e-06, + "loss": 0.0329, + "step": 5824 + }, + { + "epoch": 2.32667731629393, + "grad_norm": 0.9572453991906558, + "learning_rate": 1.4578998945479084e-06, + "loss": 0.0328, + "step": 5826 + }, + { + "epoch": 2.3274760383386583, + "grad_norm": 1.0608318737189215, + "learning_rate": 1.4546213834311823e-06, + "loss": 0.0362, + "step": 5828 + }, + { + "epoch": 2.3282747603833864, + "grad_norm": 0.9194197731835649, + "learning_rate": 1.4513459351870669e-06, + "loss": 0.0313, + "step": 5830 + }, + { + "epoch": 2.329073482428115, + "grad_norm": 0.9174893410490872, + "learning_rate": 1.4480735526452427e-06, + "loss": 0.0325, + "step": 5832 + }, + { + "epoch": 2.3298722044728435, + "grad_norm": 0.8639598370634627, + "learning_rate": 1.4448042386327394e-06, + "loss": 0.0309, + "step": 5834 + }, + { + "epoch": 2.330670926517572, + "grad_norm": 1.0738100152461907, + "learning_rate": 1.4415379959739356e-06, + "loss": 0.0376, + "step": 5836 + }, + { + "epoch": 2.3314696485623, + "grad_norm": 0.8696561197568158, + "learning_rate": 1.4382748274905573e-06, + "loss": 0.0296, + "step": 5838 + }, + { + "epoch": 2.3322683706070286, + "grad_norm": 1.0538751425130783, + "learning_rate": 1.4350147360016743e-06, + "loss": 0.0368, + "step": 5840 + }, + { + "epoch": 2.333067092651757, + "grad_norm": 1.0257476904138636, + "learning_rate": 1.4317577243236968e-06, + "loss": 0.0365, + "step": 5842 + }, + { + "epoch": 2.3338658146964857, + "grad_norm": 0.8938198309680814, + "learning_rate": 1.4285037952703785e-06, + "loss": 0.0357, + "step": 5844 + }, + { + "epoch": 2.334664536741214, + "grad_norm": 0.9464804274403688, + "learning_rate": 1.425252951652803e-06, + "loss": 0.0293, + "step": 5846 + }, + { + "epoch": 2.3354632587859427, + "grad_norm": 0.9175488309023649, + "learning_rate": 1.4220051962793952e-06, + "loss": 0.038, + "step": 5848 + }, + { + "epoch": 2.336261980830671, + "grad_norm": 0.88533327061595, + "learning_rate": 1.4187605319559078e-06, + "loss": 0.0301, + "step": 5850 + }, + { + "epoch": 2.3370607028753994, + "grad_norm": 0.9127303239271386, + "learning_rate": 1.4155189614854275e-06, + "loss": 0.0319, + "step": 5852 + }, + { + "epoch": 2.337859424920128, + "grad_norm": 0.9168807782532327, + "learning_rate": 1.4122804876683616e-06, + "loss": 0.0308, + "step": 5854 + }, + { + "epoch": 2.3386581469648564, + "grad_norm": 1.1257679223953572, + "learning_rate": 1.4090451133024473e-06, + "loss": 0.0388, + "step": 5856 + }, + { + "epoch": 2.3394568690095845, + "grad_norm": 0.8696474054620578, + "learning_rate": 1.4058128411827432e-06, + "loss": 0.0355, + "step": 5858 + }, + { + "epoch": 2.340255591054313, + "grad_norm": 0.9000167630617952, + "learning_rate": 1.4025836741016274e-06, + "loss": 0.0341, + "step": 5860 + }, + { + "epoch": 2.3410543130990416, + "grad_norm": 0.9240511974633537, + "learning_rate": 1.399357614848796e-06, + "loss": 0.0307, + "step": 5862 + }, + { + "epoch": 2.34185303514377, + "grad_norm": 0.8909434883498069, + "learning_rate": 1.3961346662112585e-06, + "loss": 0.0341, + "step": 5864 + }, + { + "epoch": 2.3426517571884986, + "grad_norm": 1.0135872043495942, + "learning_rate": 1.3929148309733392e-06, + "loss": 0.0286, + "step": 5866 + }, + { + "epoch": 2.3434504792332267, + "grad_norm": 0.9580825595304057, + "learning_rate": 1.3896981119166741e-06, + "loss": 0.0346, + "step": 5868 + }, + { + "epoch": 2.344249201277955, + "grad_norm": 0.8615986543919892, + "learning_rate": 1.3864845118202013e-06, + "loss": 0.034, + "step": 5870 + }, + { + "epoch": 2.3450479233226837, + "grad_norm": 1.1873836274839265, + "learning_rate": 1.3832740334601692e-06, + "loss": 0.0317, + "step": 5872 + }, + { + "epoch": 2.3458466453674123, + "grad_norm": 0.8711721095623617, + "learning_rate": 1.3800666796101291e-06, + "loss": 0.0331, + "step": 5874 + }, + { + "epoch": 2.3466453674121404, + "grad_norm": 1.010832936665381, + "learning_rate": 1.3768624530409324e-06, + "loss": 0.0353, + "step": 5876 + }, + { + "epoch": 2.347444089456869, + "grad_norm": 0.9937261665959263, + "learning_rate": 1.373661356520727e-06, + "loss": 0.0362, + "step": 5878 + }, + { + "epoch": 2.3482428115015974, + "grad_norm": 0.9184012307526624, + "learning_rate": 1.3704633928149575e-06, + "loss": 0.0287, + "step": 5880 + }, + { + "epoch": 2.349041533546326, + "grad_norm": 0.9982535009466615, + "learning_rate": 1.3672685646863653e-06, + "loss": 0.033, + "step": 5882 + }, + { + "epoch": 2.3498402555910545, + "grad_norm": 0.9164440397933561, + "learning_rate": 1.3640768748949811e-06, + "loss": 0.0293, + "step": 5884 + }, + { + "epoch": 2.3506389776357826, + "grad_norm": 1.002185223742666, + "learning_rate": 1.360888326198121e-06, + "loss": 0.0341, + "step": 5886 + }, + { + "epoch": 2.351437699680511, + "grad_norm": 0.9987751966058985, + "learning_rate": 1.3577029213503911e-06, + "loss": 0.0321, + "step": 5888 + }, + { + "epoch": 2.3522364217252396, + "grad_norm": 0.9634859351636175, + "learning_rate": 1.354520663103681e-06, + "loss": 0.033, + "step": 5890 + }, + { + "epoch": 2.353035143769968, + "grad_norm": 1.0094199782296815, + "learning_rate": 1.351341554207163e-06, + "loss": 0.0356, + "step": 5892 + }, + { + "epoch": 2.3538338658146967, + "grad_norm": 0.953311272783066, + "learning_rate": 1.3481655974072845e-06, + "loss": 0.0333, + "step": 5894 + }, + { + "epoch": 2.3546325878594248, + "grad_norm": 0.9161467503492303, + "learning_rate": 1.3449927954477732e-06, + "loss": 0.0305, + "step": 5896 + }, + { + "epoch": 2.3554313099041533, + "grad_norm": 1.0625770728133241, + "learning_rate": 1.3418231510696312e-06, + "loss": 0.0335, + "step": 5898 + }, + { + "epoch": 2.356230031948882, + "grad_norm": 0.9368140797679172, + "learning_rate": 1.3386566670111339e-06, + "loss": 0.0351, + "step": 5900 + }, + { + "epoch": 2.3570287539936103, + "grad_norm": 0.9236886045000889, + "learning_rate": 1.3354933460078217e-06, + "loss": 0.0314, + "step": 5902 + }, + { + "epoch": 2.357827476038339, + "grad_norm": 1.137543124830593, + "learning_rate": 1.3323331907925046e-06, + "loss": 0.038, + "step": 5904 + }, + { + "epoch": 2.358626198083067, + "grad_norm": 0.9008576498883231, + "learning_rate": 1.3291762040952626e-06, + "loss": 0.0273, + "step": 5906 + }, + { + "epoch": 2.3594249201277955, + "grad_norm": 0.9264640068095965, + "learning_rate": 1.3260223886434342e-06, + "loss": 0.0284, + "step": 5908 + }, + { + "epoch": 2.360223642172524, + "grad_norm": 1.0330425296416088, + "learning_rate": 1.3228717471616153e-06, + "loss": 0.0359, + "step": 5910 + }, + { + "epoch": 2.3610223642172525, + "grad_norm": 0.8006806552776662, + "learning_rate": 1.319724282371664e-06, + "loss": 0.0328, + "step": 5912 + }, + { + "epoch": 2.3618210862619806, + "grad_norm": 0.9719478594390628, + "learning_rate": 1.3165799969926928e-06, + "loss": 0.0359, + "step": 5914 + }, + { + "epoch": 2.362619808306709, + "grad_norm": 0.9089803281939999, + "learning_rate": 1.3134388937410697e-06, + "loss": 0.0296, + "step": 5916 + }, + { + "epoch": 2.3634185303514377, + "grad_norm": 1.0859836987195648, + "learning_rate": 1.3103009753304085e-06, + "loss": 0.0289, + "step": 5918 + }, + { + "epoch": 2.364217252396166, + "grad_norm": 0.9338786842895157, + "learning_rate": 1.307166244471576e-06, + "loss": 0.0303, + "step": 5920 + }, + { + "epoch": 2.3650159744408947, + "grad_norm": 0.9352879035371544, + "learning_rate": 1.3040347038726831e-06, + "loss": 0.0313, + "step": 5922 + }, + { + "epoch": 2.365814696485623, + "grad_norm": 0.9193300328510401, + "learning_rate": 1.3009063562390866e-06, + "loss": 0.0308, + "step": 5924 + }, + { + "epoch": 2.3666134185303513, + "grad_norm": 0.8330378897517992, + "learning_rate": 1.297781204273385e-06, + "loss": 0.0277, + "step": 5926 + }, + { + "epoch": 2.36741214057508, + "grad_norm": 1.0098776366020665, + "learning_rate": 1.2946592506754097e-06, + "loss": 0.0292, + "step": 5928 + }, + { + "epoch": 2.3682108626198084, + "grad_norm": 0.9244538147214044, + "learning_rate": 1.2915404981422386e-06, + "loss": 0.0358, + "step": 5930 + }, + { + "epoch": 2.369009584664537, + "grad_norm": 1.009009426826755, + "learning_rate": 1.28842494936818e-06, + "loss": 0.0307, + "step": 5932 + }, + { + "epoch": 2.369808306709265, + "grad_norm": 0.8599683395682384, + "learning_rate": 1.2853126070447709e-06, + "loss": 0.0312, + "step": 5934 + }, + { + "epoch": 2.3706070287539935, + "grad_norm": 1.2504223376409314, + "learning_rate": 1.282203473860783e-06, + "loss": 0.0324, + "step": 5936 + }, + { + "epoch": 2.371405750798722, + "grad_norm": 0.9600194803305533, + "learning_rate": 1.2790975525022136e-06, + "loss": 0.0332, + "step": 5938 + }, + { + "epoch": 2.3722044728434506, + "grad_norm": 0.8579303834114889, + "learning_rate": 1.275994845652288e-06, + "loss": 0.028, + "step": 5940 + }, + { + "epoch": 2.373003194888179, + "grad_norm": 0.9767263492376159, + "learning_rate": 1.2728953559914486e-06, + "loss": 0.0343, + "step": 5942 + }, + { + "epoch": 2.373801916932907, + "grad_norm": 0.9810633979289892, + "learning_rate": 1.2697990861973635e-06, + "loss": 0.0283, + "step": 5944 + }, + { + "epoch": 2.3746006389776357, + "grad_norm": 0.8914061929408517, + "learning_rate": 1.2667060389449182e-06, + "loss": 0.0318, + "step": 5946 + }, + { + "epoch": 2.3753993610223643, + "grad_norm": 1.1915603298866675, + "learning_rate": 1.2636162169062133e-06, + "loss": 0.0402, + "step": 5948 + }, + { + "epoch": 2.376198083067093, + "grad_norm": 0.836740461461381, + "learning_rate": 1.260529622750563e-06, + "loss": 0.0311, + "step": 5950 + }, + { + "epoch": 2.376996805111821, + "grad_norm": 0.9210823534934908, + "learning_rate": 1.257446259144494e-06, + "loss": 0.0314, + "step": 5952 + }, + { + "epoch": 2.3777955271565494, + "grad_norm": 0.9674053346660237, + "learning_rate": 1.2543661287517423e-06, + "loss": 0.0301, + "step": 5954 + }, + { + "epoch": 2.378594249201278, + "grad_norm": 0.9378146372928756, + "learning_rate": 1.25128923423325e-06, + "loss": 0.028, + "step": 5956 + }, + { + "epoch": 2.3793929712460065, + "grad_norm": 0.882404097137147, + "learning_rate": 1.2482155782471612e-06, + "loss": 0.0289, + "step": 5958 + }, + { + "epoch": 2.380191693290735, + "grad_norm": 1.0244194815400358, + "learning_rate": 1.2451451634488264e-06, + "loss": 0.0319, + "step": 5960 + }, + { + "epoch": 2.380990415335463, + "grad_norm": 0.9513392453303519, + "learning_rate": 1.242077992490794e-06, + "loss": 0.0291, + "step": 5962 + }, + { + "epoch": 2.3817891373801916, + "grad_norm": 0.8933825432617372, + "learning_rate": 1.2390140680228107e-06, + "loss": 0.0324, + "step": 5964 + }, + { + "epoch": 2.38258785942492, + "grad_norm": 0.9739451587254204, + "learning_rate": 1.2359533926918193e-06, + "loss": 0.0325, + "step": 5966 + }, + { + "epoch": 2.3833865814696487, + "grad_norm": 1.0284624062364278, + "learning_rate": 1.2328959691419517e-06, + "loss": 0.0377, + "step": 5968 + }, + { + "epoch": 2.384185303514377, + "grad_norm": 0.9808325264631864, + "learning_rate": 1.2298418000145345e-06, + "loss": 0.0311, + "step": 5970 + }, + { + "epoch": 2.3849840255591053, + "grad_norm": 0.9709468792899423, + "learning_rate": 1.2267908879480822e-06, + "loss": 0.0333, + "step": 5972 + }, + { + "epoch": 2.385782747603834, + "grad_norm": 1.033875589705149, + "learning_rate": 1.2237432355782947e-06, + "loss": 0.0374, + "step": 5974 + }, + { + "epoch": 2.3865814696485623, + "grad_norm": 1.078337957714353, + "learning_rate": 1.2206988455380558e-06, + "loss": 0.0326, + "step": 5976 + }, + { + "epoch": 2.387380191693291, + "grad_norm": 1.0758642600387078, + "learning_rate": 1.2176577204574318e-06, + "loss": 0.0313, + "step": 5978 + }, + { + "epoch": 2.3881789137380194, + "grad_norm": 0.901933168392248, + "learning_rate": 1.214619862963668e-06, + "loss": 0.0319, + "step": 5980 + }, + { + "epoch": 2.3889776357827475, + "grad_norm": 1.0229488021571116, + "learning_rate": 1.2115852756811875e-06, + "loss": 0.0367, + "step": 5982 + }, + { + "epoch": 2.389776357827476, + "grad_norm": 0.8928180495678354, + "learning_rate": 1.2085539612315844e-06, + "loss": 0.0307, + "step": 5984 + }, + { + "epoch": 2.3905750798722045, + "grad_norm": 0.9981047761041587, + "learning_rate": 1.2055259222336303e-06, + "loss": 0.0352, + "step": 5986 + }, + { + "epoch": 2.391373801916933, + "grad_norm": 0.7912725756162461, + "learning_rate": 1.202501161303265e-06, + "loss": 0.0272, + "step": 5988 + }, + { + "epoch": 2.392172523961661, + "grad_norm": 0.9543956236269545, + "learning_rate": 1.1994796810535981e-06, + "loss": 0.0337, + "step": 5990 + }, + { + "epoch": 2.3929712460063897, + "grad_norm": 0.8444542368501661, + "learning_rate": 1.1964614840949002e-06, + "loss": 0.028, + "step": 5992 + }, + { + "epoch": 2.393769968051118, + "grad_norm": 0.9428028723910442, + "learning_rate": 1.1934465730346106e-06, + "loss": 0.0373, + "step": 5994 + }, + { + "epoch": 2.3945686900958467, + "grad_norm": 0.8624521055571825, + "learning_rate": 1.1904349504773276e-06, + "loss": 0.0317, + "step": 5996 + }, + { + "epoch": 2.3953674121405752, + "grad_norm": 0.9861824702284993, + "learning_rate": 1.1874266190248095e-06, + "loss": 0.0334, + "step": 5998 + }, + { + "epoch": 2.3961661341853033, + "grad_norm": 1.09155387358314, + "learning_rate": 1.1844215812759708e-06, + "loss": 0.0349, + "step": 6000 + }, + { + "epoch": 2.3961661341853033, + "eval_loss": 0.1782146841287613, + "eval_runtime": 418.6183, + "eval_samples_per_second": 42.538, + "eval_steps_per_second": 5.317, + "step": 6000 + }, + { + "epoch": 2.396964856230032, + "grad_norm": 0.8415845821620274, + "learning_rate": 1.1814198398268794e-06, + "loss": 0.0284, + "step": 6002 + }, + { + "epoch": 2.3977635782747604, + "grad_norm": 0.8937417948243025, + "learning_rate": 1.1784213972707581e-06, + "loss": 0.0294, + "step": 6004 + }, + { + "epoch": 2.398562300319489, + "grad_norm": 0.9462060632845546, + "learning_rate": 1.175426256197979e-06, + "loss": 0.0339, + "step": 6006 + }, + { + "epoch": 2.3993610223642174, + "grad_norm": 0.9979933172942926, + "learning_rate": 1.1724344191960591e-06, + "loss": 0.0329, + "step": 6008 + }, + { + "epoch": 2.4001597444089455, + "grad_norm": 0.8510571914760195, + "learning_rate": 1.169445888849664e-06, + "loss": 0.0285, + "step": 6010 + }, + { + "epoch": 2.400958466453674, + "grad_norm": 0.8796317934735823, + "learning_rate": 1.1664606677406025e-06, + "loss": 0.0285, + "step": 6012 + }, + { + "epoch": 2.4017571884984026, + "grad_norm": 1.1059376491146813, + "learning_rate": 1.1634787584478257e-06, + "loss": 0.0286, + "step": 6014 + }, + { + "epoch": 2.402555910543131, + "grad_norm": 1.0839859430790064, + "learning_rate": 1.1605001635474183e-06, + "loss": 0.031, + "step": 6016 + }, + { + "epoch": 2.4033546325878596, + "grad_norm": 1.0568594830906386, + "learning_rate": 1.157524885612607e-06, + "loss": 0.0336, + "step": 6018 + }, + { + "epoch": 2.4041533546325877, + "grad_norm": 0.8544876905209444, + "learning_rate": 1.1545529272137496e-06, + "loss": 0.0293, + "step": 6020 + }, + { + "epoch": 2.4049520766773163, + "grad_norm": 0.9202306605617924, + "learning_rate": 1.1515842909183422e-06, + "loss": 0.0305, + "step": 6022 + }, + { + "epoch": 2.405750798722045, + "grad_norm": 1.2779812066659912, + "learning_rate": 1.1486189792910024e-06, + "loss": 0.0327, + "step": 6024 + }, + { + "epoch": 2.4065495207667733, + "grad_norm": 0.8423188795687913, + "learning_rate": 1.1456569948934804e-06, + "loss": 0.0323, + "step": 6026 + }, + { + "epoch": 2.4073482428115014, + "grad_norm": 0.9219710260036055, + "learning_rate": 1.142698340284652e-06, + "loss": 0.0367, + "step": 6028 + }, + { + "epoch": 2.40814696485623, + "grad_norm": 0.8836691940601481, + "learning_rate": 1.139743018020517e-06, + "loss": 0.031, + "step": 6030 + }, + { + "epoch": 2.4089456869009584, + "grad_norm": 1.009462941023179, + "learning_rate": 1.1367910306541918e-06, + "loss": 0.0313, + "step": 6032 + }, + { + "epoch": 2.409744408945687, + "grad_norm": 0.9333397367826699, + "learning_rate": 1.133842380735916e-06, + "loss": 0.0289, + "step": 6034 + }, + { + "epoch": 2.4105431309904155, + "grad_norm": 0.9518139468601376, + "learning_rate": 1.1308970708130458e-06, + "loss": 0.0329, + "step": 6036 + }, + { + "epoch": 2.4113418530351436, + "grad_norm": 1.0093135425496413, + "learning_rate": 1.1279551034300523e-06, + "loss": 0.0293, + "step": 6038 + }, + { + "epoch": 2.412140575079872, + "grad_norm": 0.8766410436250559, + "learning_rate": 1.1250164811285148e-06, + "loss": 0.0314, + "step": 6040 + }, + { + "epoch": 2.4129392971246006, + "grad_norm": 0.7888244567724153, + "learning_rate": 1.1220812064471248e-06, + "loss": 0.0302, + "step": 6042 + }, + { + "epoch": 2.413738019169329, + "grad_norm": 0.9752687497406552, + "learning_rate": 1.119149281921687e-06, + "loss": 0.0338, + "step": 6044 + }, + { + "epoch": 2.4145367412140573, + "grad_norm": 1.015706721424374, + "learning_rate": 1.1162207100851069e-06, + "loss": 0.0316, + "step": 6046 + }, + { + "epoch": 2.415335463258786, + "grad_norm": 0.9186099679113181, + "learning_rate": 1.1132954934673911e-06, + "loss": 0.0348, + "step": 6048 + }, + { + "epoch": 2.4161341853035143, + "grad_norm": 0.9014882003540872, + "learning_rate": 1.110373634595653e-06, + "loss": 0.0279, + "step": 6050 + }, + { + "epoch": 2.416932907348243, + "grad_norm": 0.8554968617937979, + "learning_rate": 1.1074551359941022e-06, + "loss": 0.0274, + "step": 6052 + }, + { + "epoch": 2.4177316293929714, + "grad_norm": 1.023254004503027, + "learning_rate": 1.1045400001840474e-06, + "loss": 0.0346, + "step": 6054 + }, + { + "epoch": 2.4185303514377, + "grad_norm": 1.0199229730833819, + "learning_rate": 1.1016282296838887e-06, + "loss": 0.0372, + "step": 6056 + }, + { + "epoch": 2.419329073482428, + "grad_norm": 0.8141953630986705, + "learning_rate": 1.0987198270091225e-06, + "loss": 0.0297, + "step": 6058 + }, + { + "epoch": 2.4201277955271565, + "grad_norm": 0.9824196187903327, + "learning_rate": 1.0958147946723341e-06, + "loss": 0.0295, + "step": 6060 + }, + { + "epoch": 2.420926517571885, + "grad_norm": 1.116417391297416, + "learning_rate": 1.0929131351831974e-06, + "loss": 0.0339, + "step": 6062 + }, + { + "epoch": 2.4217252396166136, + "grad_norm": 0.9588644560235544, + "learning_rate": 1.090014851048473e-06, + "loss": 0.0322, + "step": 6064 + }, + { + "epoch": 2.4225239616613417, + "grad_norm": 0.876473284974182, + "learning_rate": 1.0871199447720022e-06, + "loss": 0.0337, + "step": 6066 + }, + { + "epoch": 2.42332268370607, + "grad_norm": 0.8818096541129938, + "learning_rate": 1.0842284188547142e-06, + "loss": 0.0299, + "step": 6068 + }, + { + "epoch": 2.4241214057507987, + "grad_norm": 0.9093312218826864, + "learning_rate": 1.0813402757946145e-06, + "loss": 0.0326, + "step": 6070 + }, + { + "epoch": 2.4249201277955272, + "grad_norm": 0.8911104729900233, + "learning_rate": 1.078455518086784e-06, + "loss": 0.0311, + "step": 6072 + }, + { + "epoch": 2.4257188498402558, + "grad_norm": 0.9634405906730059, + "learning_rate": 1.0755741482233822e-06, + "loss": 0.0321, + "step": 6074 + }, + { + "epoch": 2.426517571884984, + "grad_norm": 0.8619867335268869, + "learning_rate": 1.0726961686936406e-06, + "loss": 0.0274, + "step": 6076 + }, + { + "epoch": 2.4273162939297124, + "grad_norm": 0.8467036432543594, + "learning_rate": 1.069821581983862e-06, + "loss": 0.0308, + "step": 6078 + }, + { + "epoch": 2.428115015974441, + "grad_norm": 0.9553299696013161, + "learning_rate": 1.0669503905774198e-06, + "loss": 0.0305, + "step": 6080 + }, + { + "epoch": 2.4289137380191694, + "grad_norm": 0.9434220839422669, + "learning_rate": 1.0640825969547498e-06, + "loss": 0.032, + "step": 6082 + }, + { + "epoch": 2.4297124600638975, + "grad_norm": 0.907723226631111, + "learning_rate": 1.061218203593356e-06, + "loss": 0.0292, + "step": 6084 + }, + { + "epoch": 2.430511182108626, + "grad_norm": 1.0082338096229233, + "learning_rate": 1.0583572129678043e-06, + "loss": 0.0345, + "step": 6086 + }, + { + "epoch": 2.4313099041533546, + "grad_norm": 0.9128983473815063, + "learning_rate": 1.055499627549722e-06, + "loss": 0.0312, + "step": 6088 + }, + { + "epoch": 2.432108626198083, + "grad_norm": 0.9047232760560915, + "learning_rate": 1.0526454498077892e-06, + "loss": 0.0333, + "step": 6090 + }, + { + "epoch": 2.4329073482428116, + "grad_norm": 0.9748694127638722, + "learning_rate": 1.0497946822077504e-06, + "loss": 0.0301, + "step": 6092 + }, + { + "epoch": 2.43370607028754, + "grad_norm": 0.9900482589709648, + "learning_rate": 1.0469473272123998e-06, + "loss": 0.0312, + "step": 6094 + }, + { + "epoch": 2.4345047923322682, + "grad_norm": 1.0325102686584386, + "learning_rate": 1.0441033872815804e-06, + "loss": 0.0334, + "step": 6096 + }, + { + "epoch": 2.4353035143769968, + "grad_norm": 0.8638115070891277, + "learning_rate": 1.0412628648721895e-06, + "loss": 0.0296, + "step": 6098 + }, + { + "epoch": 2.4361022364217253, + "grad_norm": 1.1255119248973784, + "learning_rate": 1.0384257624381705e-06, + "loss": 0.0305, + "step": 6100 + }, + { + "epoch": 2.436900958466454, + "grad_norm": 0.9446843609413517, + "learning_rate": 1.0355920824305127e-06, + "loss": 0.0323, + "step": 6102 + }, + { + "epoch": 2.437699680511182, + "grad_norm": 0.9313934818835989, + "learning_rate": 1.0327618272972484e-06, + "loss": 0.0322, + "step": 6104 + }, + { + "epoch": 2.4384984025559104, + "grad_norm": 1.1986399439946025, + "learning_rate": 1.0299349994834497e-06, + "loss": 0.0369, + "step": 6106 + }, + { + "epoch": 2.439297124600639, + "grad_norm": 1.0849811906791489, + "learning_rate": 1.0271116014312293e-06, + "loss": 0.0274, + "step": 6108 + }, + { + "epoch": 2.4400958466453675, + "grad_norm": 0.9885919632765612, + "learning_rate": 1.0242916355797372e-06, + "loss": 0.0294, + "step": 6110 + }, + { + "epoch": 2.440894568690096, + "grad_norm": 1.1666414296370116, + "learning_rate": 1.0214751043651582e-06, + "loss": 0.0327, + "step": 6112 + }, + { + "epoch": 2.441693290734824, + "grad_norm": 1.0373974615204293, + "learning_rate": 1.018662010220709e-06, + "loss": 0.0345, + "step": 6114 + }, + { + "epoch": 2.4424920127795526, + "grad_norm": 0.8806182333785598, + "learning_rate": 1.0158523555766375e-06, + "loss": 0.03, + "step": 6116 + }, + { + "epoch": 2.443290734824281, + "grad_norm": 0.8377676265174371, + "learning_rate": 1.0130461428602206e-06, + "loss": 0.0286, + "step": 6118 + }, + { + "epoch": 2.4440894568690097, + "grad_norm": 0.9542688751670757, + "learning_rate": 1.010243374495763e-06, + "loss": 0.0296, + "step": 6120 + }, + { + "epoch": 2.4448881789137378, + "grad_norm": 0.845828630003871, + "learning_rate": 1.0074440529045882e-06, + "loss": 0.0299, + "step": 6122 + }, + { + "epoch": 2.4456869009584663, + "grad_norm": 0.838659411317932, + "learning_rate": 1.0046481805050484e-06, + "loss": 0.0286, + "step": 6124 + }, + { + "epoch": 2.446485623003195, + "grad_norm": 0.964366696594686, + "learning_rate": 1.001855759712513e-06, + "loss": 0.0309, + "step": 6126 + }, + { + "epoch": 2.4472843450479234, + "grad_norm": 1.0550865306282715, + "learning_rate": 9.990667929393715e-07, + "loss": 0.0357, + "step": 6128 + }, + { + "epoch": 2.448083067092652, + "grad_norm": 0.9665555146057448, + "learning_rate": 9.962812825950252e-07, + "loss": 0.0334, + "step": 6130 + }, + { + "epoch": 2.4488817891373804, + "grad_norm": 1.1013658647610531, + "learning_rate": 9.934992310858944e-07, + "loss": 0.0358, + "step": 6132 + }, + { + "epoch": 2.4496805111821085, + "grad_norm": 0.9451008760428278, + "learning_rate": 9.90720640815408e-07, + "loss": 0.0319, + "step": 6134 + }, + { + "epoch": 2.450479233226837, + "grad_norm": 0.9344422669847151, + "learning_rate": 9.879455141840067e-07, + "loss": 0.0298, + "step": 6136 + }, + { + "epoch": 2.4512779552715656, + "grad_norm": 0.8508316348278806, + "learning_rate": 9.851738535891375e-07, + "loss": 0.0266, + "step": 6138 + }, + { + "epoch": 2.452076677316294, + "grad_norm": 0.9862824857730245, + "learning_rate": 9.824056614252542e-07, + "loss": 0.0314, + "step": 6140 + }, + { + "epoch": 2.452875399361022, + "grad_norm": 0.8320616360939587, + "learning_rate": 9.79640940083813e-07, + "loss": 0.0299, + "step": 6142 + }, + { + "epoch": 2.4536741214057507, + "grad_norm": 0.9792614234160211, + "learning_rate": 9.768796919532742e-07, + "loss": 0.0314, + "step": 6144 + }, + { + "epoch": 2.4544728434504792, + "grad_norm": 0.9293003315650749, + "learning_rate": 9.741219194190925e-07, + "loss": 0.0338, + "step": 6146 + }, + { + "epoch": 2.4552715654952078, + "grad_norm": 0.9682041377572445, + "learning_rate": 9.71367624863725e-07, + "loss": 0.0329, + "step": 6148 + }, + { + "epoch": 2.4560702875399363, + "grad_norm": 0.8325832618656944, + "learning_rate": 9.686168106666216e-07, + "loss": 0.031, + "step": 6150 + }, + { + "epoch": 2.4568690095846644, + "grad_norm": 0.9350572549104323, + "learning_rate": 9.658694792042284e-07, + "loss": 0.0288, + "step": 6152 + }, + { + "epoch": 2.457667731629393, + "grad_norm": 0.8277288114049663, + "learning_rate": 9.631256328499772e-07, + "loss": 0.031, + "step": 6154 + }, + { + "epoch": 2.4584664536741214, + "grad_norm": 0.9213528535489943, + "learning_rate": 9.603852739742941e-07, + "loss": 0.0284, + "step": 6156 + }, + { + "epoch": 2.45926517571885, + "grad_norm": 0.8009301927639629, + "learning_rate": 9.576484049445895e-07, + "loss": 0.029, + "step": 6158 + }, + { + "epoch": 2.460063897763578, + "grad_norm": 0.8921062677918278, + "learning_rate": 9.549150281252633e-07, + "loss": 0.0354, + "step": 6160 + }, + { + "epoch": 2.4608626198083066, + "grad_norm": 0.8850346677550753, + "learning_rate": 9.521851458776915e-07, + "loss": 0.0297, + "step": 6162 + }, + { + "epoch": 2.461661341853035, + "grad_norm": 0.9779735465184336, + "learning_rate": 9.494587605602368e-07, + "loss": 0.0289, + "step": 6164 + }, + { + "epoch": 2.4624600638977636, + "grad_norm": 0.9498573454203734, + "learning_rate": 9.467358745282379e-07, + "loss": 0.0279, + "step": 6166 + }, + { + "epoch": 2.463258785942492, + "grad_norm": 0.9196211437633159, + "learning_rate": 9.440164901340127e-07, + "loss": 0.0311, + "step": 6168 + }, + { + "epoch": 2.4640575079872207, + "grad_norm": 0.8765231457695842, + "learning_rate": 9.413006097268512e-07, + "loss": 0.0243, + "step": 6170 + }, + { + "epoch": 2.4648562300319488, + "grad_norm": 0.853315794604095, + "learning_rate": 9.385882356530179e-07, + "loss": 0.026, + "step": 6172 + }, + { + "epoch": 2.4656549520766773, + "grad_norm": 0.9844044641457295, + "learning_rate": 9.358793702557489e-07, + "loss": 0.0293, + "step": 6174 + }, + { + "epoch": 2.466453674121406, + "grad_norm": 0.9636119419245541, + "learning_rate": 9.331740158752495e-07, + "loss": 0.0334, + "step": 6176 + }, + { + "epoch": 2.4672523961661343, + "grad_norm": 1.1029243201298184, + "learning_rate": 9.304721748486878e-07, + "loss": 0.0344, + "step": 6178 + }, + { + "epoch": 2.4680511182108624, + "grad_norm": 0.9591374043106669, + "learning_rate": 9.277738495102012e-07, + "loss": 0.0308, + "step": 6180 + }, + { + "epoch": 2.468849840255591, + "grad_norm": 0.9656785120322073, + "learning_rate": 9.250790421908862e-07, + "loss": 0.0261, + "step": 6182 + }, + { + "epoch": 2.4696485623003195, + "grad_norm": 1.0686970243259972, + "learning_rate": 9.223877552188065e-07, + "loss": 0.0325, + "step": 6184 + }, + { + "epoch": 2.470447284345048, + "grad_norm": 0.9422767101505445, + "learning_rate": 9.196999909189764e-07, + "loss": 0.0293, + "step": 6186 + }, + { + "epoch": 2.4712460063897765, + "grad_norm": 0.9128608441727557, + "learning_rate": 9.17015751613371e-07, + "loss": 0.0278, + "step": 6188 + }, + { + "epoch": 2.4720447284345046, + "grad_norm": 1.0182227621831177, + "learning_rate": 9.14335039620921e-07, + "loss": 0.0291, + "step": 6190 + }, + { + "epoch": 2.472843450479233, + "grad_norm": 1.0323032210841876, + "learning_rate": 9.116578572575091e-07, + "loss": 0.0381, + "step": 6192 + }, + { + "epoch": 2.4736421725239617, + "grad_norm": 1.007977038062956, + "learning_rate": 9.089842068359661e-07, + "loss": 0.0328, + "step": 6194 + }, + { + "epoch": 2.47444089456869, + "grad_norm": 1.0276724023266843, + "learning_rate": 9.06314090666075e-07, + "loss": 0.0353, + "step": 6196 + }, + { + "epoch": 2.4752396166134183, + "grad_norm": 1.033358203994531, + "learning_rate": 9.03647511054564e-07, + "loss": 0.0291, + "step": 6198 + }, + { + "epoch": 2.476038338658147, + "grad_norm": 0.8075474991970624, + "learning_rate": 9.009844703051063e-07, + "loss": 0.0276, + "step": 6200 + }, + { + "epoch": 2.4768370607028753, + "grad_norm": 0.9186532720625925, + "learning_rate": 8.98324970718319e-07, + "loss": 0.0283, + "step": 6202 + }, + { + "epoch": 2.477635782747604, + "grad_norm": 0.9745581052474273, + "learning_rate": 8.956690145917557e-07, + "loss": 0.0337, + "step": 6204 + }, + { + "epoch": 2.4784345047923324, + "grad_norm": 0.8345103534692212, + "learning_rate": 8.930166042199146e-07, + "loss": 0.0256, + "step": 6206 + }, + { + "epoch": 2.479233226837061, + "grad_norm": 0.9005598945512714, + "learning_rate": 8.903677418942292e-07, + "loss": 0.0285, + "step": 6208 + }, + { + "epoch": 2.480031948881789, + "grad_norm": 1.0271916802595848, + "learning_rate": 8.877224299030629e-07, + "loss": 0.0317, + "step": 6210 + }, + { + "epoch": 2.4808306709265175, + "grad_norm": 1.075693558366458, + "learning_rate": 8.850806705317183e-07, + "loss": 0.0368, + "step": 6212 + }, + { + "epoch": 2.481629392971246, + "grad_norm": 0.8722330378555192, + "learning_rate": 8.824424660624247e-07, + "loss": 0.029, + "step": 6214 + }, + { + "epoch": 2.4824281150159746, + "grad_norm": 1.0601895552621947, + "learning_rate": 8.79807818774343e-07, + "loss": 0.0293, + "step": 6216 + }, + { + "epoch": 2.4832268370607027, + "grad_norm": 0.8707900195092941, + "learning_rate": 8.771767309435614e-07, + "loss": 0.0304, + "step": 6218 + }, + { + "epoch": 2.484025559105431, + "grad_norm": 0.9733820547813553, + "learning_rate": 8.745492048430876e-07, + "loss": 0.0343, + "step": 6220 + }, + { + "epoch": 2.4848242811501597, + "grad_norm": 0.9317695338486248, + "learning_rate": 8.719252427428582e-07, + "loss": 0.0303, + "step": 6222 + }, + { + "epoch": 2.4856230031948883, + "grad_norm": 1.033404563883472, + "learning_rate": 8.693048469097293e-07, + "loss": 0.0325, + "step": 6224 + }, + { + "epoch": 2.486421725239617, + "grad_norm": 0.9136265608554973, + "learning_rate": 8.666880196074767e-07, + "loss": 0.0308, + "step": 6226 + }, + { + "epoch": 2.487220447284345, + "grad_norm": 1.1596479673050444, + "learning_rate": 8.640747630967883e-07, + "loss": 0.0294, + "step": 6228 + }, + { + "epoch": 2.4880191693290734, + "grad_norm": 1.0426530374304792, + "learning_rate": 8.614650796352747e-07, + "loss": 0.0306, + "step": 6230 + }, + { + "epoch": 2.488817891373802, + "grad_norm": 0.9242364015680927, + "learning_rate": 8.58858971477457e-07, + "loss": 0.0327, + "step": 6232 + }, + { + "epoch": 2.4896166134185305, + "grad_norm": 0.9313580110161238, + "learning_rate": 8.562564408747637e-07, + "loss": 0.0306, + "step": 6234 + }, + { + "epoch": 2.4904153354632586, + "grad_norm": 1.067403189080394, + "learning_rate": 8.536574900755367e-07, + "loss": 0.0317, + "step": 6236 + }, + { + "epoch": 2.491214057507987, + "grad_norm": 0.9285372176981973, + "learning_rate": 8.510621213250248e-07, + "loss": 0.0305, + "step": 6238 + }, + { + "epoch": 2.4920127795527156, + "grad_norm": 0.8346074025633845, + "learning_rate": 8.484703368653812e-07, + "loss": 0.0273, + "step": 6240 + }, + { + "epoch": 2.492811501597444, + "grad_norm": 0.9152125909961968, + "learning_rate": 8.458821389356647e-07, + "loss": 0.0303, + "step": 6242 + }, + { + "epoch": 2.4936102236421727, + "grad_norm": 0.9838042565841876, + "learning_rate": 8.432975297718321e-07, + "loss": 0.0299, + "step": 6244 + }, + { + "epoch": 2.494408945686901, + "grad_norm": 0.9237463657264794, + "learning_rate": 8.407165116067423e-07, + "loss": 0.0304, + "step": 6246 + }, + { + "epoch": 2.4952076677316293, + "grad_norm": 1.1889649482379565, + "learning_rate": 8.381390866701517e-07, + "loss": 0.0354, + "step": 6248 + }, + { + "epoch": 2.496006389776358, + "grad_norm": 0.9830687823209033, + "learning_rate": 8.355652571887135e-07, + "loss": 0.0317, + "step": 6250 + }, + { + "epoch": 2.4968051118210863, + "grad_norm": 0.8992861344610856, + "learning_rate": 8.329950253859703e-07, + "loss": 0.0299, + "step": 6252 + }, + { + "epoch": 2.497603833865815, + "grad_norm": 1.0201169404321446, + "learning_rate": 8.304283934823626e-07, + "loss": 0.0337, + "step": 6254 + }, + { + "epoch": 2.498402555910543, + "grad_norm": 0.996273309697261, + "learning_rate": 8.278653636952177e-07, + "loss": 0.0309, + "step": 6256 + }, + { + "epoch": 2.4992012779552715, + "grad_norm": 0.8898734510036184, + "learning_rate": 8.25305938238753e-07, + "loss": 0.031, + "step": 6258 + }, + { + "epoch": 2.5, + "grad_norm": 0.8809017767144187, + "learning_rate": 8.227501193240673e-07, + "loss": 0.0344, + "step": 6260 + }, + { + "epoch": 2.5007987220447285, + "grad_norm": 0.9543528237861113, + "learning_rate": 8.201979091591488e-07, + "loss": 0.0282, + "step": 6262 + }, + { + "epoch": 2.501597444089457, + "grad_norm": 1.111241667863821, + "learning_rate": 8.176493099488664e-07, + "loss": 0.0358, + "step": 6264 + }, + { + "epoch": 2.502396166134185, + "grad_norm": 0.869787895901821, + "learning_rate": 8.151043238949697e-07, + "loss": 0.0272, + "step": 6266 + }, + { + "epoch": 2.5031948881789137, + "grad_norm": 0.9762765697943674, + "learning_rate": 8.125629531960849e-07, + "loss": 0.0299, + "step": 6268 + }, + { + "epoch": 2.503993610223642, + "grad_norm": 0.9008988645129132, + "learning_rate": 8.100252000477177e-07, + "loss": 0.0295, + "step": 6270 + }, + { + "epoch": 2.5047923322683707, + "grad_norm": 0.9858370192539189, + "learning_rate": 8.074910666422475e-07, + "loss": 0.0316, + "step": 6272 + }, + { + "epoch": 2.505591054313099, + "grad_norm": 0.9950028382350193, + "learning_rate": 8.049605551689255e-07, + "loss": 0.0314, + "step": 6274 + }, + { + "epoch": 2.5063897763578273, + "grad_norm": 0.8706385388900013, + "learning_rate": 8.024336678138761e-07, + "loss": 0.03, + "step": 6276 + }, + { + "epoch": 2.507188498402556, + "grad_norm": 0.9317584954844236, + "learning_rate": 7.999104067600904e-07, + "loss": 0.0319, + "step": 6278 + }, + { + "epoch": 2.5079872204472844, + "grad_norm": 0.9028386865531697, + "learning_rate": 7.973907741874287e-07, + "loss": 0.0319, + "step": 6280 + }, + { + "epoch": 2.508785942492013, + "grad_norm": 1.0429573535057794, + "learning_rate": 7.948747722726169e-07, + "loss": 0.0362, + "step": 6282 + }, + { + "epoch": 2.5095846645367414, + "grad_norm": 0.9075149519955287, + "learning_rate": 7.923624031892402e-07, + "loss": 0.0297, + "step": 6284 + }, + { + "epoch": 2.5103833865814695, + "grad_norm": 0.9244441640851673, + "learning_rate": 7.898536691077508e-07, + "loss": 0.0356, + "step": 6286 + }, + { + "epoch": 2.511182108626198, + "grad_norm": 0.9718862084825739, + "learning_rate": 7.873485721954572e-07, + "loss": 0.0327, + "step": 6288 + }, + { + "epoch": 2.5119808306709266, + "grad_norm": 0.8694408724126972, + "learning_rate": 7.848471146165287e-07, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 2.512779552715655, + "grad_norm": 0.8938047550855578, + "learning_rate": 7.823492985319858e-07, + "loss": 0.0275, + "step": 6292 + }, + { + "epoch": 2.513578274760383, + "grad_norm": 0.8065785132438869, + "learning_rate": 7.798551260997067e-07, + "loss": 0.0312, + "step": 6294 + }, + { + "epoch": 2.5143769968051117, + "grad_norm": 0.929264264045834, + "learning_rate": 7.773645994744222e-07, + "loss": 0.0256, + "step": 6296 + }, + { + "epoch": 2.5151757188498403, + "grad_norm": 1.3557337175166702, + "learning_rate": 7.748777208077118e-07, + "loss": 0.0288, + "step": 6298 + }, + { + "epoch": 2.515974440894569, + "grad_norm": 1.1432470693302257, + "learning_rate": 7.723944922480037e-07, + "loss": 0.0343, + "step": 6300 + }, + { + "epoch": 2.5167731629392973, + "grad_norm": 1.1285169628245737, + "learning_rate": 7.699149159405734e-07, + "loss": 0.0344, + "step": 6302 + }, + { + "epoch": 2.5175718849840254, + "grad_norm": 0.9264387543909826, + "learning_rate": 7.674389940275406e-07, + "loss": 0.0299, + "step": 6304 + }, + { + "epoch": 2.518370607028754, + "grad_norm": 0.9885710066492656, + "learning_rate": 7.649667286478696e-07, + "loss": 0.0349, + "step": 6306 + }, + { + "epoch": 2.5191693290734825, + "grad_norm": 0.8115719655821809, + "learning_rate": 7.624981219373623e-07, + "loss": 0.0301, + "step": 6308 + }, + { + "epoch": 2.519968051118211, + "grad_norm": 0.8265082284522801, + "learning_rate": 7.600331760286627e-07, + "loss": 0.0291, + "step": 6310 + }, + { + "epoch": 2.520766773162939, + "grad_norm": 0.8510973944883193, + "learning_rate": 7.575718930512516e-07, + "loss": 0.0259, + "step": 6312 + }, + { + "epoch": 2.5215654952076676, + "grad_norm": 0.9955720237715131, + "learning_rate": 7.551142751314455e-07, + "loss": 0.0309, + "step": 6314 + }, + { + "epoch": 2.522364217252396, + "grad_norm": 0.94544787132954, + "learning_rate": 7.526603243923958e-07, + "loss": 0.0278, + "step": 6316 + }, + { + "epoch": 2.5231629392971247, + "grad_norm": 0.7689821541255585, + "learning_rate": 7.502100429540815e-07, + "loss": 0.0283, + "step": 6318 + }, + { + "epoch": 2.523961661341853, + "grad_norm": 0.8326325372584937, + "learning_rate": 7.47763432933315e-07, + "loss": 0.0288, + "step": 6320 + }, + { + "epoch": 2.5247603833865817, + "grad_norm": 0.8468951802399899, + "learning_rate": 7.453204964437394e-07, + "loss": 0.0261, + "step": 6322 + }, + { + "epoch": 2.52555910543131, + "grad_norm": 0.9023769818457569, + "learning_rate": 7.428812355958181e-07, + "loss": 0.0316, + "step": 6324 + }, + { + "epoch": 2.5263578274760383, + "grad_norm": 0.7923001953812997, + "learning_rate": 7.404456524968445e-07, + "loss": 0.0246, + "step": 6326 + }, + { + "epoch": 2.527156549520767, + "grad_norm": 0.8829690105277411, + "learning_rate": 7.380137492509309e-07, + "loss": 0.0262, + "step": 6328 + }, + { + "epoch": 2.527955271565495, + "grad_norm": 0.7804684297568552, + "learning_rate": 7.355855279590146e-07, + "loss": 0.0243, + "step": 6330 + }, + { + "epoch": 2.5287539936102235, + "grad_norm": 1.0507641034180348, + "learning_rate": 7.33160990718847e-07, + "loss": 0.0318, + "step": 6332 + }, + { + "epoch": 2.529552715654952, + "grad_norm": 0.9578888020234547, + "learning_rate": 7.307401396250008e-07, + "loss": 0.0255, + "step": 6334 + }, + { + "epoch": 2.5303514376996805, + "grad_norm": 0.9269807171184239, + "learning_rate": 7.283229767688627e-07, + "loss": 0.0308, + "step": 6336 + }, + { + "epoch": 2.531150159744409, + "grad_norm": 0.8790871744926378, + "learning_rate": 7.259095042386338e-07, + "loss": 0.0301, + "step": 6338 + }, + { + "epoch": 2.5319488817891376, + "grad_norm": 0.9895066251185489, + "learning_rate": 7.23499724119327e-07, + "loss": 0.0291, + "step": 6340 + }, + { + "epoch": 2.5327476038338657, + "grad_norm": 0.8899644518567821, + "learning_rate": 7.210936384927631e-07, + "loss": 0.0311, + "step": 6342 + }, + { + "epoch": 2.533546325878594, + "grad_norm": 1.0693235257658584, + "learning_rate": 7.186912494375736e-07, + "loss": 0.0323, + "step": 6344 + }, + { + "epoch": 2.5343450479233227, + "grad_norm": 1.044274627458564, + "learning_rate": 7.162925590291986e-07, + "loss": 0.0294, + "step": 6346 + }, + { + "epoch": 2.5351437699680512, + "grad_norm": 0.9703071429232554, + "learning_rate": 7.13897569339877e-07, + "loss": 0.0292, + "step": 6348 + }, + { + "epoch": 2.5359424920127793, + "grad_norm": 0.9567409636353577, + "learning_rate": 7.115062824386554e-07, + "loss": 0.0262, + "step": 6350 + }, + { + "epoch": 2.536741214057508, + "grad_norm": 0.8461463315662133, + "learning_rate": 7.091187003913802e-07, + "loss": 0.0298, + "step": 6352 + }, + { + "epoch": 2.5375399361022364, + "grad_norm": 0.9718174385536262, + "learning_rate": 7.067348252606965e-07, + "loss": 0.031, + "step": 6354 + }, + { + "epoch": 2.538338658146965, + "grad_norm": 0.8730259562016701, + "learning_rate": 7.043546591060485e-07, + "loss": 0.0299, + "step": 6356 + }, + { + "epoch": 2.5391373801916934, + "grad_norm": 0.9333674604962541, + "learning_rate": 7.019782039836737e-07, + "loss": 0.0295, + "step": 6358 + }, + { + "epoch": 2.539936102236422, + "grad_norm": 0.888933835128418, + "learning_rate": 6.996054619466053e-07, + "loss": 0.0322, + "step": 6360 + }, + { + "epoch": 2.54073482428115, + "grad_norm": 0.9530139786909172, + "learning_rate": 6.972364350446698e-07, + "loss": 0.032, + "step": 6362 + }, + { + "epoch": 2.5415335463258786, + "grad_norm": 0.8887252902742203, + "learning_rate": 6.948711253244827e-07, + "loss": 0.0312, + "step": 6364 + }, + { + "epoch": 2.542332268370607, + "grad_norm": 0.8352734988241693, + "learning_rate": 6.92509534829447e-07, + "loss": 0.0272, + "step": 6366 + }, + { + "epoch": 2.543130990415335, + "grad_norm": 0.9214285326384319, + "learning_rate": 6.901516655997536e-07, + "loss": 0.0279, + "step": 6368 + }, + { + "epoch": 2.5439297124600637, + "grad_norm": 1.0207519839454122, + "learning_rate": 6.877975196723824e-07, + "loss": 0.0295, + "step": 6370 + }, + { + "epoch": 2.5447284345047922, + "grad_norm": 1.0846736644854258, + "learning_rate": 6.854470990810907e-07, + "loss": 0.0371, + "step": 6372 + }, + { + "epoch": 2.5455271565495208, + "grad_norm": 0.929449666083272, + "learning_rate": 6.831004058564211e-07, + "loss": 0.03, + "step": 6374 + }, + { + "epoch": 2.5463258785942493, + "grad_norm": 0.9865284947555943, + "learning_rate": 6.80757442025694e-07, + "loss": 0.0311, + "step": 6376 + }, + { + "epoch": 2.547124600638978, + "grad_norm": 0.9826417414313666, + "learning_rate": 6.784182096130104e-07, + "loss": 0.0285, + "step": 6378 + }, + { + "epoch": 2.547923322683706, + "grad_norm": 0.9343640996748256, + "learning_rate": 6.76082710639247e-07, + "loss": 0.0292, + "step": 6380 + }, + { + "epoch": 2.5487220447284344, + "grad_norm": 0.9782586764772694, + "learning_rate": 6.737509471220527e-07, + "loss": 0.0327, + "step": 6382 + }, + { + "epoch": 2.549520766773163, + "grad_norm": 0.9736964982154661, + "learning_rate": 6.714229210758516e-07, + "loss": 0.0292, + "step": 6384 + }, + { + "epoch": 2.5503194888178915, + "grad_norm": 0.9366260036987168, + "learning_rate": 6.690986345118389e-07, + "loss": 0.0308, + "step": 6386 + }, + { + "epoch": 2.5511182108626196, + "grad_norm": 1.052050189264594, + "learning_rate": 6.667780894379799e-07, + "loss": 0.0308, + "step": 6388 + }, + { + "epoch": 2.551916932907348, + "grad_norm": 0.9827812854065209, + "learning_rate": 6.644612878590034e-07, + "loss": 0.0316, + "step": 6390 + }, + { + "epoch": 2.5527156549520766, + "grad_norm": 0.8386586055394432, + "learning_rate": 6.621482317764105e-07, + "loss": 0.0305, + "step": 6392 + }, + { + "epoch": 2.553514376996805, + "grad_norm": 0.9894645955986272, + "learning_rate": 6.598389231884628e-07, + "loss": 0.0297, + "step": 6394 + }, + { + "epoch": 2.5543130990415337, + "grad_norm": 0.8839682059943562, + "learning_rate": 6.575333640901855e-07, + "loss": 0.0285, + "step": 6396 + }, + { + "epoch": 2.5551118210862622, + "grad_norm": 0.8887779152215409, + "learning_rate": 6.552315564733625e-07, + "loss": 0.0285, + "step": 6398 + }, + { + "epoch": 2.5559105431309903, + "grad_norm": 1.0490578369091348, + "learning_rate": 6.529335023265387e-07, + "loss": 0.0289, + "step": 6400 + }, + { + "epoch": 2.556709265175719, + "grad_norm": 1.0894646563356374, + "learning_rate": 6.506392036350168e-07, + "loss": 0.0322, + "step": 6402 + }, + { + "epoch": 2.5575079872204474, + "grad_norm": 0.9316306126563935, + "learning_rate": 6.483486623808555e-07, + "loss": 0.0282, + "step": 6404 + }, + { + "epoch": 2.5583067092651754, + "grad_norm": 1.0138875975414274, + "learning_rate": 6.460618805428637e-07, + "loss": 0.029, + "step": 6406 + }, + { + "epoch": 2.559105431309904, + "grad_norm": 0.9409011790958446, + "learning_rate": 6.437788600966066e-07, + "loss": 0.0275, + "step": 6408 + }, + { + "epoch": 2.5599041533546325, + "grad_norm": 1.0950975759175423, + "learning_rate": 6.414996030143982e-07, + "loss": 0.026, + "step": 6410 + }, + { + "epoch": 2.560702875399361, + "grad_norm": 0.8621907379188767, + "learning_rate": 6.392241112653031e-07, + "loss": 0.0263, + "step": 6412 + }, + { + "epoch": 2.5615015974440896, + "grad_norm": 1.0987324235841187, + "learning_rate": 6.369523868151278e-07, + "loss": 0.03, + "step": 6414 + }, + { + "epoch": 2.562300319488818, + "grad_norm": 1.0180249667885073, + "learning_rate": 6.346844316264312e-07, + "loss": 0.0337, + "step": 6416 + }, + { + "epoch": 2.563099041533546, + "grad_norm": 0.918284409010965, + "learning_rate": 6.324202476585112e-07, + "loss": 0.0293, + "step": 6418 + }, + { + "epoch": 2.5638977635782747, + "grad_norm": 0.8900922129741325, + "learning_rate": 6.301598368674106e-07, + "loss": 0.0262, + "step": 6420 + }, + { + "epoch": 2.5646964856230032, + "grad_norm": 0.8698502363272664, + "learning_rate": 6.279032012059089e-07, + "loss": 0.0279, + "step": 6422 + }, + { + "epoch": 2.5654952076677318, + "grad_norm": 0.7647567338632194, + "learning_rate": 6.256503426235277e-07, + "loss": 0.0259, + "step": 6424 + }, + { + "epoch": 2.56629392971246, + "grad_norm": 0.9236140855263162, + "learning_rate": 6.234012630665237e-07, + "loss": 0.0309, + "step": 6426 + }, + { + "epoch": 2.5670926517571884, + "grad_norm": 0.9418851113359151, + "learning_rate": 6.211559644778908e-07, + "loss": 0.0287, + "step": 6428 + }, + { + "epoch": 2.567891373801917, + "grad_norm": 0.9410493586581912, + "learning_rate": 6.189144487973531e-07, + "loss": 0.0245, + "step": 6430 + }, + { + "epoch": 2.5686900958466454, + "grad_norm": 0.8715415469545029, + "learning_rate": 6.166767179613691e-07, + "loss": 0.0272, + "step": 6432 + }, + { + "epoch": 2.569488817891374, + "grad_norm": 1.0011250141948451, + "learning_rate": 6.144427739031284e-07, + "loss": 0.0336, + "step": 6434 + }, + { + "epoch": 2.5702875399361025, + "grad_norm": 0.8930124959505643, + "learning_rate": 6.122126185525462e-07, + "loss": 0.0287, + "step": 6436 + }, + { + "epoch": 2.5710862619808306, + "grad_norm": 0.8345328277096289, + "learning_rate": 6.099862538362678e-07, + "loss": 0.0274, + "step": 6438 + }, + { + "epoch": 2.571884984025559, + "grad_norm": 0.8429043604050549, + "learning_rate": 6.077636816776611e-07, + "loss": 0.0259, + "step": 6440 + }, + { + "epoch": 2.5726837060702876, + "grad_norm": 0.9282997534416909, + "learning_rate": 6.055449039968197e-07, + "loss": 0.0308, + "step": 6442 + }, + { + "epoch": 2.5734824281150157, + "grad_norm": 1.078810276103343, + "learning_rate": 6.033299227105588e-07, + "loss": 0.03, + "step": 6444 + }, + { + "epoch": 2.5742811501597442, + "grad_norm": 0.9006283103160382, + "learning_rate": 6.011187397324114e-07, + "loss": 0.0277, + "step": 6446 + }, + { + "epoch": 2.5750798722044728, + "grad_norm": 1.0725617589003342, + "learning_rate": 5.989113569726312e-07, + "loss": 0.0312, + "step": 6448 + }, + { + "epoch": 2.5758785942492013, + "grad_norm": 1.0044942875803178, + "learning_rate": 5.967077763381895e-07, + "loss": 0.0269, + "step": 6450 + }, + { + "epoch": 2.57667731629393, + "grad_norm": 1.0483372526983028, + "learning_rate": 5.945079997327713e-07, + "loss": 0.0303, + "step": 6452 + }, + { + "epoch": 2.5774760383386583, + "grad_norm": 0.885373336335105, + "learning_rate": 5.923120290567779e-07, + "loss": 0.0292, + "step": 6454 + }, + { + "epoch": 2.5782747603833864, + "grad_norm": 0.9919843795418669, + "learning_rate": 5.901198662073188e-07, + "loss": 0.0372, + "step": 6456 + }, + { + "epoch": 2.579073482428115, + "grad_norm": 1.300703442861457, + "learning_rate": 5.87931513078216e-07, + "loss": 0.0246, + "step": 6458 + }, + { + "epoch": 2.5798722044728435, + "grad_norm": 1.0888147527555645, + "learning_rate": 5.8574697156e-07, + "loss": 0.0368, + "step": 6460 + }, + { + "epoch": 2.580670926517572, + "grad_norm": 1.183957007842984, + "learning_rate": 5.835662435399098e-07, + "loss": 0.0345, + "step": 6462 + }, + { + "epoch": 2.5814696485623, + "grad_norm": 0.9387448812321908, + "learning_rate": 5.813893309018881e-07, + "loss": 0.026, + "step": 6464 + }, + { + "epoch": 2.5822683706070286, + "grad_norm": 0.9875605986563711, + "learning_rate": 5.792162355265812e-07, + "loss": 0.0323, + "step": 6466 + }, + { + "epoch": 2.583067092651757, + "grad_norm": 0.8843843609089413, + "learning_rate": 5.770469592913408e-07, + "loss": 0.029, + "step": 6468 + }, + { + "epoch": 2.5838658146964857, + "grad_norm": 0.9754923443275886, + "learning_rate": 5.748815040702138e-07, + "loss": 0.0307, + "step": 6470 + }, + { + "epoch": 2.584664536741214, + "grad_norm": 1.005644213615307, + "learning_rate": 5.727198717339511e-07, + "loss": 0.0324, + "step": 6472 + }, + { + "epoch": 2.5854632587859427, + "grad_norm": 0.9932008626732668, + "learning_rate": 5.705620641499981e-07, + "loss": 0.0306, + "step": 6474 + }, + { + "epoch": 2.586261980830671, + "grad_norm": 0.9873244519751645, + "learning_rate": 5.684080831824978e-07, + "loss": 0.0331, + "step": 6476 + }, + { + "epoch": 2.5870607028753994, + "grad_norm": 0.9241878530410826, + "learning_rate": 5.662579306922872e-07, + "loss": 0.0322, + "step": 6478 + }, + { + "epoch": 2.587859424920128, + "grad_norm": 1.0995821963687364, + "learning_rate": 5.641116085368931e-07, + "loss": 0.0308, + "step": 6480 + }, + { + "epoch": 2.588658146964856, + "grad_norm": 0.8378812796834532, + "learning_rate": 5.619691185705356e-07, + "loss": 0.0285, + "step": 6482 + }, + { + "epoch": 2.5894568690095845, + "grad_norm": 0.8605425287906566, + "learning_rate": 5.598304626441264e-07, + "loss": 0.0251, + "step": 6484 + }, + { + "epoch": 2.590255591054313, + "grad_norm": 1.0050366580290326, + "learning_rate": 5.576956426052605e-07, + "loss": 0.0287, + "step": 6486 + }, + { + "epoch": 2.5910543130990416, + "grad_norm": 1.065340889189702, + "learning_rate": 5.555646602982207e-07, + "loss": 0.0336, + "step": 6488 + }, + { + "epoch": 2.59185303514377, + "grad_norm": 0.9304491062820247, + "learning_rate": 5.53437517563975e-07, + "loss": 0.0282, + "step": 6490 + }, + { + "epoch": 2.5926517571884986, + "grad_norm": 0.9396575473059782, + "learning_rate": 5.513142162401746e-07, + "loss": 0.0258, + "step": 6492 + }, + { + "epoch": 2.5934504792332267, + "grad_norm": 0.9393766036210395, + "learning_rate": 5.491947581611517e-07, + "loss": 0.0266, + "step": 6494 + }, + { + "epoch": 2.594249201277955, + "grad_norm": 0.9435384740988189, + "learning_rate": 5.470791451579172e-07, + "loss": 0.0317, + "step": 6496 + }, + { + "epoch": 2.5950479233226837, + "grad_norm": 0.9321052021346842, + "learning_rate": 5.449673790581611e-07, + "loss": 0.0299, + "step": 6498 + }, + { + "epoch": 2.5958466453674123, + "grad_norm": 0.9112996056101765, + "learning_rate": 5.428594616862504e-07, + "loss": 0.0269, + "step": 6500 + }, + { + "epoch": 2.5958466453674123, + "eval_loss": 0.18101496994495392, + "eval_runtime": 417.6423, + "eval_samples_per_second": 42.637, + "eval_steps_per_second": 5.33, + "step": 6500 + }, + { + "epoch": 2.5966453674121404, + "grad_norm": 0.9226194319100446, + "learning_rate": 5.407553948632277e-07, + "loss": 0.0304, + "step": 6502 + }, + { + "epoch": 2.597444089456869, + "grad_norm": 0.9405971080377886, + "learning_rate": 5.386551804068063e-07, + "loss": 0.0314, + "step": 6504 + }, + { + "epoch": 2.5982428115015974, + "grad_norm": 0.9385959964411323, + "learning_rate": 5.365588201313737e-07, + "loss": 0.0297, + "step": 6506 + }, + { + "epoch": 2.599041533546326, + "grad_norm": 0.84541282801553, + "learning_rate": 5.344663158479901e-07, + "loss": 0.0268, + "step": 6508 + }, + { + "epoch": 2.5998402555910545, + "grad_norm": 0.9713712244776086, + "learning_rate": 5.323776693643784e-07, + "loss": 0.0306, + "step": 6510 + }, + { + "epoch": 2.600638977635783, + "grad_norm": 0.8383820244075577, + "learning_rate": 5.302928824849335e-07, + "loss": 0.0265, + "step": 6512 + }, + { + "epoch": 2.601437699680511, + "grad_norm": 1.124289996932428, + "learning_rate": 5.282119570107147e-07, + "loss": 0.0322, + "step": 6514 + }, + { + "epoch": 2.6022364217252396, + "grad_norm": 0.9423889108993881, + "learning_rate": 5.261348947394451e-07, + "loss": 0.0281, + "step": 6516 + }, + { + "epoch": 2.603035143769968, + "grad_norm": 0.9129093506596754, + "learning_rate": 5.240616974655116e-07, + "loss": 0.0279, + "step": 6518 + }, + { + "epoch": 2.6038338658146962, + "grad_norm": 0.7842673666707356, + "learning_rate": 5.219923669799587e-07, + "loss": 0.0241, + "step": 6520 + }, + { + "epoch": 2.6046325878594248, + "grad_norm": 1.1436169228498474, + "learning_rate": 5.199269050704935e-07, + "loss": 0.0325, + "step": 6522 + }, + { + "epoch": 2.6054313099041533, + "grad_norm": 0.8690653883300302, + "learning_rate": 5.178653135214811e-07, + "loss": 0.0298, + "step": 6524 + }, + { + "epoch": 2.606230031948882, + "grad_norm": 0.8819467380201907, + "learning_rate": 5.158075941139429e-07, + "loss": 0.0291, + "step": 6526 + }, + { + "epoch": 2.6070287539936103, + "grad_norm": 0.9898669074800155, + "learning_rate": 5.137537486255517e-07, + "loss": 0.0306, + "step": 6528 + }, + { + "epoch": 2.607827476038339, + "grad_norm": 1.129969353766448, + "learning_rate": 5.117037788306367e-07, + "loss": 0.029, + "step": 6530 + }, + { + "epoch": 2.608626198083067, + "grad_norm": 1.18791037882053, + "learning_rate": 5.096576865001802e-07, + "loss": 0.0308, + "step": 6532 + }, + { + "epoch": 2.6094249201277955, + "grad_norm": 0.9477100539787796, + "learning_rate": 5.07615473401813e-07, + "loss": 0.0277, + "step": 6534 + }, + { + "epoch": 2.610223642172524, + "grad_norm": 0.9068341807452427, + "learning_rate": 5.055771412998122e-07, + "loss": 0.0289, + "step": 6536 + }, + { + "epoch": 2.6110223642172525, + "grad_norm": 0.895582110683765, + "learning_rate": 5.035426919551062e-07, + "loss": 0.0268, + "step": 6538 + }, + { + "epoch": 2.6118210862619806, + "grad_norm": 0.9258207040673696, + "learning_rate": 5.015121271252659e-07, + "loss": 0.0287, + "step": 6540 + }, + { + "epoch": 2.612619808306709, + "grad_norm": 0.8949264605397048, + "learning_rate": 4.994854485645106e-07, + "loss": 0.0291, + "step": 6542 + }, + { + "epoch": 2.6134185303514377, + "grad_norm": 1.0785382270595747, + "learning_rate": 4.974626580236957e-07, + "loss": 0.0302, + "step": 6544 + }, + { + "epoch": 2.614217252396166, + "grad_norm": 1.0483244361602544, + "learning_rate": 4.954437572503235e-07, + "loss": 0.031, + "step": 6546 + }, + { + "epoch": 2.6150159744408947, + "grad_norm": 1.028765681064945, + "learning_rate": 4.934287479885336e-07, + "loss": 0.0303, + "step": 6548 + }, + { + "epoch": 2.6158146964856233, + "grad_norm": 0.9557621074130701, + "learning_rate": 4.914176319791037e-07, + "loss": 0.025, + "step": 6550 + }, + { + "epoch": 2.6166134185303513, + "grad_norm": 0.8620869517168456, + "learning_rate": 4.894104109594466e-07, + "loss": 0.0275, + "step": 6552 + }, + { + "epoch": 2.61741214057508, + "grad_norm": 0.8437067964701872, + "learning_rate": 4.874070866636149e-07, + "loss": 0.0274, + "step": 6554 + }, + { + "epoch": 2.6182108626198084, + "grad_norm": 0.8006419089172357, + "learning_rate": 4.854076608222901e-07, + "loss": 0.0303, + "step": 6556 + }, + { + "epoch": 2.6190095846645365, + "grad_norm": 0.8823397281540788, + "learning_rate": 4.834121351627885e-07, + "loss": 0.0254, + "step": 6558 + }, + { + "epoch": 2.619808306709265, + "grad_norm": 0.8499615378338659, + "learning_rate": 4.814205114090543e-07, + "loss": 0.0239, + "step": 6560 + }, + { + "epoch": 2.6206070287539935, + "grad_norm": 1.03836923861178, + "learning_rate": 4.794327912816637e-07, + "loss": 0.0337, + "step": 6562 + }, + { + "epoch": 2.621405750798722, + "grad_norm": 1.064067294335829, + "learning_rate": 4.774489764978185e-07, + "loss": 0.0267, + "step": 6564 + }, + { + "epoch": 2.6222044728434506, + "grad_norm": 0.9357645207898694, + "learning_rate": 4.754690687713498e-07, + "loss": 0.0281, + "step": 6566 + }, + { + "epoch": 2.623003194888179, + "grad_norm": 0.9735104123969658, + "learning_rate": 4.734930698127077e-07, + "loss": 0.0294, + "step": 6568 + }, + { + "epoch": 2.623801916932907, + "grad_norm": 1.0247460222068252, + "learning_rate": 4.715209813289706e-07, + "loss": 0.0307, + "step": 6570 + }, + { + "epoch": 2.6246006389776357, + "grad_norm": 0.9357520689720981, + "learning_rate": 4.695528050238368e-07, + "loss": 0.031, + "step": 6572 + }, + { + "epoch": 2.6253993610223643, + "grad_norm": 0.9787665294252466, + "learning_rate": 4.675885425976251e-07, + "loss": 0.031, + "step": 6574 + }, + { + "epoch": 2.626198083067093, + "grad_norm": 1.0180864802210927, + "learning_rate": 4.6562819574727304e-07, + "loss": 0.0281, + "step": 6576 + }, + { + "epoch": 2.626996805111821, + "grad_norm": 0.8926200488656815, + "learning_rate": 4.6367176616633426e-07, + "loss": 0.0295, + "step": 6578 + }, + { + "epoch": 2.6277955271565494, + "grad_norm": 1.0500600630048418, + "learning_rate": 4.6171925554498066e-07, + "loss": 0.0333, + "step": 6580 + }, + { + "epoch": 2.628594249201278, + "grad_norm": 1.182205096455108, + "learning_rate": 4.597706655699974e-07, + "loss": 0.0343, + "step": 6582 + }, + { + "epoch": 2.6293929712460065, + "grad_norm": 0.8380383559912178, + "learning_rate": 4.578259979247801e-07, + "loss": 0.0249, + "step": 6584 + }, + { + "epoch": 2.630191693290735, + "grad_norm": 0.9217249683454195, + "learning_rate": 4.558852542893405e-07, + "loss": 0.0274, + "step": 6586 + }, + { + "epoch": 2.6309904153354635, + "grad_norm": 0.9654985885991184, + "learning_rate": 4.539484363402963e-07, + "loss": 0.0313, + "step": 6588 + }, + { + "epoch": 2.6317891373801916, + "grad_norm": 0.9416775486966591, + "learning_rate": 4.520155457508768e-07, + "loss": 0.0278, + "step": 6590 + }, + { + "epoch": 2.63258785942492, + "grad_norm": 1.0149290675646854, + "learning_rate": 4.500865841909169e-07, + "loss": 0.0283, + "step": 6592 + }, + { + "epoch": 2.6333865814696487, + "grad_norm": 1.0066112830843752, + "learning_rate": 4.4816155332685687e-07, + "loss": 0.0321, + "step": 6594 + }, + { + "epoch": 2.6341853035143767, + "grad_norm": 0.9399055283517556, + "learning_rate": 4.462404548217414e-07, + "loss": 0.0274, + "step": 6596 + }, + { + "epoch": 2.6349840255591053, + "grad_norm": 0.9221492425778504, + "learning_rate": 4.4432329033521903e-07, + "loss": 0.0295, + "step": 6598 + }, + { + "epoch": 2.635782747603834, + "grad_norm": 0.890695263755349, + "learning_rate": 4.4241006152353885e-07, + "loss": 0.0253, + "step": 6600 + }, + { + "epoch": 2.6365814696485623, + "grad_norm": 0.9178158597505024, + "learning_rate": 4.405007700395497e-07, + "loss": 0.0281, + "step": 6602 + }, + { + "epoch": 2.637380191693291, + "grad_norm": 1.0639158204341026, + "learning_rate": 4.385954175326995e-07, + "loss": 0.0264, + "step": 6604 + }, + { + "epoch": 2.6381789137380194, + "grad_norm": 1.012323859666103, + "learning_rate": 4.366940056490343e-07, + "loss": 0.0308, + "step": 6606 + }, + { + "epoch": 2.6389776357827475, + "grad_norm": 0.9727641969282357, + "learning_rate": 4.3479653603119287e-07, + "loss": 0.0271, + "step": 6608 + }, + { + "epoch": 2.639776357827476, + "grad_norm": 0.8772595568384365, + "learning_rate": 4.329030103184095e-07, + "loss": 0.0259, + "step": 6610 + }, + { + "epoch": 2.6405750798722045, + "grad_norm": 0.8227589976875681, + "learning_rate": 4.3101343014651356e-07, + "loss": 0.0277, + "step": 6612 + }, + { + "epoch": 2.641373801916933, + "grad_norm": 0.9510948759982237, + "learning_rate": 4.2912779714792296e-07, + "loss": 0.0265, + "step": 6614 + }, + { + "epoch": 2.642172523961661, + "grad_norm": 1.0097839001611448, + "learning_rate": 4.2724611295164755e-07, + "loss": 0.026, + "step": 6616 + }, + { + "epoch": 2.6429712460063897, + "grad_norm": 0.9098862544268895, + "learning_rate": 4.2536837918328353e-07, + "loss": 0.0306, + "step": 6618 + }, + { + "epoch": 2.643769968051118, + "grad_norm": 0.8802136378624498, + "learning_rate": 4.2349459746501674e-07, + "loss": 0.0249, + "step": 6620 + }, + { + "epoch": 2.6445686900958467, + "grad_norm": 0.9041537710345738, + "learning_rate": 4.2162476941561723e-07, + "loss": 0.0291, + "step": 6622 + }, + { + "epoch": 2.6453674121405752, + "grad_norm": 1.007668717045686, + "learning_rate": 4.197588966504401e-07, + "loss": 0.0269, + "step": 6624 + }, + { + "epoch": 2.6461661341853038, + "grad_norm": 0.9464297822234001, + "learning_rate": 4.178969807814237e-07, + "loss": 0.0292, + "step": 6626 + }, + { + "epoch": 2.646964856230032, + "grad_norm": 0.9545384276330572, + "learning_rate": 4.1603902341708804e-07, + "loss": 0.0315, + "step": 6628 + }, + { + "epoch": 2.6477635782747604, + "grad_norm": 0.9924287706034195, + "learning_rate": 4.1418502616253185e-07, + "loss": 0.0293, + "step": 6630 + }, + { + "epoch": 2.648562300319489, + "grad_norm": 0.8072318680535587, + "learning_rate": 4.123349906194357e-07, + "loss": 0.0281, + "step": 6632 + }, + { + "epoch": 2.649361022364217, + "grad_norm": 0.8755579059821392, + "learning_rate": 4.1048891838605386e-07, + "loss": 0.0243, + "step": 6634 + }, + { + "epoch": 2.6501597444089455, + "grad_norm": 1.1069033803045907, + "learning_rate": 4.0864681105721895e-07, + "loss": 0.031, + "step": 6636 + }, + { + "epoch": 2.650958466453674, + "grad_norm": 0.9327831704930124, + "learning_rate": 4.068086702243379e-07, + "loss": 0.0283, + "step": 6638 + }, + { + "epoch": 2.6517571884984026, + "grad_norm": 1.0816491366183623, + "learning_rate": 4.0497449747539217e-07, + "loss": 0.0324, + "step": 6640 + }, + { + "epoch": 2.652555910543131, + "grad_norm": 0.9653480667623733, + "learning_rate": 4.031442943949321e-07, + "loss": 0.0271, + "step": 6642 + }, + { + "epoch": 2.6533546325878596, + "grad_norm": 1.025462257232587, + "learning_rate": 4.013180625640811e-07, + "loss": 0.0268, + "step": 6644 + }, + { + "epoch": 2.6541533546325877, + "grad_norm": 1.0073308408827422, + "learning_rate": 3.994958035605323e-07, + "loss": 0.0335, + "step": 6646 + }, + { + "epoch": 2.6549520766773163, + "grad_norm": 0.7759996809586879, + "learning_rate": 3.9767751895854467e-07, + "loss": 0.0278, + "step": 6648 + }, + { + "epoch": 2.655750798722045, + "grad_norm": 0.9648450344658905, + "learning_rate": 3.958632103289439e-07, + "loss": 0.0323, + "step": 6650 + }, + { + "epoch": 2.6565495207667733, + "grad_norm": 0.8621409106150557, + "learning_rate": 3.940528792391224e-07, + "loss": 0.0223, + "step": 6652 + }, + { + "epoch": 2.6573482428115014, + "grad_norm": 0.8191488899403426, + "learning_rate": 3.9224652725303514e-07, + "loss": 0.0286, + "step": 6654 + }, + { + "epoch": 2.65814696485623, + "grad_norm": 1.0056039386415438, + "learning_rate": 3.904441559312006e-07, + "loss": 0.0311, + "step": 6656 + }, + { + "epoch": 2.6589456869009584, + "grad_norm": 1.0006415392629169, + "learning_rate": 3.886457668306959e-07, + "loss": 0.031, + "step": 6658 + }, + { + "epoch": 2.659744408945687, + "grad_norm": 1.01436188125872, + "learning_rate": 3.8685136150516056e-07, + "loss": 0.032, + "step": 6660 + }, + { + "epoch": 2.6605431309904155, + "grad_norm": 0.9807096116942686, + "learning_rate": 3.8506094150479125e-07, + "loss": 0.0266, + "step": 6662 + }, + { + "epoch": 2.661341853035144, + "grad_norm": 1.0204227532844772, + "learning_rate": 3.8327450837634284e-07, + "loss": 0.0283, + "step": 6664 + }, + { + "epoch": 2.662140575079872, + "grad_norm": 0.8193622993500249, + "learning_rate": 3.8149206366312365e-07, + "loss": 0.0269, + "step": 6666 + }, + { + "epoch": 2.6629392971246006, + "grad_norm": 0.9734485918188198, + "learning_rate": 3.7971360890499686e-07, + "loss": 0.0266, + "step": 6668 + }, + { + "epoch": 2.663738019169329, + "grad_norm": 0.8642796045293297, + "learning_rate": 3.7793914563838187e-07, + "loss": 0.0251, + "step": 6670 + }, + { + "epoch": 2.6645367412140573, + "grad_norm": 1.033909020021237, + "learning_rate": 3.7616867539624733e-07, + "loss": 0.0352, + "step": 6672 + }, + { + "epoch": 2.665335463258786, + "grad_norm": 0.9998346566813847, + "learning_rate": 3.7440219970811155e-07, + "loss": 0.0321, + "step": 6674 + }, + { + "epoch": 2.6661341853035143, + "grad_norm": 1.1423057215989467, + "learning_rate": 3.7263972010004256e-07, + "loss": 0.0324, + "step": 6676 + }, + { + "epoch": 2.666932907348243, + "grad_norm": 0.8695749763174104, + "learning_rate": 3.708812380946569e-07, + "loss": 0.0256, + "step": 6678 + }, + { + "epoch": 2.6677316293929714, + "grad_norm": 0.9934106793723311, + "learning_rate": 3.691267552111183e-07, + "loss": 0.0302, + "step": 6680 + }, + { + "epoch": 2.6685303514377, + "grad_norm": 1.0381623580396713, + "learning_rate": 3.67376272965132e-07, + "loss": 0.0281, + "step": 6682 + }, + { + "epoch": 2.669329073482428, + "grad_norm": 0.8905369484200444, + "learning_rate": 3.6562979286895115e-07, + "loss": 0.0255, + "step": 6684 + }, + { + "epoch": 2.6701277955271565, + "grad_norm": 0.8948659099592012, + "learning_rate": 3.6388731643136944e-07, + "loss": 0.0252, + "step": 6686 + }, + { + "epoch": 2.670926517571885, + "grad_norm": 0.9124920896558937, + "learning_rate": 3.621488451577221e-07, + "loss": 0.0293, + "step": 6688 + }, + { + "epoch": 2.6717252396166136, + "grad_norm": 0.7777444783097159, + "learning_rate": 3.60414380549885e-07, + "loss": 0.0251, + "step": 6690 + }, + { + "epoch": 2.6725239616613417, + "grad_norm": 0.9000009491244071, + "learning_rate": 3.586839241062695e-07, + "loss": 0.0254, + "step": 6692 + }, + { + "epoch": 2.67332268370607, + "grad_norm": 0.9763457977314538, + "learning_rate": 3.5695747732182873e-07, + "loss": 0.0307, + "step": 6694 + }, + { + "epoch": 2.6741214057507987, + "grad_norm": 1.1307665134525087, + "learning_rate": 3.552350416880507e-07, + "loss": 0.0297, + "step": 6696 + }, + { + "epoch": 2.6749201277955272, + "grad_norm": 0.864801567907232, + "learning_rate": 3.535166186929556e-07, + "loss": 0.0296, + "step": 6698 + }, + { + "epoch": 2.6757188498402558, + "grad_norm": 0.8870307933812397, + "learning_rate": 3.518022098210988e-07, + "loss": 0.0292, + "step": 6700 + }, + { + "epoch": 2.6765175718849843, + "grad_norm": 1.0409321048802573, + "learning_rate": 3.500918165535683e-07, + "loss": 0.0345, + "step": 6702 + }, + { + "epoch": 2.6773162939297124, + "grad_norm": 0.9716769842609737, + "learning_rate": 3.483854403679832e-07, + "loss": 0.0296, + "step": 6704 + }, + { + "epoch": 2.678115015974441, + "grad_norm": 1.257329352898907, + "learning_rate": 3.4668308273848985e-07, + "loss": 0.0267, + "step": 6706 + }, + { + "epoch": 2.6789137380191694, + "grad_norm": 1.0441757060258385, + "learning_rate": 3.4498474513576574e-07, + "loss": 0.0317, + "step": 6708 + }, + { + "epoch": 2.6797124600638975, + "grad_norm": 1.1129495022310048, + "learning_rate": 3.432904290270139e-07, + "loss": 0.0286, + "step": 6710 + }, + { + "epoch": 2.680511182108626, + "grad_norm": 0.9584352042603548, + "learning_rate": 3.416001358759635e-07, + "loss": 0.028, + "step": 6712 + }, + { + "epoch": 2.6813099041533546, + "grad_norm": 1.1251450657365984, + "learning_rate": 3.3991386714286924e-07, + "loss": 0.0278, + "step": 6714 + }, + { + "epoch": 2.682108626198083, + "grad_norm": 0.9969919886738752, + "learning_rate": 3.382316242845074e-07, + "loss": 0.0328, + "step": 6716 + }, + { + "epoch": 2.6829073482428116, + "grad_norm": 1.1126055490955078, + "learning_rate": 3.365534087541772e-07, + "loss": 0.0277, + "step": 6718 + }, + { + "epoch": 2.68370607028754, + "grad_norm": 1.0455467801276128, + "learning_rate": 3.3487922200169944e-07, + "loss": 0.0314, + "step": 6720 + }, + { + "epoch": 2.6845047923322682, + "grad_norm": 0.9840651799523558, + "learning_rate": 3.332090654734116e-07, + "loss": 0.0325, + "step": 6722 + }, + { + "epoch": 2.6853035143769968, + "grad_norm": 0.9993440868747537, + "learning_rate": 3.315429406121723e-07, + "loss": 0.0285, + "step": 6724 + }, + { + "epoch": 2.6861022364217253, + "grad_norm": 1.041727821587971, + "learning_rate": 3.2988084885735684e-07, + "loss": 0.0351, + "step": 6726 + }, + { + "epoch": 2.686900958466454, + "grad_norm": 0.979266522959519, + "learning_rate": 3.2822279164485494e-07, + "loss": 0.0267, + "step": 6728 + }, + { + "epoch": 2.687699680511182, + "grad_norm": 0.8894076061044695, + "learning_rate": 3.2656877040707247e-07, + "loss": 0.0306, + "step": 6730 + }, + { + "epoch": 2.6884984025559104, + "grad_norm": 1.1099161112289966, + "learning_rate": 3.2491878657292643e-07, + "loss": 0.029, + "step": 6732 + }, + { + "epoch": 2.689297124600639, + "grad_norm": 0.7905372930822995, + "learning_rate": 3.2327284156784765e-07, + "loss": 0.0248, + "step": 6734 + }, + { + "epoch": 2.6900958466453675, + "grad_norm": 0.8972437845079343, + "learning_rate": 3.2163093681377765e-07, + "loss": 0.0322, + "step": 6736 + }, + { + "epoch": 2.690894568690096, + "grad_norm": 0.7811198103953365, + "learning_rate": 3.1999307372916675e-07, + "loss": 0.0211, + "step": 6738 + }, + { + "epoch": 2.6916932907348246, + "grad_norm": 0.8481986944549397, + "learning_rate": 3.183592537289748e-07, + "loss": 0.0258, + "step": 6740 + }, + { + "epoch": 2.6924920127795526, + "grad_norm": 0.8974957564028379, + "learning_rate": 3.1672947822466714e-07, + "loss": 0.0307, + "step": 6742 + }, + { + "epoch": 2.693290734824281, + "grad_norm": 0.8069628364489961, + "learning_rate": 3.151037486242181e-07, + "loss": 0.0253, + "step": 6744 + }, + { + "epoch": 2.6940894568690097, + "grad_norm": 1.0619428207925592, + "learning_rate": 3.13482066332102e-07, + "loss": 0.0328, + "step": 6746 + }, + { + "epoch": 2.6948881789137378, + "grad_norm": 1.0230784961867097, + "learning_rate": 3.1186443274930035e-07, + "loss": 0.033, + "step": 6748 + }, + { + "epoch": 2.6956869009584663, + "grad_norm": 0.9466613888969522, + "learning_rate": 3.102508492732964e-07, + "loss": 0.0271, + "step": 6750 + }, + { + "epoch": 2.696485623003195, + "grad_norm": 0.9853478032894241, + "learning_rate": 3.08641317298074e-07, + "loss": 0.0321, + "step": 6752 + }, + { + "epoch": 2.6972843450479234, + "grad_norm": 1.0460709064794675, + "learning_rate": 3.07035838214117e-07, + "loss": 0.0354, + "step": 6754 + }, + { + "epoch": 2.698083067092652, + "grad_norm": 0.9909587345397065, + "learning_rate": 3.0543441340840696e-07, + "loss": 0.0324, + "step": 6756 + }, + { + "epoch": 2.6988817891373804, + "grad_norm": 1.0147873081466183, + "learning_rate": 3.0383704426442396e-07, + "loss": 0.0325, + "step": 6758 + }, + { + "epoch": 2.6996805111821085, + "grad_norm": 0.9477745651098728, + "learning_rate": 3.022437321621452e-07, + "loss": 0.0307, + "step": 6760 + }, + { + "epoch": 2.700479233226837, + "grad_norm": 0.8576252217343889, + "learning_rate": 3.006544784780413e-07, + "loss": 0.0261, + "step": 6762 + }, + { + "epoch": 2.7012779552715656, + "grad_norm": 0.929114631572117, + "learning_rate": 2.9906928458507735e-07, + "loss": 0.0298, + "step": 6764 + }, + { + "epoch": 2.702076677316294, + "grad_norm": 0.8245393411728102, + "learning_rate": 2.9748815185271174e-07, + "loss": 0.0273, + "step": 6766 + }, + { + "epoch": 2.702875399361022, + "grad_norm": 1.0143221518300005, + "learning_rate": 2.959110816468935e-07, + "loss": 0.0294, + "step": 6768 + }, + { + "epoch": 2.7036741214057507, + "grad_norm": 1.1519123024674627, + "learning_rate": 2.94338075330064e-07, + "loss": 0.0287, + "step": 6770 + }, + { + "epoch": 2.7044728434504792, + "grad_norm": 0.9244500535568194, + "learning_rate": 2.927691342611505e-07, + "loss": 0.032, + "step": 6772 + }, + { + "epoch": 2.7052715654952078, + "grad_norm": 0.8604757808842944, + "learning_rate": 2.9120425979557e-07, + "loss": 0.0244, + "step": 6774 + }, + { + "epoch": 2.7060702875399363, + "grad_norm": 0.8443290403852196, + "learning_rate": 2.896434532852277e-07, + "loss": 0.0266, + "step": 6776 + }, + { + "epoch": 2.706869009584665, + "grad_norm": 0.9923984530744449, + "learning_rate": 2.880867160785128e-07, + "loss": 0.0299, + "step": 6778 + }, + { + "epoch": 2.707667731629393, + "grad_norm": 0.9535512491511288, + "learning_rate": 2.865340495202984e-07, + "loss": 0.0289, + "step": 6780 + }, + { + "epoch": 2.7084664536741214, + "grad_norm": 0.9510201264153861, + "learning_rate": 2.849854549519426e-07, + "loss": 0.0294, + "step": 6782 + }, + { + "epoch": 2.70926517571885, + "grad_norm": 0.9033130404655279, + "learning_rate": 2.834409337112842e-07, + "loss": 0.0273, + "step": 6784 + }, + { + "epoch": 2.710063897763578, + "grad_norm": 1.0548477729978563, + "learning_rate": 2.8190048713264586e-07, + "loss": 0.0311, + "step": 6786 + }, + { + "epoch": 2.7108626198083066, + "grad_norm": 0.9948422314656845, + "learning_rate": 2.8036411654682627e-07, + "loss": 0.0304, + "step": 6788 + }, + { + "epoch": 2.711661341853035, + "grad_norm": 0.8822930619200026, + "learning_rate": 2.7883182328110494e-07, + "loss": 0.0281, + "step": 6790 + }, + { + "epoch": 2.7124600638977636, + "grad_norm": 1.0705668513390736, + "learning_rate": 2.7730360865923954e-07, + "loss": 0.0291, + "step": 6792 + }, + { + "epoch": 2.713258785942492, + "grad_norm": 0.9950549300332445, + "learning_rate": 2.75779474001463e-07, + "loss": 0.0306, + "step": 6794 + }, + { + "epoch": 2.7140575079872207, + "grad_norm": 0.9852425718587328, + "learning_rate": 2.7425942062448254e-07, + "loss": 0.0291, + "step": 6796 + }, + { + "epoch": 2.7148562300319488, + "grad_norm": 0.9530301493388222, + "learning_rate": 2.727434498414827e-07, + "loss": 0.0305, + "step": 6798 + }, + { + "epoch": 2.7156549520766773, + "grad_norm": 0.9617585154543935, + "learning_rate": 2.712315629621176e-07, + "loss": 0.0304, + "step": 6800 + }, + { + "epoch": 2.716453674121406, + "grad_norm": 0.8160240654532492, + "learning_rate": 2.697237612925169e-07, + "loss": 0.0254, + "step": 6802 + }, + { + "epoch": 2.7172523961661343, + "grad_norm": 1.0732849620331193, + "learning_rate": 2.682200461352763e-07, + "loss": 0.029, + "step": 6804 + }, + { + "epoch": 2.7180511182108624, + "grad_norm": 1.1148345917930245, + "learning_rate": 2.6672041878946507e-07, + "loss": 0.0364, + "step": 6806 + }, + { + "epoch": 2.718849840255591, + "grad_norm": 0.9845373546577002, + "learning_rate": 2.6522488055062076e-07, + "loss": 0.0323, + "step": 6808 + }, + { + "epoch": 2.7196485623003195, + "grad_norm": 0.9694597420527851, + "learning_rate": 2.6373343271074657e-07, + "loss": 0.0305, + "step": 6810 + }, + { + "epoch": 2.720447284345048, + "grad_norm": 0.8865423882611635, + "learning_rate": 2.6224607655831236e-07, + "loss": 0.0301, + "step": 6812 + }, + { + "epoch": 2.7212460063897765, + "grad_norm": 1.007509492035804, + "learning_rate": 2.607628133782536e-07, + "loss": 0.0306, + "step": 6814 + }, + { + "epoch": 2.722044728434505, + "grad_norm": 0.8870706562836438, + "learning_rate": 2.5928364445196975e-07, + "loss": 0.0294, + "step": 6816 + }, + { + "epoch": 2.722843450479233, + "grad_norm": 0.9708567758541311, + "learning_rate": 2.578085710573247e-07, + "loss": 0.028, + "step": 6818 + }, + { + "epoch": 2.7236421725239617, + "grad_norm": 0.8934830296672568, + "learning_rate": 2.563375944686397e-07, + "loss": 0.0306, + "step": 6820 + }, + { + "epoch": 2.72444089456869, + "grad_norm": 1.072610973986754, + "learning_rate": 2.548707159567021e-07, + "loss": 0.0288, + "step": 6822 + }, + { + "epoch": 2.7252396166134183, + "grad_norm": 0.8222963836412455, + "learning_rate": 2.534079367887549e-07, + "loss": 0.0278, + "step": 6824 + }, + { + "epoch": 2.726038338658147, + "grad_norm": 0.9085074348725313, + "learning_rate": 2.519492582285027e-07, + "loss": 0.0304, + "step": 6826 + }, + { + "epoch": 2.7268370607028753, + "grad_norm": 0.9177610792802552, + "learning_rate": 2.504946815361065e-07, + "loss": 0.0331, + "step": 6828 + }, + { + "epoch": 2.727635782747604, + "grad_norm": 1.094130825800624, + "learning_rate": 2.4904420796818097e-07, + "loss": 0.0303, + "step": 6830 + }, + { + "epoch": 2.7284345047923324, + "grad_norm": 1.0338426208891864, + "learning_rate": 2.475978387778e-07, + "loss": 0.0311, + "step": 6832 + }, + { + "epoch": 2.729233226837061, + "grad_norm": 0.8384072672060133, + "learning_rate": 2.461555752144912e-07, + "loss": 0.0292, + "step": 6834 + }, + { + "epoch": 2.730031948881789, + "grad_norm": 1.015906477269788, + "learning_rate": 2.447174185242324e-07, + "loss": 0.0277, + "step": 6836 + }, + { + "epoch": 2.7308306709265175, + "grad_norm": 0.9215903866822917, + "learning_rate": 2.432833699494558e-07, + "loss": 0.0277, + "step": 6838 + }, + { + "epoch": 2.731629392971246, + "grad_norm": 0.913267265555896, + "learning_rate": 2.4185343072904376e-07, + "loss": 0.0248, + "step": 6840 + }, + { + "epoch": 2.7324281150159746, + "grad_norm": 0.9333269872247636, + "learning_rate": 2.404276020983304e-07, + "loss": 0.0292, + "step": 6842 + }, + { + "epoch": 2.7332268370607027, + "grad_norm": 1.0187672549108133, + "learning_rate": 2.3900588528909475e-07, + "loss": 0.0344, + "step": 6844 + }, + { + "epoch": 2.734025559105431, + "grad_norm": 0.9419705337564647, + "learning_rate": 2.375882815295677e-07, + "loss": 0.0286, + "step": 6846 + }, + { + "epoch": 2.7348242811501597, + "grad_norm": 0.918071642194504, + "learning_rate": 2.3617479204442462e-07, + "loss": 0.028, + "step": 6848 + }, + { + "epoch": 2.7356230031948883, + "grad_norm": 1.0220943478986433, + "learning_rate": 2.3476541805478647e-07, + "loss": 0.0287, + "step": 6850 + }, + { + "epoch": 2.736421725239617, + "grad_norm": 1.0904072022439066, + "learning_rate": 2.3336016077822154e-07, + "loss": 0.0296, + "step": 6852 + }, + { + "epoch": 2.737220447284345, + "grad_norm": 0.9166980890980327, + "learning_rate": 2.3195902142873593e-07, + "loss": 0.0269, + "step": 6854 + }, + { + "epoch": 2.7380191693290734, + "grad_norm": 0.9240202525664635, + "learning_rate": 2.305620012167853e-07, + "loss": 0.028, + "step": 6856 + }, + { + "epoch": 2.738817891373802, + "grad_norm": 0.9499828140577764, + "learning_rate": 2.2916910134926197e-07, + "loss": 0.0273, + "step": 6858 + }, + { + "epoch": 2.7396166134185305, + "grad_norm": 1.0046665742939103, + "learning_rate": 2.2778032302949948e-07, + "loss": 0.0299, + "step": 6860 + }, + { + "epoch": 2.7404153354632586, + "grad_norm": 0.9346433823459767, + "learning_rate": 2.2639566745727203e-07, + "loss": 0.0301, + "step": 6862 + }, + { + "epoch": 2.741214057507987, + "grad_norm": 1.0474839299661065, + "learning_rate": 2.2501513582879108e-07, + "loss": 0.035, + "step": 6864 + }, + { + "epoch": 2.7420127795527156, + "grad_norm": 0.9326384987743596, + "learning_rate": 2.236387293367054e-07, + "loss": 0.0267, + "step": 6866 + }, + { + "epoch": 2.742811501597444, + "grad_norm": 0.9513219854071373, + "learning_rate": 2.2226644917010153e-07, + "loss": 0.0299, + "step": 6868 + }, + { + "epoch": 2.7436102236421727, + "grad_norm": 0.8426342593529366, + "learning_rate": 2.208982965144979e-07, + "loss": 0.0245, + "step": 6870 + }, + { + "epoch": 2.744408945686901, + "grad_norm": 0.967982836150597, + "learning_rate": 2.1953427255185122e-07, + "loss": 0.0333, + "step": 6872 + }, + { + "epoch": 2.7452076677316293, + "grad_norm": 0.9628141955997789, + "learning_rate": 2.1817437846054889e-07, + "loss": 0.032, + "step": 6874 + }, + { + "epoch": 2.746006389776358, + "grad_norm": 0.9095860682951128, + "learning_rate": 2.1681861541541117e-07, + "loss": 0.027, + "step": 6876 + }, + { + "epoch": 2.7468051118210863, + "grad_norm": 0.9999018913590277, + "learning_rate": 2.1546698458768888e-07, + "loss": 0.0266, + "step": 6878 + }, + { + "epoch": 2.747603833865815, + "grad_norm": 0.9155500721532669, + "learning_rate": 2.1411948714506414e-07, + "loss": 0.0248, + "step": 6880 + }, + { + "epoch": 2.748402555910543, + "grad_norm": 0.899083057917597, + "learning_rate": 2.1277612425164796e-07, + "loss": 0.0291, + "step": 6882 + }, + { + "epoch": 2.7492012779552715, + "grad_norm": 1.0507417274391886, + "learning_rate": 2.1143689706797809e-07, + "loss": 0.0304, + "step": 6884 + }, + { + "epoch": 2.75, + "grad_norm": 0.9237015927908853, + "learning_rate": 2.101018067510213e-07, + "loss": 0.0288, + "step": 6886 + }, + { + "epoch": 2.7507987220447285, + "grad_norm": 0.9396213034157446, + "learning_rate": 2.0877085445416889e-07, + "loss": 0.0313, + "step": 6888 + }, + { + "epoch": 2.751597444089457, + "grad_norm": 0.893230887911447, + "learning_rate": 2.0744404132723882e-07, + "loss": 0.0271, + "step": 6890 + }, + { + "epoch": 2.752396166134185, + "grad_norm": 1.1227883203425046, + "learning_rate": 2.0612136851647258e-07, + "loss": 0.028, + "step": 6892 + }, + { + "epoch": 2.7531948881789137, + "grad_norm": 1.1000266312956004, + "learning_rate": 2.0480283716453388e-07, + "loss": 0.0316, + "step": 6894 + }, + { + "epoch": 2.753993610223642, + "grad_norm": 1.0036859264242648, + "learning_rate": 2.034884484105093e-07, + "loss": 0.032, + "step": 6896 + }, + { + "epoch": 2.7547923322683707, + "grad_norm": 0.8664572975169675, + "learning_rate": 2.0217820338990723e-07, + "loss": 0.0293, + "step": 6898 + }, + { + "epoch": 2.755591054313099, + "grad_norm": 1.152106392290739, + "learning_rate": 2.0087210323465555e-07, + "loss": 0.0271, + "step": 6900 + }, + { + "epoch": 2.7563897763578273, + "grad_norm": 0.9072929731158895, + "learning_rate": 1.9957014907310224e-07, + "loss": 0.0297, + "step": 6902 + }, + { + "epoch": 2.757188498402556, + "grad_norm": 0.899746462852363, + "learning_rate": 1.98272342030012e-07, + "loss": 0.0269, + "step": 6904 + }, + { + "epoch": 2.7579872204472844, + "grad_norm": 0.9831117034435982, + "learning_rate": 1.96978683226568e-07, + "loss": 0.0286, + "step": 6906 + }, + { + "epoch": 2.758785942492013, + "grad_norm": 0.9654610196040313, + "learning_rate": 1.9568917378037012e-07, + "loss": 0.0286, + "step": 6908 + }, + { + "epoch": 2.7595846645367414, + "grad_norm": 0.8702316945781633, + "learning_rate": 1.9440381480543115e-07, + "loss": 0.0292, + "step": 6910 + }, + { + "epoch": 2.7603833865814695, + "grad_norm": 0.8701781601681666, + "learning_rate": 1.9312260741218114e-07, + "loss": 0.0296, + "step": 6912 + }, + { + "epoch": 2.761182108626198, + "grad_norm": 1.0906874136348748, + "learning_rate": 1.9184555270746198e-07, + "loss": 0.032, + "step": 6914 + }, + { + "epoch": 2.7619808306709266, + "grad_norm": 0.8789894864802608, + "learning_rate": 1.9057265179452945e-07, + "loss": 0.0283, + "step": 6916 + }, + { + "epoch": 2.762779552715655, + "grad_norm": 0.8289933518344175, + "learning_rate": 1.8930390577304836e-07, + "loss": 0.0254, + "step": 6918 + }, + { + "epoch": 2.763578274760383, + "grad_norm": 1.109173584946323, + "learning_rate": 1.8803931573909584e-07, + "loss": 0.0325, + "step": 6920 + }, + { + "epoch": 2.7643769968051117, + "grad_norm": 0.8437516857169298, + "learning_rate": 1.8677888278515854e-07, + "loss": 0.0275, + "step": 6922 + }, + { + "epoch": 2.7651757188498403, + "grad_norm": 0.957579586236523, + "learning_rate": 1.8552260800013266e-07, + "loss": 0.0324, + "step": 6924 + }, + { + "epoch": 2.765974440894569, + "grad_norm": 1.133247804875847, + "learning_rate": 1.8427049246932005e-07, + "loss": 0.0303, + "step": 6926 + }, + { + "epoch": 2.7667731629392973, + "grad_norm": 0.9042200415102326, + "learning_rate": 1.8302253727443041e-07, + "loss": 0.0261, + "step": 6928 + }, + { + "epoch": 2.7675718849840254, + "grad_norm": 1.0109916146672329, + "learning_rate": 1.817787434935797e-07, + "loss": 0.0317, + "step": 6930 + }, + { + "epoch": 2.768370607028754, + "grad_norm": 0.8886867588028137, + "learning_rate": 1.805391122012884e-07, + "loss": 0.0307, + "step": 6932 + }, + { + "epoch": 2.7691693290734825, + "grad_norm": 0.9881453864615531, + "learning_rate": 1.7930364446848035e-07, + "loss": 0.0282, + "step": 6934 + }, + { + "epoch": 2.769968051118211, + "grad_norm": 0.9753621627206346, + "learning_rate": 1.7807234136248296e-07, + "loss": 0.0316, + "step": 6936 + }, + { + "epoch": 2.770766773162939, + "grad_norm": 0.9729298132394265, + "learning_rate": 1.7684520394702697e-07, + "loss": 0.0322, + "step": 6938 + }, + { + "epoch": 2.7715654952076676, + "grad_norm": 1.0067163359277183, + "learning_rate": 1.7562223328224327e-07, + "loss": 0.0273, + "step": 6940 + }, + { + "epoch": 2.772364217252396, + "grad_norm": 1.014978444523247, + "learning_rate": 1.7440343042466225e-07, + "loss": 0.0222, + "step": 6942 + }, + { + "epoch": 2.7731629392971247, + "grad_norm": 0.9727814177226888, + "learning_rate": 1.731887964272144e-07, + "loss": 0.0284, + "step": 6944 + }, + { + "epoch": 2.773961661341853, + "grad_norm": 0.8674877180013327, + "learning_rate": 1.7197833233922933e-07, + "loss": 0.0266, + "step": 6946 + }, + { + "epoch": 2.7747603833865817, + "grad_norm": 0.92093939870497, + "learning_rate": 1.7077203920643548e-07, + "loss": 0.0277, + "step": 6948 + }, + { + "epoch": 2.77555910543131, + "grad_norm": 1.0354340580208805, + "learning_rate": 1.695699180709537e-07, + "loss": 0.0318, + "step": 6950 + }, + { + "epoch": 2.7763578274760383, + "grad_norm": 0.8959582446725475, + "learning_rate": 1.6837196997130434e-07, + "loss": 0.0273, + "step": 6952 + }, + { + "epoch": 2.777156549520767, + "grad_norm": 0.9082092205680304, + "learning_rate": 1.671781959424018e-07, + "loss": 0.0283, + "step": 6954 + }, + { + "epoch": 2.777955271565495, + "grad_norm": 0.8526839728174191, + "learning_rate": 1.6598859701555448e-07, + "loss": 0.0279, + "step": 6956 + }, + { + "epoch": 2.7787539936102235, + "grad_norm": 0.9183824299916817, + "learning_rate": 1.648031742184619e-07, + "loss": 0.0221, + "step": 6958 + }, + { + "epoch": 2.779552715654952, + "grad_norm": 0.8785486078394016, + "learning_rate": 1.6362192857521942e-07, + "loss": 0.028, + "step": 6960 + }, + { + "epoch": 2.7803514376996805, + "grad_norm": 0.984747118195936, + "learning_rate": 1.6244486110631062e-07, + "loss": 0.0343, + "step": 6962 + }, + { + "epoch": 2.781150159744409, + "grad_norm": 0.8833396654406691, + "learning_rate": 1.6127197282861106e-07, + "loss": 0.0275, + "step": 6964 + }, + { + "epoch": 2.7819488817891376, + "grad_norm": 0.8586218999503099, + "learning_rate": 1.6010326475538628e-07, + "loss": 0.0267, + "step": 6966 + }, + { + "epoch": 2.7827476038338657, + "grad_norm": 1.1132610814922481, + "learning_rate": 1.5893873789628812e-07, + "loss": 0.0357, + "step": 6968 + }, + { + "epoch": 2.783546325878594, + "grad_norm": 1.0377705902735281, + "learning_rate": 1.5777839325735955e-07, + "loss": 0.0348, + "step": 6970 + }, + { + "epoch": 2.7843450479233227, + "grad_norm": 0.8553477275906, + "learning_rate": 1.5662223184102876e-07, + "loss": 0.0282, + "step": 6972 + }, + { + "epoch": 2.7851437699680512, + "grad_norm": 0.9887534929150167, + "learning_rate": 1.55470254646109e-07, + "loss": 0.0339, + "step": 6974 + }, + { + "epoch": 2.7859424920127793, + "grad_norm": 0.8463180888040372, + "learning_rate": 1.5432246266780083e-07, + "loss": 0.0265, + "step": 6976 + }, + { + "epoch": 2.786741214057508, + "grad_norm": 0.9279956838643539, + "learning_rate": 1.5317885689768775e-07, + "loss": 0.028, + "step": 6978 + }, + { + "epoch": 2.7875399361022364, + "grad_norm": 0.898362447794585, + "learning_rate": 1.520394383237378e-07, + "loss": 0.0254, + "step": 6980 + }, + { + "epoch": 2.788338658146965, + "grad_norm": 0.999242247390689, + "learning_rate": 1.5090420793030025e-07, + "loss": 0.026, + "step": 6982 + }, + { + "epoch": 2.7891373801916934, + "grad_norm": 1.0069528441967504, + "learning_rate": 1.4977316669810782e-07, + "loss": 0.0299, + "step": 6984 + }, + { + "epoch": 2.789936102236422, + "grad_norm": 0.9597592645048728, + "learning_rate": 1.4864631560427277e-07, + "loss": 0.0286, + "step": 6986 + }, + { + "epoch": 2.79073482428115, + "grad_norm": 0.8133798258927686, + "learning_rate": 1.4752365562228865e-07, + "loss": 0.0256, + "step": 6988 + }, + { + "epoch": 2.7915335463258786, + "grad_norm": 1.0299858930198784, + "learning_rate": 1.4640518772202794e-07, + "loss": 0.0287, + "step": 6990 + }, + { + "epoch": 2.792332268370607, + "grad_norm": 0.8756875752154023, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.0285, + "step": 6992 + }, + { + "epoch": 2.793130990415335, + "grad_norm": 0.9409132612586905, + "learning_rate": 1.4418083202805467e-07, + "loss": 0.031, + "step": 6994 + }, + { + "epoch": 2.7939297124600637, + "grad_norm": 0.9200301064033846, + "learning_rate": 1.4307494615597716e-07, + "loss": 0.0311, + "step": 6996 + }, + { + "epoch": 2.7947284345047922, + "grad_norm": 0.8145829376320072, + "learning_rate": 1.4197325620888714e-07, + "loss": 0.0246, + "step": 6998 + }, + { + "epoch": 2.7955271565495208, + "grad_norm": 0.9865390880493666, + "learning_rate": 1.4087576313854212e-07, + "loss": 0.0296, + "step": 7000 + }, + { + "epoch": 2.7955271565495208, + "eval_loss": 0.1819150149822235, + "eval_runtime": 417.4017, + "eval_samples_per_second": 42.662, + "eval_steps_per_second": 5.333, + "step": 7000 + }, + { + "epoch": 2.7963258785942493, + "grad_norm": 1.0878362509592698, + "learning_rate": 1.397824678930715e-07, + "loss": 0.0269, + "step": 7002 + }, + { + "epoch": 2.797124600638978, + "grad_norm": 0.8833659039797074, + "learning_rate": 1.386933714169797e-07, + "loss": 0.0295, + "step": 7004 + }, + { + "epoch": 2.797923322683706, + "grad_norm": 1.198555362781041, + "learning_rate": 1.3760847465114413e-07, + "loss": 0.0231, + "step": 7006 + }, + { + "epoch": 2.7987220447284344, + "grad_norm": 0.9766022877177375, + "learning_rate": 1.365277785328123e-07, + "loss": 0.0316, + "step": 7008 + }, + { + "epoch": 2.799520766773163, + "grad_norm": 0.9931479598265185, + "learning_rate": 1.3545128399560349e-07, + "loss": 0.0307, + "step": 7010 + }, + { + "epoch": 2.8003194888178915, + "grad_norm": 0.9470485404305614, + "learning_rate": 1.3437899196950765e-07, + "loss": 0.03, + "step": 7012 + }, + { + "epoch": 2.8011182108626196, + "grad_norm": 1.0425827226683049, + "learning_rate": 1.3331090338088437e-07, + "loss": 0.0291, + "step": 7014 + }, + { + "epoch": 2.801916932907348, + "grad_norm": 0.9479599981229176, + "learning_rate": 1.3224701915246053e-07, + "loss": 0.033, + "step": 7016 + }, + { + "epoch": 2.8027156549520766, + "grad_norm": 1.0722352133209674, + "learning_rate": 1.3118734020333257e-07, + "loss": 0.0305, + "step": 7018 + }, + { + "epoch": 2.803514376996805, + "grad_norm": 1.1283019296777128, + "learning_rate": 1.3013186744896323e-07, + "loss": 0.0322, + "step": 7020 + }, + { + "epoch": 2.8043130990415337, + "grad_norm": 1.0443979108148196, + "learning_rate": 1.2908060180118088e-07, + "loss": 0.0279, + "step": 7022 + }, + { + "epoch": 2.8051118210862622, + "grad_norm": 0.9204477323838135, + "learning_rate": 1.280335441681796e-07, + "loss": 0.026, + "step": 7024 + }, + { + "epoch": 2.8059105431309903, + "grad_norm": 1.091376167660413, + "learning_rate": 1.2699069545451858e-07, + "loss": 0.0303, + "step": 7026 + }, + { + "epoch": 2.806709265175719, + "grad_norm": 0.8186223224524324, + "learning_rate": 1.2595205656112164e-07, + "loss": 0.0251, + "step": 7028 + }, + { + "epoch": 2.8075079872204474, + "grad_norm": 0.9426669872933783, + "learning_rate": 1.2491762838527376e-07, + "loss": 0.0278, + "step": 7030 + }, + { + "epoch": 2.8083067092651754, + "grad_norm": 0.8562334740689089, + "learning_rate": 1.2388741182062348e-07, + "loss": 0.0254, + "step": 7032 + }, + { + "epoch": 2.809105431309904, + "grad_norm": 1.0003749268195539, + "learning_rate": 1.2286140775718048e-07, + "loss": 0.0254, + "step": 7034 + }, + { + "epoch": 2.8099041533546325, + "grad_norm": 0.9287476537267818, + "learning_rate": 1.2183961708131574e-07, + "loss": 0.0331, + "step": 7036 + }, + { + "epoch": 2.810702875399361, + "grad_norm": 1.0100009275296533, + "learning_rate": 1.2082204067576043e-07, + "loss": 0.0339, + "step": 7038 + }, + { + "epoch": 2.8115015974440896, + "grad_norm": 1.0019967241849932, + "learning_rate": 1.198086794196035e-07, + "loss": 0.034, + "step": 7040 + }, + { + "epoch": 2.812300319488818, + "grad_norm": 1.0274376107046983, + "learning_rate": 1.187995341882947e-07, + "loss": 0.0296, + "step": 7042 + }, + { + "epoch": 2.813099041533546, + "grad_norm": 0.9758492633679886, + "learning_rate": 1.1779460585363945e-07, + "loss": 0.0304, + "step": 7044 + }, + { + "epoch": 2.8138977635782747, + "grad_norm": 1.0055929014699148, + "learning_rate": 1.1679389528380159e-07, + "loss": 0.0285, + "step": 7046 + }, + { + "epoch": 2.8146964856230032, + "grad_norm": 0.9825352483369219, + "learning_rate": 1.1579740334330014e-07, + "loss": 0.0279, + "step": 7048 + }, + { + "epoch": 2.8154952076677318, + "grad_norm": 0.9663886663279524, + "learning_rate": 1.1480513089301037e-07, + "loss": 0.0238, + "step": 7050 + }, + { + "epoch": 2.81629392971246, + "grad_norm": 0.7632311003499087, + "learning_rate": 1.1381707879016158e-07, + "loss": 0.0228, + "step": 7052 + }, + { + "epoch": 2.8170926517571884, + "grad_norm": 0.8411185807947373, + "learning_rate": 1.1283324788833872e-07, + "loss": 0.0261, + "step": 7054 + }, + { + "epoch": 2.817891373801917, + "grad_norm": 1.0106383088967026, + "learning_rate": 1.1185363903747748e-07, + "loss": 0.0249, + "step": 7056 + }, + { + "epoch": 2.8186900958466454, + "grad_norm": 1.0568473872652466, + "learning_rate": 1.1087825308386812e-07, + "loss": 0.0328, + "step": 7058 + }, + { + "epoch": 2.819488817891374, + "grad_norm": 1.0600837943890258, + "learning_rate": 1.0990709087015217e-07, + "loss": 0.0288, + "step": 7060 + }, + { + "epoch": 2.8202875399361025, + "grad_norm": 0.9659386588248836, + "learning_rate": 1.0894015323532181e-07, + "loss": 0.0294, + "step": 7062 + }, + { + "epoch": 2.8210862619808306, + "grad_norm": 0.9822944918742355, + "learning_rate": 1.0797744101472052e-07, + "loss": 0.0298, + "step": 7064 + }, + { + "epoch": 2.821884984025559, + "grad_norm": 0.9411392883906924, + "learning_rate": 1.0701895504004022e-07, + "loss": 0.0268, + "step": 7066 + }, + { + "epoch": 2.8226837060702876, + "grad_norm": 0.8205930442237354, + "learning_rate": 1.0606469613932247e-07, + "loss": 0.0285, + "step": 7068 + }, + { + "epoch": 2.8234824281150157, + "grad_norm": 0.9585034609165559, + "learning_rate": 1.0511466513695778e-07, + "loss": 0.0295, + "step": 7070 + }, + { + "epoch": 2.8242811501597442, + "grad_norm": 1.017339428173637, + "learning_rate": 1.0416886285368188e-07, + "loss": 0.032, + "step": 7072 + }, + { + "epoch": 2.8250798722044728, + "grad_norm": 0.9480053562427759, + "learning_rate": 1.032272901065795e-07, + "loss": 0.0276, + "step": 7074 + }, + { + "epoch": 2.8258785942492013, + "grad_norm": 0.9241183013217804, + "learning_rate": 1.0228994770908052e-07, + "loss": 0.0313, + "step": 7076 + }, + { + "epoch": 2.82667731629393, + "grad_norm": 0.8600565904245382, + "learning_rate": 1.0135683647096107e-07, + "loss": 0.0312, + "step": 7078 + }, + { + "epoch": 2.8274760383386583, + "grad_norm": 0.857256000234177, + "learning_rate": 1.0042795719833964e-07, + "loss": 0.0304, + "step": 7080 + }, + { + "epoch": 2.8282747603833864, + "grad_norm": 0.9257782290418327, + "learning_rate": 9.950331069368102e-08, + "loss": 0.029, + "step": 7082 + }, + { + "epoch": 2.829073482428115, + "grad_norm": 0.9014008656073322, + "learning_rate": 9.858289775579289e-08, + "loss": 0.0277, + "step": 7084 + }, + { + "epoch": 2.8298722044728435, + "grad_norm": 0.9335945085748736, + "learning_rate": 9.766671917982529e-08, + "loss": 0.0258, + "step": 7086 + }, + { + "epoch": 2.830670926517572, + "grad_norm": 0.893406276113889, + "learning_rate": 9.675477575726954e-08, + "loss": 0.0286, + "step": 7088 + }, + { + "epoch": 2.8314696485623, + "grad_norm": 0.9135616915468692, + "learning_rate": 9.58470682759588e-08, + "loss": 0.0267, + "step": 7090 + }, + { + "epoch": 2.8322683706070286, + "grad_norm": 0.8914005786007674, + "learning_rate": 9.494359752006687e-08, + "loss": 0.0267, + "step": 7092 + }, + { + "epoch": 2.833067092651757, + "grad_norm": 0.9868182622898214, + "learning_rate": 9.404436427010777e-08, + "loss": 0.0259, + "step": 7094 + }, + { + "epoch": 2.8338658146964857, + "grad_norm": 0.8736365212101689, + "learning_rate": 9.314936930293283e-08, + "loss": 0.0238, + "step": 7096 + }, + { + "epoch": 2.834664536741214, + "grad_norm": 0.8341663421385362, + "learning_rate": 9.225861339173415e-08, + "loss": 0.0296, + "step": 7098 + }, + { + "epoch": 2.8354632587859427, + "grad_norm": 0.9404555958593751, + "learning_rate": 9.137209730604113e-08, + "loss": 0.029, + "step": 7100 + }, + { + "epoch": 2.836261980830671, + "grad_norm": 0.9105942982702757, + "learning_rate": 9.048982181171895e-08, + "loss": 0.0289, + "step": 7102 + }, + { + "epoch": 2.8370607028753994, + "grad_norm": 1.0814449020759576, + "learning_rate": 8.961178767097178e-08, + "loss": 0.0308, + "step": 7104 + }, + { + "epoch": 2.837859424920128, + "grad_norm": 0.8932338308675243, + "learning_rate": 8.873799564233676e-08, + "loss": 0.0301, + "step": 7106 + }, + { + "epoch": 2.838658146964856, + "grad_norm": 1.012162689411644, + "learning_rate": 8.786844648068837e-08, + "loss": 0.0313, + "step": 7108 + }, + { + "epoch": 2.8394568690095845, + "grad_norm": 1.0902329946901537, + "learning_rate": 8.700314093723572e-08, + "loss": 0.0315, + "step": 7110 + }, + { + "epoch": 2.840255591054313, + "grad_norm": 0.9095047254271645, + "learning_rate": 8.614207975952083e-08, + "loss": 0.0311, + "step": 7112 + }, + { + "epoch": 2.8410543130990416, + "grad_norm": 0.9218237866635856, + "learning_rate": 8.528526369141809e-08, + "loss": 0.0302, + "step": 7114 + }, + { + "epoch": 2.84185303514377, + "grad_norm": 0.8660219136797084, + "learning_rate": 8.443269347313765e-08, + "loss": 0.0265, + "step": 7116 + }, + { + "epoch": 2.8426517571884986, + "grad_norm": 0.9364401145220109, + "learning_rate": 8.358436984121865e-08, + "loss": 0.0273, + "step": 7118 + }, + { + "epoch": 2.8434504792332267, + "grad_norm": 1.023124638752514, + "learning_rate": 8.274029352853264e-08, + "loss": 0.0276, + "step": 7120 + }, + { + "epoch": 2.844249201277955, + "grad_norm": 0.8561164210287661, + "learning_rate": 8.190046526428241e-08, + "loss": 0.0291, + "step": 7122 + }, + { + "epoch": 2.8450479233226837, + "grad_norm": 0.8438795870319646, + "learning_rate": 8.106488577399985e-08, + "loss": 0.0264, + "step": 7124 + }, + { + "epoch": 2.8458466453674123, + "grad_norm": 0.9601387385173464, + "learning_rate": 8.02335557795464e-08, + "loss": 0.0268, + "step": 7126 + }, + { + "epoch": 2.8466453674121404, + "grad_norm": 1.0416344807221194, + "learning_rate": 7.940647599911477e-08, + "loss": 0.0274, + "step": 7128 + }, + { + "epoch": 2.847444089456869, + "grad_norm": 1.030804565632412, + "learning_rate": 7.858364714722122e-08, + "loss": 0.0297, + "step": 7130 + }, + { + "epoch": 2.8482428115015974, + "grad_norm": 0.848403943487444, + "learning_rate": 7.776506993471323e-08, + "loss": 0.0262, + "step": 7132 + }, + { + "epoch": 2.849041533546326, + "grad_norm": 0.9248426395931749, + "learning_rate": 7.695074506876566e-08, + "loss": 0.0275, + "step": 7134 + }, + { + "epoch": 2.8498402555910545, + "grad_norm": 1.0380277597027192, + "learning_rate": 7.614067325287632e-08, + "loss": 0.029, + "step": 7136 + }, + { + "epoch": 2.850638977635783, + "grad_norm": 1.0174264329449134, + "learning_rate": 7.533485518687211e-08, + "loss": 0.0262, + "step": 7138 + }, + { + "epoch": 2.851437699680511, + "grad_norm": 0.9035723830180614, + "learning_rate": 7.453329156690337e-08, + "loss": 0.0312, + "step": 7140 + }, + { + "epoch": 2.8522364217252396, + "grad_norm": 0.8948154240149504, + "learning_rate": 7.373598308544505e-08, + "loss": 0.0286, + "step": 7142 + }, + { + "epoch": 2.853035143769968, + "grad_norm": 0.8767549377755169, + "learning_rate": 7.294293043129785e-08, + "loss": 0.0271, + "step": 7144 + }, + { + "epoch": 2.8538338658146962, + "grad_norm": 1.0359127327227582, + "learning_rate": 7.215413428958263e-08, + "loss": 0.0299, + "step": 7146 + }, + { + "epoch": 2.8546325878594248, + "grad_norm": 1.054388868239098, + "learning_rate": 7.136959534174592e-08, + "loss": 0.0295, + "step": 7148 + }, + { + "epoch": 2.8554313099041533, + "grad_norm": 0.9105886167898716, + "learning_rate": 7.058931426555449e-08, + "loss": 0.0278, + "step": 7150 + }, + { + "epoch": 2.856230031948882, + "grad_norm": 0.9273333490997623, + "learning_rate": 6.981329173509909e-08, + "loss": 0.0287, + "step": 7152 + }, + { + "epoch": 2.8570287539936103, + "grad_norm": 0.8079326225611323, + "learning_rate": 6.904152842078848e-08, + "loss": 0.0269, + "step": 7154 + }, + { + "epoch": 2.857827476038339, + "grad_norm": 0.9892975697892658, + "learning_rate": 6.827402498935377e-08, + "loss": 0.0336, + "step": 7156 + }, + { + "epoch": 2.858626198083067, + "grad_norm": 0.9535750532619426, + "learning_rate": 6.75107821038462e-08, + "loss": 0.029, + "step": 7158 + }, + { + "epoch": 2.8594249201277955, + "grad_norm": 0.8407797191652399, + "learning_rate": 6.675180042363505e-08, + "loss": 0.0254, + "step": 7160 + }, + { + "epoch": 2.860223642172524, + "grad_norm": 1.0055696204335296, + "learning_rate": 6.599708060440857e-08, + "loss": 0.0291, + "step": 7162 + }, + { + "epoch": 2.8610223642172525, + "grad_norm": 0.860800590976241, + "learning_rate": 6.524662329817411e-08, + "loss": 0.0244, + "step": 7164 + }, + { + "epoch": 2.8618210862619806, + "grad_norm": 0.9470630865172524, + "learning_rate": 6.450042915325527e-08, + "loss": 0.0288, + "step": 7166 + }, + { + "epoch": 2.862619808306709, + "grad_norm": 0.917809030684146, + "learning_rate": 6.375849881429418e-08, + "loss": 0.0278, + "step": 7168 + }, + { + "epoch": 2.8634185303514377, + "grad_norm": 0.9384143679170448, + "learning_rate": 6.302083292224814e-08, + "loss": 0.0312, + "step": 7170 + }, + { + "epoch": 2.864217252396166, + "grad_norm": 0.8768613951600329, + "learning_rate": 6.22874321143907e-08, + "loss": 0.0249, + "step": 7172 + }, + { + "epoch": 2.8650159744408947, + "grad_norm": 0.7174846049452049, + "learning_rate": 6.15582970243117e-08, + "loss": 0.0223, + "step": 7174 + }, + { + "epoch": 2.8658146964856233, + "grad_norm": 1.0161897086560874, + "learning_rate": 6.083342828191453e-08, + "loss": 0.0269, + "step": 7176 + }, + { + "epoch": 2.8666134185303513, + "grad_norm": 0.9854394127230478, + "learning_rate": 6.011282651341655e-08, + "loss": 0.0288, + "step": 7178 + }, + { + "epoch": 2.86741214057508, + "grad_norm": 0.901696634188199, + "learning_rate": 5.9396492341351475e-08, + "loss": 0.0276, + "step": 7180 + }, + { + "epoch": 2.8682108626198084, + "grad_norm": 0.9841680067526405, + "learning_rate": 5.868442638456373e-08, + "loss": 0.0288, + "step": 7182 + }, + { + "epoch": 2.8690095846645365, + "grad_norm": 0.9134522070907273, + "learning_rate": 5.797662925821068e-08, + "loss": 0.03, + "step": 7184 + }, + { + "epoch": 2.869808306709265, + "grad_norm": 1.0297579858038646, + "learning_rate": 5.7273101573762644e-08, + "loss": 0.0297, + "step": 7186 + }, + { + "epoch": 2.8706070287539935, + "grad_norm": 1.002668230554201, + "learning_rate": 5.6573843939001224e-08, + "loss": 0.0321, + "step": 7188 + }, + { + "epoch": 2.871405750798722, + "grad_norm": 0.8448908330163806, + "learning_rate": 5.5878856958018755e-08, + "loss": 0.0275, + "step": 7190 + }, + { + "epoch": 2.8722044728434506, + "grad_norm": 0.8951090877529749, + "learning_rate": 5.518814123121885e-08, + "loss": 0.028, + "step": 7192 + }, + { + "epoch": 2.873003194888179, + "grad_norm": 0.8887400334865615, + "learning_rate": 5.450169735531419e-08, + "loss": 0.0266, + "step": 7194 + }, + { + "epoch": 2.873801916932907, + "grad_norm": 0.9527424771255477, + "learning_rate": 5.381952592332762e-08, + "loss": 0.0292, + "step": 7196 + }, + { + "epoch": 2.8746006389776357, + "grad_norm": 0.9236990470777641, + "learning_rate": 5.3141627524591066e-08, + "loss": 0.0268, + "step": 7198 + }, + { + "epoch": 2.8753993610223643, + "grad_norm": 0.99547523531349, + "learning_rate": 5.246800274474439e-08, + "loss": 0.0287, + "step": 7200 + }, + { + "epoch": 2.876198083067093, + "grad_norm": 1.0395745492495505, + "learning_rate": 5.179865216573654e-08, + "loss": 0.0272, + "step": 7202 + }, + { + "epoch": 2.876996805111821, + "grad_norm": 0.9239013292846188, + "learning_rate": 5.1133576365823277e-08, + "loss": 0.03, + "step": 7204 + }, + { + "epoch": 2.8777955271565494, + "grad_norm": 0.8510701957695702, + "learning_rate": 5.047277591956668e-08, + "loss": 0.0237, + "step": 7206 + }, + { + "epoch": 2.878594249201278, + "grad_norm": 1.0550009694911764, + "learning_rate": 4.981625139783619e-08, + "loss": 0.0264, + "step": 7208 + }, + { + "epoch": 2.8793929712460065, + "grad_norm": 0.8651740589602106, + "learning_rate": 4.916400336780758e-08, + "loss": 0.0289, + "step": 7210 + }, + { + "epoch": 2.880191693290735, + "grad_norm": 0.9833410681522953, + "learning_rate": 4.851603239296065e-08, + "loss": 0.0263, + "step": 7212 + }, + { + "epoch": 2.8809904153354635, + "grad_norm": 0.9156057914045268, + "learning_rate": 4.787233903308208e-08, + "loss": 0.0266, + "step": 7214 + }, + { + "epoch": 2.8817891373801916, + "grad_norm": 0.9881205371437639, + "learning_rate": 4.723292384426203e-08, + "loss": 0.0295, + "step": 7216 + }, + { + "epoch": 2.88258785942492, + "grad_norm": 0.9124860856419786, + "learning_rate": 4.65977873788942e-08, + "loss": 0.0287, + "step": 7218 + }, + { + "epoch": 2.8833865814696487, + "grad_norm": 0.9969073107367281, + "learning_rate": 4.596693018567744e-08, + "loss": 0.0288, + "step": 7220 + }, + { + "epoch": 2.8841853035143767, + "grad_norm": 0.8841157317264496, + "learning_rate": 4.534035280961191e-08, + "loss": 0.0309, + "step": 7222 + }, + { + "epoch": 2.8849840255591053, + "grad_norm": 1.0078494252596897, + "learning_rate": 4.471805579200239e-08, + "loss": 0.0312, + "step": 7224 + }, + { + "epoch": 2.885782747603834, + "grad_norm": 0.9932395040774775, + "learning_rate": 4.41000396704544e-08, + "loss": 0.0283, + "step": 7226 + }, + { + "epoch": 2.8865814696485623, + "grad_norm": 0.9777222302672404, + "learning_rate": 4.3486304978875294e-08, + "loss": 0.0286, + "step": 7228 + }, + { + "epoch": 2.887380191693291, + "grad_norm": 0.9488138150582978, + "learning_rate": 4.287685224747373e-08, + "loss": 0.0287, + "step": 7230 + }, + { + "epoch": 2.8881789137380194, + "grad_norm": 0.9515551976834303, + "learning_rate": 4.227168200276077e-08, + "loss": 0.0301, + "step": 7232 + }, + { + "epoch": 2.8889776357827475, + "grad_norm": 1.0020004748739546, + "learning_rate": 4.167079476754432e-08, + "loss": 0.0303, + "step": 7234 + }, + { + "epoch": 2.889776357827476, + "grad_norm": 0.9940040887434322, + "learning_rate": 4.1074191060935794e-08, + "loss": 0.0314, + "step": 7236 + }, + { + "epoch": 2.8905750798722045, + "grad_norm": 0.9574123979978048, + "learning_rate": 4.048187139834403e-08, + "loss": 0.0266, + "step": 7238 + }, + { + "epoch": 2.891373801916933, + "grad_norm": 0.9872954533945656, + "learning_rate": 3.989383629147747e-08, + "loss": 0.0305, + "step": 7240 + }, + { + "epoch": 2.892172523961661, + "grad_norm": 1.093299647415339, + "learning_rate": 3.9310086248342536e-08, + "loss": 0.0309, + "step": 7242 + }, + { + "epoch": 2.8929712460063897, + "grad_norm": 0.9207625460483227, + "learning_rate": 3.873062177324472e-08, + "loss": 0.0293, + "step": 7244 + }, + { + "epoch": 2.893769968051118, + "grad_norm": 0.8613042189991472, + "learning_rate": 3.8155443366785786e-08, + "loss": 0.0278, + "step": 7246 + }, + { + "epoch": 2.8945686900958467, + "grad_norm": 0.9093281736899324, + "learning_rate": 3.758455152586715e-08, + "loss": 0.0273, + "step": 7248 + }, + { + "epoch": 2.8953674121405752, + "grad_norm": 0.8976668145886221, + "learning_rate": 3.7017946743683754e-08, + "loss": 0.0279, + "step": 7250 + }, + { + "epoch": 2.8961661341853038, + "grad_norm": 0.9438489705868893, + "learning_rate": 3.645562950973014e-08, + "loss": 0.0288, + "step": 7252 + }, + { + "epoch": 2.896964856230032, + "grad_norm": 0.9039303228109454, + "learning_rate": 3.589760030979439e-08, + "loss": 0.029, + "step": 7254 + }, + { + "epoch": 2.8977635782747604, + "grad_norm": 0.8665135816874094, + "learning_rate": 3.534385962596143e-08, + "loss": 0.0257, + "step": 7256 + }, + { + "epoch": 2.898562300319489, + "grad_norm": 0.9885112089994623, + "learning_rate": 3.479440793661082e-08, + "loss": 0.0287, + "step": 7258 + }, + { + "epoch": 2.899361022364217, + "grad_norm": 0.9313125684039982, + "learning_rate": 3.4249245716417303e-08, + "loss": 0.0311, + "step": 7260 + }, + { + "epoch": 2.9001597444089455, + "grad_norm": 1.0211354686014047, + "learning_rate": 3.370837343634914e-08, + "loss": 0.0281, + "step": 7262 + }, + { + "epoch": 2.900958466453674, + "grad_norm": 0.9261447629517585, + "learning_rate": 3.3171791563669785e-08, + "loss": 0.0285, + "step": 7264 + }, + { + "epoch": 2.9017571884984026, + "grad_norm": 0.9543015423065228, + "learning_rate": 3.263950056193455e-08, + "loss": 0.0285, + "step": 7266 + }, + { + "epoch": 2.902555910543131, + "grad_norm": 0.876741688352455, + "learning_rate": 3.211150089099224e-08, + "loss": 0.0261, + "step": 7268 + }, + { + "epoch": 2.9033546325878596, + "grad_norm": 0.9732009390231579, + "learning_rate": 3.1587793006985224e-08, + "loss": 0.0308, + "step": 7270 + }, + { + "epoch": 2.9041533546325877, + "grad_norm": 0.924537711179532, + "learning_rate": 3.10683773623488e-08, + "loss": 0.028, + "step": 7272 + }, + { + "epoch": 2.9049520766773163, + "grad_norm": 0.8328671231098053, + "learning_rate": 3.055325440580736e-08, + "loss": 0.0231, + "step": 7274 + }, + { + "epoch": 2.905750798722045, + "grad_norm": 0.8481571211750596, + "learning_rate": 3.004242458237994e-08, + "loss": 0.029, + "step": 7276 + }, + { + "epoch": 2.9065495207667733, + "grad_norm": 1.0264649887625952, + "learning_rate": 2.9535888333374064e-08, + "loss": 0.0316, + "step": 7278 + }, + { + "epoch": 2.9073482428115014, + "grad_norm": 0.9904821324548735, + "learning_rate": 2.9033646096390255e-08, + "loss": 0.0317, + "step": 7280 + }, + { + "epoch": 2.90814696485623, + "grad_norm": 0.9529343344181515, + "learning_rate": 2.853569830531755e-08, + "loss": 0.0294, + "step": 7282 + }, + { + "epoch": 2.9089456869009584, + "grad_norm": 0.8950285806155511, + "learning_rate": 2.8042045390336835e-08, + "loss": 0.027, + "step": 7284 + }, + { + "epoch": 2.909744408945687, + "grad_norm": 0.9915085372273561, + "learning_rate": 2.7552687777916976e-08, + "loss": 0.029, + "step": 7286 + }, + { + "epoch": 2.9105431309904155, + "grad_norm": 1.0843075888116502, + "learning_rate": 2.706762589081646e-08, + "loss": 0.0329, + "step": 7288 + }, + { + "epoch": 2.911341853035144, + "grad_norm": 1.0723208662350674, + "learning_rate": 2.6586860148084537e-08, + "loss": 0.0314, + "step": 7290 + }, + { + "epoch": 2.912140575079872, + "grad_norm": 0.9855410494694304, + "learning_rate": 2.6110390965055632e-08, + "loss": 0.0332, + "step": 7292 + }, + { + "epoch": 2.9129392971246006, + "grad_norm": 0.9456950838662169, + "learning_rate": 2.563821875335437e-08, + "loss": 0.0306, + "step": 7294 + }, + { + "epoch": 2.913738019169329, + "grad_norm": 0.9431752923514247, + "learning_rate": 2.517034392089446e-08, + "loss": 0.0242, + "step": 7296 + }, + { + "epoch": 2.9145367412140573, + "grad_norm": 0.9335986279046339, + "learning_rate": 2.4706766871874232e-08, + "loss": 0.029, + "step": 7298 + }, + { + "epoch": 2.915335463258786, + "grad_norm": 0.772170384658901, + "learning_rate": 2.4247488006781116e-08, + "loss": 0.0259, + "step": 7300 + }, + { + "epoch": 2.9161341853035143, + "grad_norm": 0.9779269003823648, + "learning_rate": 2.3792507722388835e-08, + "loss": 0.0301, + "step": 7302 + }, + { + "epoch": 2.916932907348243, + "grad_norm": 0.9611794936794353, + "learning_rate": 2.3341826411756863e-08, + "loss": 0.0305, + "step": 7304 + }, + { + "epoch": 2.9177316293929714, + "grad_norm": 0.8596768995810088, + "learning_rate": 2.2895444464232087e-08, + "loss": 0.0279, + "step": 7306 + }, + { + "epoch": 2.9185303514377, + "grad_norm": 1.050550890425119, + "learning_rate": 2.2453362265445477e-08, + "loss": 0.0266, + "step": 7308 + }, + { + "epoch": 2.919329073482428, + "grad_norm": 1.0172145059261652, + "learning_rate": 2.2015580197314868e-08, + "loss": 0.0295, + "step": 7310 + }, + { + "epoch": 2.9201277955271565, + "grad_norm": 0.8631988032830935, + "learning_rate": 2.158209863804217e-08, + "loss": 0.0258, + "step": 7312 + }, + { + "epoch": 2.920926517571885, + "grad_norm": 1.0172194609395517, + "learning_rate": 2.1152917962115606e-08, + "loss": 0.0311, + "step": 7314 + }, + { + "epoch": 2.9217252396166136, + "grad_norm": 0.8656946266189468, + "learning_rate": 2.0728038540305807e-08, + "loss": 0.025, + "step": 7316 + }, + { + "epoch": 2.9225239616613417, + "grad_norm": 1.2040154733832662, + "learning_rate": 2.030746073966916e-08, + "loss": 0.0299, + "step": 7318 + }, + { + "epoch": 2.92332268370607, + "grad_norm": 0.981108582856345, + "learning_rate": 1.9891184923544472e-08, + "loss": 0.0312, + "step": 7320 + }, + { + "epoch": 2.9241214057507987, + "grad_norm": 1.0050547681748903, + "learning_rate": 1.9479211451555735e-08, + "loss": 0.0306, + "step": 7322 + }, + { + "epoch": 2.9249201277955272, + "grad_norm": 0.9723436269119015, + "learning_rate": 1.9071540679608815e-08, + "loss": 0.0274, + "step": 7324 + }, + { + "epoch": 2.9257188498402558, + "grad_norm": 0.890152909346874, + "learning_rate": 1.8668172959891985e-08, + "loss": 0.0286, + "step": 7326 + }, + { + "epoch": 2.9265175718849843, + "grad_norm": 1.0167534719731643, + "learning_rate": 1.826910864087761e-08, + "loss": 0.0293, + "step": 7328 + }, + { + "epoch": 2.9273162939297124, + "grad_norm": 0.9875484380951924, + "learning_rate": 1.7874348067319912e-08, + "loss": 0.031, + "step": 7330 + }, + { + "epoch": 2.928115015974441, + "grad_norm": 1.1309464473632505, + "learning_rate": 1.7483891580253877e-08, + "loss": 0.0304, + "step": 7332 + }, + { + "epoch": 2.9289137380191694, + "grad_norm": 0.8279929400769935, + "learning_rate": 1.7097739516997447e-08, + "loss": 0.0268, + "step": 7334 + }, + { + "epoch": 2.9297124600638975, + "grad_norm": 0.9993434381726024, + "learning_rate": 1.6715892211150442e-08, + "loss": 0.0286, + "step": 7336 + }, + { + "epoch": 2.930511182108626, + "grad_norm": 1.007539806056888, + "learning_rate": 1.6338349992591763e-08, + "loss": 0.0294, + "step": 7338 + }, + { + "epoch": 2.9313099041533546, + "grad_norm": 1.0452749714129563, + "learning_rate": 1.5965113187482174e-08, + "loss": 0.0328, + "step": 7340 + }, + { + "epoch": 2.932108626198083, + "grad_norm": 0.8474586458371822, + "learning_rate": 1.5596182118264303e-08, + "loss": 0.0256, + "step": 7342 + }, + { + "epoch": 2.9329073482428116, + "grad_norm": 0.9287019035012101, + "learning_rate": 1.5231557103658755e-08, + "loss": 0.0286, + "step": 7344 + }, + { + "epoch": 2.93370607028754, + "grad_norm": 1.0230975340136683, + "learning_rate": 1.4871238458667447e-08, + "loss": 0.0308, + "step": 7346 + }, + { + "epoch": 2.9345047923322682, + "grad_norm": 0.9679221553199847, + "learning_rate": 1.4515226494571376e-08, + "loss": 0.0279, + "step": 7348 + }, + { + "epoch": 2.9353035143769968, + "grad_norm": 1.0129002341137234, + "learning_rate": 1.41635215189323e-08, + "loss": 0.0243, + "step": 7350 + }, + { + "epoch": 2.9361022364217253, + "grad_norm": 1.159004848542131, + "learning_rate": 1.3816123835588835e-08, + "loss": 0.0309, + "step": 7352 + }, + { + "epoch": 2.936900958466454, + "grad_norm": 1.0452986083737144, + "learning_rate": 1.3473033744660358e-08, + "loss": 0.0297, + "step": 7354 + }, + { + "epoch": 2.937699680511182, + "grad_norm": 0.944187394021172, + "learning_rate": 1.3134251542544774e-08, + "loss": 0.0255, + "step": 7356 + }, + { + "epoch": 2.9384984025559104, + "grad_norm": 1.1423229379322075, + "learning_rate": 1.2799777521916856e-08, + "loss": 0.0306, + "step": 7358 + }, + { + "epoch": 2.939297124600639, + "grad_norm": 0.9042757983900201, + "learning_rate": 1.2469611971731576e-08, + "loss": 0.028, + "step": 7360 + }, + { + "epoch": 2.9400958466453675, + "grad_norm": 1.219554574679699, + "learning_rate": 1.2143755177220774e-08, + "loss": 0.0292, + "step": 7362 + }, + { + "epoch": 2.940894568690096, + "grad_norm": 1.0160171362099817, + "learning_rate": 1.1822207419893151e-08, + "loss": 0.0316, + "step": 7364 + }, + { + "epoch": 2.9416932907348246, + "grad_norm": 0.9410768507733369, + "learning_rate": 1.1504968977536502e-08, + "loss": 0.0311, + "step": 7366 + }, + { + "epoch": 2.9424920127795526, + "grad_norm": 0.9541185215999987, + "learning_rate": 1.1192040124214931e-08, + "loss": 0.0251, + "step": 7368 + }, + { + "epoch": 2.943290734824281, + "grad_norm": 1.1401434841727258, + "learning_rate": 1.0883421130268857e-08, + "loss": 0.0389, + "step": 7370 + }, + { + "epoch": 2.9440894568690097, + "grad_norm": 0.9102452503551742, + "learning_rate": 1.0579112262316116e-08, + "loss": 0.0299, + "step": 7372 + }, + { + "epoch": 2.9448881789137378, + "grad_norm": 0.8297271919791628, + "learning_rate": 1.027911378325086e-08, + "loss": 0.0266, + "step": 7374 + }, + { + "epoch": 2.9456869009584663, + "grad_norm": 0.977055359117076, + "learning_rate": 9.983425952243552e-09, + "loss": 0.0275, + "step": 7376 + }, + { + "epoch": 2.946485623003195, + "grad_norm": 0.8856220307196366, + "learning_rate": 9.692049024740968e-09, + "loss": 0.0273, + "step": 7378 + }, + { + "epoch": 2.9472843450479234, + "grad_norm": 1.0185133421121422, + "learning_rate": 9.404983252464528e-09, + "loss": 0.0308, + "step": 7380 + }, + { + "epoch": 2.948083067092652, + "grad_norm": 0.9226722623416709, + "learning_rate": 9.12222888341252e-09, + "loss": 0.0278, + "step": 7382 + }, + { + "epoch": 2.9488817891373804, + "grad_norm": 0.8681408122208755, + "learning_rate": 8.84378616185788e-09, + "loss": 0.0289, + "step": 7384 + }, + { + "epoch": 2.9496805111821085, + "grad_norm": 0.9820066250150342, + "learning_rate": 8.569655328349302e-09, + "loss": 0.0306, + "step": 7386 + }, + { + "epoch": 2.950479233226837, + "grad_norm": 0.9224388253014746, + "learning_rate": 8.299836619709011e-09, + "loss": 0.0252, + "step": 7388 + }, + { + "epoch": 2.9512779552715656, + "grad_norm": 1.078281742555588, + "learning_rate": 8.034330269034995e-09, + "loss": 0.0339, + "step": 7390 + }, + { + "epoch": 2.952076677316294, + "grad_norm": 1.1121638039606885, + "learning_rate": 7.773136505700995e-09, + "loss": 0.0315, + "step": 7392 + }, + { + "epoch": 2.952875399361022, + "grad_norm": 0.8527091048044436, + "learning_rate": 7.516255555352069e-09, + "loss": 0.0277, + "step": 7394 + }, + { + "epoch": 2.9536741214057507, + "grad_norm": 0.9156141320301706, + "learning_rate": 7.2636876399107e-09, + "loss": 0.0269, + "step": 7396 + }, + { + "epoch": 2.9544728434504792, + "grad_norm": 1.3378705563366167, + "learning_rate": 7.015432977570679e-09, + "loss": 0.0277, + "step": 7398 + }, + { + "epoch": 2.9552715654952078, + "grad_norm": 0.9035834422975698, + "learning_rate": 6.7714917828004545e-09, + "loss": 0.0281, + "step": 7400 + }, + { + "epoch": 2.9560702875399363, + "grad_norm": 1.0788469833239718, + "learning_rate": 6.531864266343113e-09, + "loss": 0.0311, + "step": 7402 + }, + { + "epoch": 2.956869009584665, + "grad_norm": 1.0030626593801413, + "learning_rate": 6.296550635213616e-09, + "loss": 0.0282, + "step": 7404 + }, + { + "epoch": 2.957667731629393, + "grad_norm": 0.9736720908524453, + "learning_rate": 6.0655510927010165e-09, + "loss": 0.0309, + "step": 7406 + }, + { + "epoch": 2.9584664536741214, + "grad_norm": 0.976953889137446, + "learning_rate": 5.838865838366792e-09, + "loss": 0.03, + "step": 7408 + }, + { + "epoch": 2.95926517571885, + "grad_norm": 0.96316953344491, + "learning_rate": 5.616495068046513e-09, + "loss": 0.0306, + "step": 7410 + }, + { + "epoch": 2.960063897763578, + "grad_norm": 0.8882325546800076, + "learning_rate": 5.398438973845954e-09, + "loss": 0.0328, + "step": 7412 + }, + { + "epoch": 2.9608626198083066, + "grad_norm": 1.0434465347465718, + "learning_rate": 5.184697744146094e-09, + "loss": 0.0296, + "step": 7414 + }, + { + "epoch": 2.961661341853035, + "grad_norm": 1.1676463120664913, + "learning_rate": 4.975271563599227e-09, + "loss": 0.0342, + "step": 7416 + }, + { + "epoch": 2.9624600638977636, + "grad_norm": 1.125598033675333, + "learning_rate": 4.770160613129515e-09, + "loss": 0.0305, + "step": 7418 + }, + { + "epoch": 2.963258785942492, + "grad_norm": 0.7862908310858415, + "learning_rate": 4.569365069933551e-09, + "loss": 0.0256, + "step": 7420 + }, + { + "epoch": 2.9640575079872207, + "grad_norm": 0.9443569000718535, + "learning_rate": 4.372885107479796e-09, + "loss": 0.0306, + "step": 7422 + }, + { + "epoch": 2.9648562300319488, + "grad_norm": 0.800637449098191, + "learning_rate": 4.180720895508028e-09, + "loss": 0.0293, + "step": 7424 + }, + { + "epoch": 2.9656549520766773, + "grad_norm": 0.9430328576301563, + "learning_rate": 3.992872600030451e-09, + "loss": 0.0271, + "step": 7426 + }, + { + "epoch": 2.966453674121406, + "grad_norm": 0.9058280169647789, + "learning_rate": 3.809340383330584e-09, + "loss": 0.0265, + "step": 7428 + }, + { + "epoch": 2.9672523961661343, + "grad_norm": 0.976846038410627, + "learning_rate": 3.630124403961599e-09, + "loss": 0.0308, + "step": 7430 + }, + { + "epoch": 2.9680511182108624, + "grad_norm": 0.9428661317946987, + "learning_rate": 3.4552248167507576e-09, + "loss": 0.0271, + "step": 7432 + }, + { + "epoch": 2.968849840255591, + "grad_norm": 0.9860305014063514, + "learning_rate": 3.284641772793862e-09, + "loss": 0.0299, + "step": 7434 + }, + { + "epoch": 2.9696485623003195, + "grad_norm": 0.8472915574856456, + "learning_rate": 3.118375419458586e-09, + "loss": 0.0231, + "step": 7436 + }, + { + "epoch": 2.970447284345048, + "grad_norm": 0.9416785407994653, + "learning_rate": 2.956425900383919e-09, + "loss": 0.0333, + "step": 7438 + }, + { + "epoch": 2.9712460063897765, + "grad_norm": 0.7967253694302693, + "learning_rate": 2.798793355478502e-09, + "loss": 0.03, + "step": 7440 + }, + { + "epoch": 2.972044728434505, + "grad_norm": 0.9101321750296127, + "learning_rate": 2.6454779209217353e-09, + "loss": 0.0288, + "step": 7442 + }, + { + "epoch": 2.972843450479233, + "grad_norm": 0.960448163452539, + "learning_rate": 2.496479729164891e-09, + "loss": 0.0307, + "step": 7444 + }, + { + "epoch": 2.9736421725239617, + "grad_norm": 0.9953358583136821, + "learning_rate": 2.3517989089272274e-09, + "loss": 0.027, + "step": 7446 + }, + { + "epoch": 2.97444089456869, + "grad_norm": 0.9262672112649574, + "learning_rate": 2.2114355851993175e-09, + "loss": 0.0252, + "step": 7448 + }, + { + "epoch": 2.9752396166134183, + "grad_norm": 1.0161598052132237, + "learning_rate": 2.0753898792424954e-09, + "loss": 0.03, + "step": 7450 + }, + { + "epoch": 2.976038338658147, + "grad_norm": 0.8238758679662858, + "learning_rate": 1.943661908586636e-09, + "loss": 0.0226, + "step": 7452 + }, + { + "epoch": 2.9768370607028753, + "grad_norm": 1.1294195598874084, + "learning_rate": 1.81625178703293e-09, + "loss": 0.0314, + "step": 7454 + }, + { + "epoch": 2.977635782747604, + "grad_norm": 0.7948759771645315, + "learning_rate": 1.6931596246516636e-09, + "loss": 0.0237, + "step": 7456 + }, + { + "epoch": 2.9784345047923324, + "grad_norm": 0.9603638334498134, + "learning_rate": 1.5743855277822185e-09, + "loss": 0.0271, + "step": 7458 + }, + { + "epoch": 2.979233226837061, + "grad_norm": 1.004902038270927, + "learning_rate": 1.4599295990352924e-09, + "loss": 0.0274, + "step": 7460 + }, + { + "epoch": 2.980031948881789, + "grad_norm": 1.3727183415621365, + "learning_rate": 1.3497919372890135e-09, + "loss": 0.0299, + "step": 7462 + }, + { + "epoch": 2.9808306709265175, + "grad_norm": 1.0113889053741916, + "learning_rate": 1.24397263769227e-09, + "loss": 0.0334, + "step": 7464 + }, + { + "epoch": 2.981629392971246, + "grad_norm": 1.058301608314339, + "learning_rate": 1.1424717916630468e-09, + "loss": 0.0314, + "step": 7466 + }, + { + "epoch": 2.9824281150159746, + "grad_norm": 1.0685945832080852, + "learning_rate": 1.0452894868884235e-09, + "loss": 0.0287, + "step": 7468 + }, + { + "epoch": 2.9832268370607027, + "grad_norm": 0.9439988444316789, + "learning_rate": 9.52425807324575e-10, + "loss": 0.0279, + "step": 7470 + }, + { + "epoch": 2.984025559105431, + "grad_norm": 1.0539650853101212, + "learning_rate": 8.638808331973281e-10, + "loss": 0.0306, + "step": 7472 + }, + { + "epoch": 2.9848242811501597, + "grad_norm": 0.8544922347586041, + "learning_rate": 7.796546410004934e-10, + "loss": 0.0302, + "step": 7474 + }, + { + "epoch": 2.9856230031948883, + "grad_norm": 0.9303223740142429, + "learning_rate": 6.997473034986435e-10, + "loss": 0.0281, + "step": 7476 + }, + { + "epoch": 2.986421725239617, + "grad_norm": 0.9400919377169601, + "learning_rate": 6.241588897232253e-10, + "loss": 0.0276, + "step": 7478 + }, + { + "epoch": 2.987220447284345, + "grad_norm": 0.9420254495627837, + "learning_rate": 5.528894649758921e-10, + "loss": 0.0252, + "step": 7480 + }, + { + "epoch": 2.9880191693290734, + "grad_norm": 0.8844462225772538, + "learning_rate": 4.859390908268369e-10, + "loss": 0.0271, + "step": 7482 + }, + { + "epoch": 2.988817891373802, + "grad_norm": 0.9000038358376025, + "learning_rate": 4.2330782511423865e-10, + "loss": 0.0291, + "step": 7484 + }, + { + "epoch": 2.9896166134185305, + "grad_norm": 1.0116302989209616, + "learning_rate": 3.649957219464817e-10, + "loss": 0.0312, + "step": 7486 + }, + { + "epoch": 2.9904153354632586, + "grad_norm": 1.0168619061466087, + "learning_rate": 3.1100283169938074e-10, + "loss": 0.0257, + "step": 7488 + }, + { + "epoch": 2.991214057507987, + "grad_norm": 0.9317368624862982, + "learning_rate": 2.613292010172908e-10, + "loss": 0.0326, + "step": 7490 + }, + { + "epoch": 2.9920127795527156, + "grad_norm": 1.0576362124675651, + "learning_rate": 2.1597487281366236e-10, + "loss": 0.0279, + "step": 7492 + }, + { + "epoch": 2.992811501597444, + "grad_norm": 0.9509971386881086, + "learning_rate": 1.7493988627104164e-10, + "loss": 0.0318, + "step": 7494 + }, + { + "epoch": 2.9936102236421727, + "grad_norm": 0.8370706486653864, + "learning_rate": 1.3822427683884975e-10, + "loss": 0.0273, + "step": 7496 + }, + { + "epoch": 2.994408945686901, + "grad_norm": 1.0612898069636711, + "learning_rate": 1.0582807623671365e-10, + "loss": 0.0352, + "step": 7498 + }, + { + "epoch": 2.9952076677316293, + "grad_norm": 0.805238580681461, + "learning_rate": 7.775131245169044e-11, + "loss": 0.0258, + "step": 7500 + }, + { + "epoch": 2.9952076677316293, + "eval_loss": 0.18183551728725433, + "eval_runtime": 419.4635, + "eval_samples_per_second": 42.452, + "eval_steps_per_second": 5.307, + "step": 7500 + }, + { + "epoch": 2.996006389776358, + "grad_norm": 0.8616048488279973, + "learning_rate": 5.399400973882252e-11, + "loss": 0.0255, + "step": 7502 + }, + { + "epoch": 2.9968051118210863, + "grad_norm": 0.8828142582638174, + "learning_rate": 3.4556188622802964e-11, + "loss": 0.0298, + "step": 7504 + }, + { + "epoch": 2.997603833865815, + "grad_norm": 1.0530041672485366, + "learning_rate": 1.9437865895755027e-11, + "loss": 0.0281, + "step": 7506 + }, + { + "epoch": 2.998402555910543, + "grad_norm": 0.8820484624604644, + "learning_rate": 8.639054618897468e-12, + "loss": 0.0258, + "step": 7508 + }, + { + "epoch": 2.9992012779552715, + "grad_norm": 1.0521391566214802, + "learning_rate": 2.1597641214343714e-12, + "loss": 0.028, + "step": 7510 + }, + { + "epoch": 3.0, + "grad_norm": 1.0637471844248951, + "learning_rate": 0.0, + "loss": 0.0258, + "step": 7512 + }, + { + "epoch": 3.0, + "step": 7512, + "total_flos": 730578500321280.0, + "train_loss": 0.11755168595261023, + "train_runtime": 63778.2421, + "train_samples_per_second": 7.538, + "train_steps_per_second": 0.118 + } + ], + "logging_steps": 2, + "max_steps": 7512, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 730578500321280.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}