{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.4, "eval_steps": 12500, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032, "grad_norm": 3.001525402069092, "learning_rate": 8e-05, "loss": 13.1617, "step": 500 }, { "epoch": 0.064, "grad_norm": 0.5208349227905273, "learning_rate": 7.967871485943775e-05, "loss": 1.4028, "step": 1000 }, { "epoch": 0.096, "grad_norm": 0.43057793378829956, "learning_rate": 7.93574297188755e-05, "loss": 1.0426, "step": 1500 }, { "epoch": 0.128, "grad_norm": 0.53807532787323, "learning_rate": 7.903614457831325e-05, "loss": 0.8407, "step": 2000 }, { "epoch": 0.16, "grad_norm": 0.7076811194419861, "learning_rate": 7.8714859437751e-05, "loss": 0.7035, "step": 2500 }, { "epoch": 0.192, "grad_norm": 0.5321575999259949, "learning_rate": 7.839357429718876e-05, "loss": 0.4479, "step": 3000 }, { "epoch": 0.224, "grad_norm": 0.5099969506263733, "learning_rate": 7.807228915662652e-05, "loss": 0.4085, "step": 3500 }, { "epoch": 0.256, "grad_norm": 0.3821808099746704, "learning_rate": 7.775100401606426e-05, "loss": 0.3811, "step": 4000 }, { "epoch": 0.288, "grad_norm": 0.45157989859580994, "learning_rate": 7.742971887550202e-05, "loss": 0.3604, "step": 4500 }, { "epoch": 0.32, "grad_norm": 0.3053194582462311, "learning_rate": 7.710843373493976e-05, "loss": 0.3442, "step": 5000 }, { "epoch": 0.352, "grad_norm": 0.45309627056121826, "learning_rate": 7.678714859437751e-05, "loss": 0.3308, "step": 5500 }, { "epoch": 0.384, "grad_norm": 0.40403613448143005, "learning_rate": 7.646586345381526e-05, "loss": 0.3254, "step": 6000 }, { "epoch": 0.416, "grad_norm": 0.32140398025512695, "learning_rate": 7.614457831325302e-05, "loss": 0.3142, "step": 6500 }, { "epoch": 0.448, "grad_norm": 0.36259984970092773, "learning_rate": 7.582329317269078e-05, "loss": 0.304, "step": 7000 }, { "epoch": 0.48, "grad_norm": 27.76270294189453, "learning_rate": 7.550200803212852e-05, "loss": 0.7921, "step": 7500 }, { "epoch": 0.512, "grad_norm": 0.35605108737945557, "learning_rate": 7.518072289156628e-05, "loss": 0.4819, "step": 8000 }, { "epoch": 0.544, "grad_norm": 0.29919707775115967, "learning_rate": 7.485943775100402e-05, "loss": 0.2955, "step": 8500 }, { "epoch": 0.576, "grad_norm": 0.27910181879997253, "learning_rate": 7.453815261044178e-05, "loss": 0.2891, "step": 9000 }, { "epoch": 0.608, "grad_norm": 0.3084375858306885, "learning_rate": 7.421686746987952e-05, "loss": 0.2839, "step": 9500 }, { "epoch": 0.64, "grad_norm": 1.0127158164978027, "learning_rate": 7.389558232931728e-05, "loss": 0.2975, "step": 10000 }, { "epoch": 0.672, "grad_norm": 0.29704657196998596, "learning_rate": 7.357429718875502e-05, "loss": 0.2784, "step": 10500 }, { "epoch": 0.704, "grad_norm": 0.26352909207344055, "learning_rate": 7.325301204819278e-05, "loss": 0.2722, "step": 11000 }, { "epoch": 0.736, "grad_norm": 0.3012610077857971, "learning_rate": 7.293172690763053e-05, "loss": 0.2671, "step": 11500 }, { "epoch": 0.768, "grad_norm": 239.10655212402344, "learning_rate": 7.261044176706828e-05, "loss": 0.268, "step": 12000 }, { "epoch": 0.8, "grad_norm": 0.2739589214324951, "learning_rate": 7.228915662650603e-05, "loss": 0.2758, "step": 12500 }, { "epoch": 0.8, "eval_loss": 0.1865103393793106, "eval_runtime": 13.4887, "eval_samples_per_second": 148.272, "eval_steps_per_second": 6.227, "step": 12500 }, { "epoch": 0.832, "grad_norm": 0.3086440861225128, "learning_rate": 7.196787148594378e-05, "loss": 0.258, "step": 13000 }, { "epoch": 0.864, "grad_norm": 0.25051313638687134, "learning_rate": 7.164658634538153e-05, "loss": 0.255, "step": 13500 }, { "epoch": 0.896, "grad_norm": 0.2483261674642563, "learning_rate": 7.132530120481928e-05, "loss": 0.2505, "step": 14000 }, { "epoch": 0.928, "grad_norm": 0.2732410728931427, "learning_rate": 7.100401606425703e-05, "loss": 0.2456, "step": 14500 }, { "epoch": 0.96, "grad_norm": 0.2605254054069519, "learning_rate": 7.068273092369479e-05, "loss": 0.2487, "step": 15000 }, { "epoch": 0.992, "grad_norm": 0.294114887714386, "learning_rate": 7.036144578313253e-05, "loss": 0.2448, "step": 15500 }, { "epoch": 1.024, "grad_norm": 0.27836671471595764, "learning_rate": 7.004016064257029e-05, "loss": 0.2342, "step": 16000 }, { "epoch": 1.056, "grad_norm": 0.2619302272796631, "learning_rate": 6.971887550200803e-05, "loss": 0.2282, "step": 16500 }, { "epoch": 1.088, "grad_norm": 0.2845689058303833, "learning_rate": 6.939759036144579e-05, "loss": 0.2254, "step": 17000 }, { "epoch": 1.12, "grad_norm": 0.3349343538284302, "learning_rate": 6.907630522088353e-05, "loss": 0.2226, "step": 17500 }, { "epoch": 1.152, "grad_norm": 0.293632835149765, "learning_rate": 6.875502008032129e-05, "loss": 0.2195, "step": 18000 }, { "epoch": 1.184, "grad_norm": 0.2546076476573944, "learning_rate": 6.843373493975903e-05, "loss": 0.2212, "step": 18500 }, { "epoch": 1.216, "grad_norm": 0.25859034061431885, "learning_rate": 6.811244979919679e-05, "loss": 0.2197, "step": 19000 }, { "epoch": 1.248, "grad_norm": 0.25642773509025574, "learning_rate": 6.779116465863455e-05, "loss": 0.2167, "step": 19500 }, { "epoch": 1.28, "grad_norm": 0.25777679681777954, "learning_rate": 6.74698795180723e-05, "loss": 0.2171, "step": 20000 }, { "epoch": 1.312, "grad_norm": 0.28767868876457214, "learning_rate": 6.714859437751005e-05, "loss": 0.2122, "step": 20500 }, { "epoch": 1.3439999999999999, "grad_norm": 0.24645094573497772, "learning_rate": 6.682730923694779e-05, "loss": 0.2121, "step": 21000 }, { "epoch": 1.376, "grad_norm": 0.27973559498786926, "learning_rate": 6.650602409638555e-05, "loss": 0.209, "step": 21500 }, { "epoch": 1.408, "grad_norm": 0.2312808781862259, "learning_rate": 6.618473895582329e-05, "loss": 0.207, "step": 22000 }, { "epoch": 1.44, "grad_norm": 0.32198190689086914, "learning_rate": 6.586345381526105e-05, "loss": 0.2062, "step": 22500 }, { "epoch": 1.472, "grad_norm": 0.26178357005119324, "learning_rate": 6.55421686746988e-05, "loss": 0.2054, "step": 23000 }, { "epoch": 1.504, "grad_norm": 0.2309255599975586, "learning_rate": 6.522088353413655e-05, "loss": 0.2029, "step": 23500 }, { "epoch": 1.536, "grad_norm": 0.22608640789985657, "learning_rate": 6.48995983935743e-05, "loss": 0.2038, "step": 24000 }, { "epoch": 1.568, "grad_norm": 0.20961375534534454, "learning_rate": 6.457831325301206e-05, "loss": 0.2016, "step": 24500 }, { "epoch": 1.6, "grad_norm": 0.1852613389492035, "learning_rate": 6.42570281124498e-05, "loss": 0.2003, "step": 25000 }, { "epoch": 1.6, "eval_loss": 0.13480305671691895, "eval_runtime": 13.4963, "eval_samples_per_second": 148.189, "eval_steps_per_second": 6.224, "step": 25000 }, { "epoch": 1.6320000000000001, "grad_norm": 0.25356176495552063, "learning_rate": 6.393574297188755e-05, "loss": 0.1991, "step": 25500 }, { "epoch": 1.6640000000000001, "grad_norm": 0.23418042063713074, "learning_rate": 6.36144578313253e-05, "loss": 0.1993, "step": 26000 }, { "epoch": 1.696, "grad_norm": 0.4756409823894501, "learning_rate": 6.329317269076305e-05, "loss": 0.1962, "step": 26500 }, { "epoch": 1.728, "grad_norm": 0.31074902415275574, "learning_rate": 6.29718875502008e-05, "loss": 0.1946, "step": 27000 }, { "epoch": 1.76, "grad_norm": 0.2518517076969147, "learning_rate": 6.265060240963856e-05, "loss": 0.1936, "step": 27500 }, { "epoch": 1.792, "grad_norm": 0.23294122517108917, "learning_rate": 6.232931726907632e-05, "loss": 0.1931, "step": 28000 }, { "epoch": 1.8239999999999998, "grad_norm": 0.30960217118263245, "learning_rate": 6.200803212851406e-05, "loss": 0.1916, "step": 28500 }, { "epoch": 1.8559999999999999, "grad_norm": 0.2612380385398865, "learning_rate": 6.168674698795182e-05, "loss": 0.1941, "step": 29000 }, { "epoch": 1.888, "grad_norm": 0.23748044669628143, "learning_rate": 6.136546184738956e-05, "loss": 0.1912, "step": 29500 }, { "epoch": 1.92, "grad_norm": 0.21203479170799255, "learning_rate": 6.104417670682732e-05, "loss": 0.1905, "step": 30000 }, { "epoch": 1.952, "grad_norm": 0.23570066690444946, "learning_rate": 6.072289156626506e-05, "loss": 0.1882, "step": 30500 }, { "epoch": 1.984, "grad_norm": 0.24766422808170319, "learning_rate": 6.040160642570281e-05, "loss": 0.1908, "step": 31000 }, { "epoch": 2.016, "grad_norm": 0.21270865201950073, "learning_rate": 6.008032128514057e-05, "loss": 0.1835, "step": 31500 }, { "epoch": 2.048, "grad_norm": 0.2123081386089325, "learning_rate": 5.975903614457831e-05, "loss": 0.1765, "step": 32000 }, { "epoch": 2.08, "grad_norm": 0.3412201404571533, "learning_rate": 5.943775100401607e-05, "loss": 0.1931, "step": 32500 }, { "epoch": 2.112, "grad_norm": 0.2280445396900177, "learning_rate": 5.911646586345382e-05, "loss": 0.1893, "step": 33000 }, { "epoch": 2.144, "grad_norm": 0.2307668924331665, "learning_rate": 5.8795180722891576e-05, "loss": 0.1826, "step": 33500 }, { "epoch": 2.176, "grad_norm": 0.22051069140434265, "learning_rate": 5.847389558232932e-05, "loss": 0.1751, "step": 34000 }, { "epoch": 2.208, "grad_norm": 0.28806522488594055, "learning_rate": 5.8152610441767076e-05, "loss": 0.1771, "step": 34500 }, { "epoch": 2.24, "grad_norm": 0.19143815338611603, "learning_rate": 5.7831325301204826e-05, "loss": 0.1774, "step": 35000 }, { "epoch": 2.2720000000000002, "grad_norm": 0.2986365556716919, "learning_rate": 5.751004016064257e-05, "loss": 0.1752, "step": 35500 }, { "epoch": 2.304, "grad_norm": 0.19981278479099274, "learning_rate": 5.7188755020080326e-05, "loss": 0.1756, "step": 36000 }, { "epoch": 2.336, "grad_norm": 0.2546658515930176, "learning_rate": 5.6867469879518076e-05, "loss": 0.1741, "step": 36500 }, { "epoch": 2.368, "grad_norm": 0.1719229370355606, "learning_rate": 5.6546184738955826e-05, "loss": 0.174, "step": 37000 }, { "epoch": 2.4, "grad_norm": 0.22920982539653778, "learning_rate": 5.6224899598393576e-05, "loss": 0.1707, "step": 37500 }, { "epoch": 2.4, "eval_loss": 0.11245977133512497, "eval_runtime": 13.3722, "eval_samples_per_second": 149.565, "eval_steps_per_second": 6.282, "step": 37500 }, { "epoch": 2.432, "grad_norm": 0.22950303554534912, "learning_rate": 5.590361445783133e-05, "loss": 0.1715, "step": 38000 }, { "epoch": 2.464, "grad_norm": 0.19281500577926636, "learning_rate": 5.5582329317269076e-05, "loss": 0.1719, "step": 38500 }, { "epoch": 2.496, "grad_norm": 0.3043808043003082, "learning_rate": 5.526104417670683e-05, "loss": 0.1709, "step": 39000 }, { "epoch": 2.528, "grad_norm": 0.2020847350358963, "learning_rate": 5.493975903614458e-05, "loss": 0.1687, "step": 39500 }, { "epoch": 2.56, "grad_norm": 0.2701270878314972, "learning_rate": 5.461847389558234e-05, "loss": 0.1689, "step": 40000 }, { "epoch": 2.592, "grad_norm": 0.20151746273040771, "learning_rate": 5.429718875502008e-05, "loss": 0.1703, "step": 40500 }, { "epoch": 2.624, "grad_norm": 0.28613436222076416, "learning_rate": 5.397590361445783e-05, "loss": 0.1668, "step": 41000 }, { "epoch": 2.656, "grad_norm": 0.24828064441680908, "learning_rate": 5.365461847389559e-05, "loss": 0.1671, "step": 41500 }, { "epoch": 2.6879999999999997, "grad_norm": 0.2424352616071701, "learning_rate": 5.333333333333333e-05, "loss": 0.1678, "step": 42000 }, { "epoch": 2.7199999999999998, "grad_norm": 0.1992502510547638, "learning_rate": 5.301204819277109e-05, "loss": 0.165, "step": 42500 }, { "epoch": 2.752, "grad_norm": 0.19437122344970703, "learning_rate": 5.269076305220884e-05, "loss": 0.1658, "step": 43000 }, { "epoch": 2.784, "grad_norm": 0.2736295461654663, "learning_rate": 5.23694779116466e-05, "loss": 0.165, "step": 43500 }, { "epoch": 2.816, "grad_norm": 0.20108367502689362, "learning_rate": 5.204819277108434e-05, "loss": 0.1661, "step": 44000 }, { "epoch": 2.848, "grad_norm": 0.17667348682880402, "learning_rate": 5.17269076305221e-05, "loss": 0.163, "step": 44500 }, { "epoch": 2.88, "grad_norm": 0.24210619926452637, "learning_rate": 5.140562248995984e-05, "loss": 0.1634, "step": 45000 }, { "epoch": 2.912, "grad_norm": 0.2011321634054184, "learning_rate": 5.108433734939759e-05, "loss": 0.1634, "step": 45500 }, { "epoch": 2.944, "grad_norm": 0.1979280710220337, "learning_rate": 5.076305220883535e-05, "loss": 0.162, "step": 46000 }, { "epoch": 2.976, "grad_norm": 0.23149700462818146, "learning_rate": 5.044176706827309e-05, "loss": 0.1605, "step": 46500 }, { "epoch": 3.008, "grad_norm": 0.1558334231376648, "learning_rate": 5.012048192771085e-05, "loss": 0.1575, "step": 47000 }, { "epoch": 3.04, "grad_norm": 0.251295268535614, "learning_rate": 4.97991967871486e-05, "loss": 0.1518, "step": 47500 }, { "epoch": 3.072, "grad_norm": 0.1870911866426468, "learning_rate": 4.9477911646586354e-05, "loss": 0.151, "step": 48000 }, { "epoch": 3.104, "grad_norm": 0.19055207073688507, "learning_rate": 4.91566265060241e-05, "loss": 0.1506, "step": 48500 }, { "epoch": 3.136, "grad_norm": 0.19039425253868103, "learning_rate": 4.8835341365461854e-05, "loss": 0.1502, "step": 49000 }, { "epoch": 3.168, "grad_norm": 0.2273644655942917, "learning_rate": 4.8514056224899604e-05, "loss": 0.1515, "step": 49500 }, { "epoch": 3.2, "grad_norm": 0.20844458043575287, "learning_rate": 4.819277108433736e-05, "loss": 0.15, "step": 50000 }, { "epoch": 3.2, "eval_loss": 0.09431243687868118, "eval_runtime": 13.3841, "eval_samples_per_second": 149.431, "eval_steps_per_second": 6.276, "step": 50000 }, { "epoch": 3.232, "grad_norm": 0.21888093650341034, "learning_rate": 4.7871485943775104e-05, "loss": 0.1514, "step": 50500 }, { "epoch": 3.2640000000000002, "grad_norm": 0.1797676682472229, "learning_rate": 4.7550200803212854e-05, "loss": 0.1505, "step": 51000 }, { "epoch": 3.296, "grad_norm": 0.17472127079963684, "learning_rate": 4.7228915662650604e-05, "loss": 0.15, "step": 51500 }, { "epoch": 3.328, "grad_norm": 0.1877003014087677, "learning_rate": 4.6907630522088354e-05, "loss": 0.1501, "step": 52000 }, { "epoch": 3.36, "grad_norm": 0.2039192020893097, "learning_rate": 4.658634538152611e-05, "loss": 0.1509, "step": 52500 }, { "epoch": 3.392, "grad_norm": 0.19264955818653107, "learning_rate": 4.6265060240963854e-05, "loss": 0.147, "step": 53000 }, { "epoch": 3.424, "grad_norm": 0.2461443841457367, "learning_rate": 4.594377510040161e-05, "loss": 0.1473, "step": 53500 }, { "epoch": 3.456, "grad_norm": 0.245579794049263, "learning_rate": 4.562248995983936e-05, "loss": 0.148, "step": 54000 }, { "epoch": 3.488, "grad_norm": 0.2419605553150177, "learning_rate": 4.530120481927712e-05, "loss": 0.1483, "step": 54500 }, { "epoch": 3.52, "grad_norm": 0.21280065178871155, "learning_rate": 4.497991967871486e-05, "loss": 0.1465, "step": 55000 }, { "epoch": 3.552, "grad_norm": 0.19970615208148956, "learning_rate": 4.465863453815261e-05, "loss": 0.1479, "step": 55500 }, { "epoch": 3.584, "grad_norm": 0.178068608045578, "learning_rate": 4.433734939759037e-05, "loss": 0.1469, "step": 56000 }, { "epoch": 3.616, "grad_norm": 0.2007550597190857, "learning_rate": 4.401606425702811e-05, "loss": 0.1484, "step": 56500 }, { "epoch": 3.648, "grad_norm": 0.2703693211078644, "learning_rate": 4.369477911646587e-05, "loss": 0.1461, "step": 57000 }, { "epoch": 3.68, "grad_norm": 0.18838930130004883, "learning_rate": 4.337349397590362e-05, "loss": 0.1467, "step": 57500 }, { "epoch": 3.7119999999999997, "grad_norm": 0.22705510258674622, "learning_rate": 4.3052208835341375e-05, "loss": 0.1471, "step": 58000 }, { "epoch": 3.7439999999999998, "grad_norm": 0.18866540491580963, "learning_rate": 4.273092369477912e-05, "loss": 0.1464, "step": 58500 }, { "epoch": 3.776, "grad_norm": 0.16808009147644043, "learning_rate": 4.2409638554216875e-05, "loss": 0.1469, "step": 59000 }, { "epoch": 3.808, "grad_norm": 0.20790338516235352, "learning_rate": 4.208835341365462e-05, "loss": 0.1455, "step": 59500 }, { "epoch": 3.84, "grad_norm": 0.20283524692058563, "learning_rate": 4.1767068273092375e-05, "loss": 0.1453, "step": 60000 }, { "epoch": 3.872, "grad_norm": 0.18522211909294128, "learning_rate": 4.1445783132530125e-05, "loss": 0.1449, "step": 60500 }, { "epoch": 3.904, "grad_norm": 0.2298567146062851, "learning_rate": 4.112449799196787e-05, "loss": 0.1442, "step": 61000 }, { "epoch": 3.936, "grad_norm": 0.23237478733062744, "learning_rate": 4.0803212851405625e-05, "loss": 0.1445, "step": 61500 }, { "epoch": 3.968, "grad_norm": 0.18708902597427368, "learning_rate": 4.0481927710843375e-05, "loss": 0.1436, "step": 62000 }, { "epoch": 4.0, "grad_norm": 0.2235335260629654, "learning_rate": 4.016064257028113e-05, "loss": 0.145, "step": 62500 }, { "epoch": 4.0, "eval_loss": 0.08151204138994217, "eval_runtime": 13.1737, "eval_samples_per_second": 151.818, "eval_steps_per_second": 6.376, "step": 62500 }, { "epoch": 4.032, "grad_norm": 0.27586033940315247, "learning_rate": 3.9839357429718875e-05, "loss": 0.1361, "step": 63000 }, { "epoch": 4.064, "grad_norm": 0.20399342477321625, "learning_rate": 3.9518072289156625e-05, "loss": 0.1359, "step": 63500 }, { "epoch": 4.096, "grad_norm": 0.2167077660560608, "learning_rate": 3.919678714859438e-05, "loss": 0.1349, "step": 64000 }, { "epoch": 4.128, "grad_norm": 0.2217278927564621, "learning_rate": 3.887550200803213e-05, "loss": 0.1343, "step": 64500 }, { "epoch": 4.16, "grad_norm": 0.17411163449287415, "learning_rate": 3.855421686746988e-05, "loss": 0.1353, "step": 65000 }, { "epoch": 4.192, "grad_norm": 0.2262706607580185, "learning_rate": 3.823293172690763e-05, "loss": 0.1355, "step": 65500 }, { "epoch": 4.224, "grad_norm": 0.21051813662052155, "learning_rate": 3.791164658634539e-05, "loss": 0.1351, "step": 66000 }, { "epoch": 4.256, "grad_norm": 0.2202002853155136, "learning_rate": 3.759036144578314e-05, "loss": 0.1362, "step": 66500 }, { "epoch": 4.288, "grad_norm": 0.1532248854637146, "learning_rate": 3.726907630522089e-05, "loss": 0.1338, "step": 67000 }, { "epoch": 4.32, "grad_norm": 0.25249359011650085, "learning_rate": 3.694779116465864e-05, "loss": 0.1334, "step": 67500 }, { "epoch": 4.352, "grad_norm": 0.2392909973859787, "learning_rate": 3.662650602409639e-05, "loss": 0.1354, "step": 68000 }, { "epoch": 4.384, "grad_norm": 0.3180345892906189, "learning_rate": 3.630522088353414e-05, "loss": 0.1353, "step": 68500 }, { "epoch": 4.416, "grad_norm": 0.27343523502349854, "learning_rate": 3.598393574297189e-05, "loss": 0.1354, "step": 69000 }, { "epoch": 4.448, "grad_norm": 0.17806372046470642, "learning_rate": 3.566265060240964e-05, "loss": 0.1351, "step": 69500 }, { "epoch": 4.48, "grad_norm": 0.17694541811943054, "learning_rate": 3.5341365461847396e-05, "loss": 0.135, "step": 70000 }, { "epoch": 4.5120000000000005, "grad_norm": 0.1796264797449112, "learning_rate": 3.5020080321285146e-05, "loss": 0.1326, "step": 70500 }, { "epoch": 4.5440000000000005, "grad_norm": 0.16896295547485352, "learning_rate": 3.4698795180722896e-05, "loss": 0.1327, "step": 71000 }, { "epoch": 4.576, "grad_norm": 0.16427506506443024, "learning_rate": 3.4377510040160646e-05, "loss": 0.1341, "step": 71500 }, { "epoch": 4.608, "grad_norm": 0.15947696566581726, "learning_rate": 3.4056224899598396e-05, "loss": 0.1336, "step": 72000 }, { "epoch": 4.64, "grad_norm": 0.24063943326473236, "learning_rate": 3.373493975903615e-05, "loss": 0.1326, "step": 72500 }, { "epoch": 4.672, "grad_norm": 0.2784833610057831, "learning_rate": 3.3413654618473896e-05, "loss": 0.1313, "step": 73000 }, { "epoch": 4.704, "grad_norm": 0.1624738872051239, "learning_rate": 3.3092369477911646e-05, "loss": 0.1343, "step": 73500 }, { "epoch": 4.736, "grad_norm": 0.163747176527977, "learning_rate": 3.27710843373494e-05, "loss": 0.1332, "step": 74000 }, { "epoch": 4.768, "grad_norm": 0.19286634027957916, "learning_rate": 3.244979919678715e-05, "loss": 0.1314, "step": 74500 }, { "epoch": 4.8, "grad_norm": 0.2070024311542511, "learning_rate": 3.21285140562249e-05, "loss": 0.1323, "step": 75000 }, { "epoch": 4.8, "eval_loss": 0.07321055233478546, "eval_runtime": 13.3287, "eval_samples_per_second": 150.052, "eval_steps_per_second": 6.302, "step": 75000 }, { "epoch": 4.832, "grad_norm": 0.25665566325187683, "learning_rate": 3.180722891566265e-05, "loss": 0.133, "step": 75500 }, { "epoch": 4.864, "grad_norm": 0.18933174014091492, "learning_rate": 3.14859437751004e-05, "loss": 0.1334, "step": 76000 }, { "epoch": 4.896, "grad_norm": 0.20106372237205505, "learning_rate": 3.116465863453816e-05, "loss": 0.1318, "step": 76500 }, { "epoch": 4.928, "grad_norm": 0.23300665616989136, "learning_rate": 3.084337349397591e-05, "loss": 0.1314, "step": 77000 }, { "epoch": 4.96, "grad_norm": 0.2776864171028137, "learning_rate": 3.052208835341366e-05, "loss": 0.1307, "step": 77500 }, { "epoch": 4.992, "grad_norm": 0.17158937454223633, "learning_rate": 3.0200803212851406e-05, "loss": 0.1328, "step": 78000 }, { "epoch": 5.024, "grad_norm": 0.19341541826725006, "learning_rate": 2.9879518072289156e-05, "loss": 0.1255, "step": 78500 }, { "epoch": 5.056, "grad_norm": 0.1820104718208313, "learning_rate": 2.955823293172691e-05, "loss": 0.1239, "step": 79000 }, { "epoch": 5.088, "grad_norm": 0.19204023480415344, "learning_rate": 2.923694779116466e-05, "loss": 0.1251, "step": 79500 }, { "epoch": 5.12, "grad_norm": 0.20374265313148499, "learning_rate": 2.8915662650602413e-05, "loss": 0.1238, "step": 80000 }, { "epoch": 5.152, "grad_norm": 0.17607811093330383, "learning_rate": 2.8594377510040163e-05, "loss": 0.1242, "step": 80500 }, { "epoch": 5.184, "grad_norm": 0.18394358456134796, "learning_rate": 2.8273092369477913e-05, "loss": 0.1246, "step": 81000 }, { "epoch": 5.216, "grad_norm": 0.1720678061246872, "learning_rate": 2.7951807228915666e-05, "loss": 0.1234, "step": 81500 }, { "epoch": 5.248, "grad_norm": 0.1649816781282425, "learning_rate": 2.7630522088353417e-05, "loss": 0.1246, "step": 82000 }, { "epoch": 5.28, "grad_norm": 0.24089868366718292, "learning_rate": 2.730923694779117e-05, "loss": 0.1236, "step": 82500 }, { "epoch": 5.312, "grad_norm": 0.16703809797763824, "learning_rate": 2.6987951807228917e-05, "loss": 0.1235, "step": 83000 }, { "epoch": 5.344, "grad_norm": 0.2375577837228775, "learning_rate": 2.6666666666666667e-05, "loss": 0.1255, "step": 83500 }, { "epoch": 5.376, "grad_norm": 0.20844422280788422, "learning_rate": 2.634538152610442e-05, "loss": 0.124, "step": 84000 }, { "epoch": 5.408, "grad_norm": 0.15090999007225037, "learning_rate": 2.602409638554217e-05, "loss": 0.1236, "step": 84500 }, { "epoch": 5.44, "grad_norm": 0.1982196420431137, "learning_rate": 2.570281124497992e-05, "loss": 0.1234, "step": 85000 }, { "epoch": 5.4719999999999995, "grad_norm": 0.20548874139785767, "learning_rate": 2.5381526104417673e-05, "loss": 0.1237, "step": 85500 }, { "epoch": 5.504, "grad_norm": 0.1553628295660019, "learning_rate": 2.5060240963855423e-05, "loss": 0.1241, "step": 86000 }, { "epoch": 5.536, "grad_norm": 0.20827996730804443, "learning_rate": 2.4738955823293177e-05, "loss": 0.1236, "step": 86500 }, { "epoch": 5.568, "grad_norm": 0.26525431871414185, "learning_rate": 2.4417670682730927e-05, "loss": 0.1244, "step": 87000 }, { "epoch": 5.6, "grad_norm": 0.16824448108673096, "learning_rate": 2.409638554216868e-05, "loss": 0.124, "step": 87500 }, { "epoch": 5.6, "eval_loss": 0.06600421667098999, "eval_runtime": 13.2001, "eval_samples_per_second": 151.514, "eval_steps_per_second": 6.364, "step": 87500 }, { "epoch": 5.632, "grad_norm": 0.19588659703731537, "learning_rate": 2.3775100401606427e-05, "loss": 0.1239, "step": 88000 }, { "epoch": 5.664, "grad_norm": 0.20829927921295166, "learning_rate": 2.3453815261044177e-05, "loss": 0.1236, "step": 88500 }, { "epoch": 5.696, "grad_norm": 0.17067208886146545, "learning_rate": 2.3132530120481927e-05, "loss": 0.1246, "step": 89000 }, { "epoch": 5.728, "grad_norm": 0.17796407639980316, "learning_rate": 2.281124497991968e-05, "loss": 0.123, "step": 89500 }, { "epoch": 5.76, "grad_norm": 0.17811580002307892, "learning_rate": 2.248995983935743e-05, "loss": 0.1247, "step": 90000 }, { "epoch": 5.792, "grad_norm": 0.16586844623088837, "learning_rate": 2.2168674698795184e-05, "loss": 0.1226, "step": 90500 }, { "epoch": 5.824, "grad_norm": 0.16634885966777802, "learning_rate": 2.1847389558232934e-05, "loss": 0.1239, "step": 91000 }, { "epoch": 5.856, "grad_norm": 0.1891159564256668, "learning_rate": 2.1526104417670687e-05, "loss": 0.1234, "step": 91500 }, { "epoch": 5.888, "grad_norm": 0.2127494215965271, "learning_rate": 2.1204819277108437e-05, "loss": 0.1227, "step": 92000 }, { "epoch": 5.92, "grad_norm": 0.15109600126743317, "learning_rate": 2.0883534136546187e-05, "loss": 0.1218, "step": 92500 }, { "epoch": 5.952, "grad_norm": 0.14382487535476685, "learning_rate": 2.0562248995983934e-05, "loss": 0.1216, "step": 93000 }, { "epoch": 5.984, "grad_norm": 0.2122729867696762, "learning_rate": 2.0240963855421687e-05, "loss": 0.1248, "step": 93500 }, { "epoch": 6.016, "grad_norm": 0.19725599884986877, "learning_rate": 1.9919678714859437e-05, "loss": 0.12, "step": 94000 }, { "epoch": 6.048, "grad_norm": 0.18302911520004272, "learning_rate": 1.959839357429719e-05, "loss": 0.1174, "step": 94500 }, { "epoch": 6.08, "grad_norm": 0.19952338933944702, "learning_rate": 1.927710843373494e-05, "loss": 0.1176, "step": 95000 }, { "epoch": 6.112, "grad_norm": 0.16824807226657867, "learning_rate": 1.8955823293172694e-05, "loss": 0.1174, "step": 95500 }, { "epoch": 6.144, "grad_norm": 0.22110256552696228, "learning_rate": 1.8634538152610444e-05, "loss": 0.1172, "step": 96000 }, { "epoch": 6.176, "grad_norm": 0.20097705721855164, "learning_rate": 1.8313253012048194e-05, "loss": 0.1171, "step": 96500 }, { "epoch": 6.208, "grad_norm": 0.1569439023733139, "learning_rate": 1.7991967871485944e-05, "loss": 0.1177, "step": 97000 }, { "epoch": 6.24, "grad_norm": 0.1618974804878235, "learning_rate": 1.7670682730923698e-05, "loss": 0.1169, "step": 97500 }, { "epoch": 6.272, "grad_norm": 0.2146083265542984, "learning_rate": 1.7349397590361448e-05, "loss": 0.1172, "step": 98000 }, { "epoch": 6.304, "grad_norm": 0.16460789740085602, "learning_rate": 1.7028112449799198e-05, "loss": 0.1166, "step": 98500 }, { "epoch": 6.336, "grad_norm": 0.20428918302059174, "learning_rate": 1.6706827309236948e-05, "loss": 0.1177, "step": 99000 }, { "epoch": 6.368, "grad_norm": 0.1996203064918518, "learning_rate": 1.63855421686747e-05, "loss": 0.1172, "step": 99500 }, { "epoch": 6.4, "grad_norm": 0.22684213519096375, "learning_rate": 1.606425702811245e-05, "loss": 0.118, "step": 100000 }, { "epoch": 6.4, "eval_loss": 0.061658285558223724, "eval_runtime": 13.1578, "eval_samples_per_second": 152.001, "eval_steps_per_second": 6.384, "step": 100000 } ], "logging_steps": 500, "max_steps": 125000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 12500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.459712365560463e+18, "train_batch_size": 24, "trial_name": null, "trial_params": null }