| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.4, | |
| "eval_steps": 12500, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 3.001525402069092, | |
| "learning_rate": 8e-05, | |
| "loss": 13.1617, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.5208349227905273, | |
| "learning_rate": 7.967871485943775e-05, | |
| "loss": 1.4028, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.43057793378829956, | |
| "learning_rate": 7.93574297188755e-05, | |
| "loss": 1.0426, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.53807532787323, | |
| "learning_rate": 7.903614457831325e-05, | |
| "loss": 0.8407, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.7076811194419861, | |
| "learning_rate": 7.8714859437751e-05, | |
| "loss": 0.7035, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.5321575999259949, | |
| "learning_rate": 7.839357429718876e-05, | |
| "loss": 0.4479, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.5099969506263733, | |
| "learning_rate": 7.807228915662652e-05, | |
| "loss": 0.4085, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.3821808099746704, | |
| "learning_rate": 7.775100401606426e-05, | |
| "loss": 0.3811, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.45157989859580994, | |
| "learning_rate": 7.742971887550202e-05, | |
| "loss": 0.3604, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.3053194582462311, | |
| "learning_rate": 7.710843373493976e-05, | |
| "loss": 0.3442, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.45309627056121826, | |
| "learning_rate": 7.678714859437751e-05, | |
| "loss": 0.3308, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.40403613448143005, | |
| "learning_rate": 7.646586345381526e-05, | |
| "loss": 0.3254, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.32140398025512695, | |
| "learning_rate": 7.614457831325302e-05, | |
| "loss": 0.3142, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.36259984970092773, | |
| "learning_rate": 7.582329317269078e-05, | |
| "loss": 0.304, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 27.76270294189453, | |
| "learning_rate": 7.550200803212852e-05, | |
| "loss": 0.7921, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.35605108737945557, | |
| "learning_rate": 7.518072289156628e-05, | |
| "loss": 0.4819, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.29919707775115967, | |
| "learning_rate": 7.485943775100402e-05, | |
| "loss": 0.2955, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.27910181879997253, | |
| "learning_rate": 7.453815261044178e-05, | |
| "loss": 0.2891, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.3084375858306885, | |
| "learning_rate": 7.421686746987952e-05, | |
| "loss": 0.2839, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.0127158164978027, | |
| "learning_rate": 7.389558232931728e-05, | |
| "loss": 0.2975, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.29704657196998596, | |
| "learning_rate": 7.357429718875502e-05, | |
| "loss": 0.2784, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.26352909207344055, | |
| "learning_rate": 7.325301204819278e-05, | |
| "loss": 0.2722, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.3012610077857971, | |
| "learning_rate": 7.293172690763053e-05, | |
| "loss": 0.2671, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 239.10655212402344, | |
| "learning_rate": 7.261044176706828e-05, | |
| "loss": 0.268, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.2739589214324951, | |
| "learning_rate": 7.228915662650603e-05, | |
| "loss": 0.2758, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 0.1865103393793106, | |
| "eval_runtime": 13.4887, | |
| "eval_samples_per_second": 148.272, | |
| "eval_steps_per_second": 6.227, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.3086440861225128, | |
| "learning_rate": 7.196787148594378e-05, | |
| "loss": 0.258, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.25051313638687134, | |
| "learning_rate": 7.164658634538153e-05, | |
| "loss": 0.255, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.2483261674642563, | |
| "learning_rate": 7.132530120481928e-05, | |
| "loss": 0.2505, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.2732410728931427, | |
| "learning_rate": 7.100401606425703e-05, | |
| "loss": 0.2456, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.2605254054069519, | |
| "learning_rate": 7.068273092369479e-05, | |
| "loss": 0.2487, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 0.294114887714386, | |
| "learning_rate": 7.036144578313253e-05, | |
| "loss": 0.2448, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 0.27836671471595764, | |
| "learning_rate": 7.004016064257029e-05, | |
| "loss": 0.2342, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 0.2619302272796631, | |
| "learning_rate": 6.971887550200803e-05, | |
| "loss": 0.2282, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 0.2845689058303833, | |
| "learning_rate": 6.939759036144579e-05, | |
| "loss": 0.2254, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.3349343538284302, | |
| "learning_rate": 6.907630522088353e-05, | |
| "loss": 0.2226, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 0.293632835149765, | |
| "learning_rate": 6.875502008032129e-05, | |
| "loss": 0.2195, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 0.2546076476573944, | |
| "learning_rate": 6.843373493975903e-05, | |
| "loss": 0.2212, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 0.25859034061431885, | |
| "learning_rate": 6.811244979919679e-05, | |
| "loss": 0.2197, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 0.25642773509025574, | |
| "learning_rate": 6.779116465863455e-05, | |
| "loss": 0.2167, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 0.25777679681777954, | |
| "learning_rate": 6.74698795180723e-05, | |
| "loss": 0.2171, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 0.28767868876457214, | |
| "learning_rate": 6.714859437751005e-05, | |
| "loss": 0.2122, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 0.24645094573497772, | |
| "learning_rate": 6.682730923694779e-05, | |
| "loss": 0.2121, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 0.27973559498786926, | |
| "learning_rate": 6.650602409638555e-05, | |
| "loss": 0.209, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 0.2312808781862259, | |
| "learning_rate": 6.618473895582329e-05, | |
| "loss": 0.207, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 0.32198190689086914, | |
| "learning_rate": 6.586345381526105e-05, | |
| "loss": 0.2062, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 0.26178357005119324, | |
| "learning_rate": 6.55421686746988e-05, | |
| "loss": 0.2054, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 0.2309255599975586, | |
| "learning_rate": 6.522088353413655e-05, | |
| "loss": 0.2029, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 0.22608640789985657, | |
| "learning_rate": 6.48995983935743e-05, | |
| "loss": 0.2038, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 0.20961375534534454, | |
| "learning_rate": 6.457831325301206e-05, | |
| "loss": 0.2016, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.1852613389492035, | |
| "learning_rate": 6.42570281124498e-05, | |
| "loss": 0.2003, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 0.13480305671691895, | |
| "eval_runtime": 13.4963, | |
| "eval_samples_per_second": 148.189, | |
| "eval_steps_per_second": 6.224, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 0.25356176495552063, | |
| "learning_rate": 6.393574297188755e-05, | |
| "loss": 0.1991, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 0.23418042063713074, | |
| "learning_rate": 6.36144578313253e-05, | |
| "loss": 0.1993, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 0.4756409823894501, | |
| "learning_rate": 6.329317269076305e-05, | |
| "loss": 0.1962, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 0.31074902415275574, | |
| "learning_rate": 6.29718875502008e-05, | |
| "loss": 0.1946, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.2518517076969147, | |
| "learning_rate": 6.265060240963856e-05, | |
| "loss": 0.1936, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 0.23294122517108917, | |
| "learning_rate": 6.232931726907632e-05, | |
| "loss": 0.1931, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 0.30960217118263245, | |
| "learning_rate": 6.200803212851406e-05, | |
| "loss": 0.1916, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 0.2612380385398865, | |
| "learning_rate": 6.168674698795182e-05, | |
| "loss": 0.1941, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 0.23748044669628143, | |
| "learning_rate": 6.136546184738956e-05, | |
| "loss": 0.1912, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.21203479170799255, | |
| "learning_rate": 6.104417670682732e-05, | |
| "loss": 0.1905, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 0.23570066690444946, | |
| "learning_rate": 6.072289156626506e-05, | |
| "loss": 0.1882, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 0.24766422808170319, | |
| "learning_rate": 6.040160642570281e-05, | |
| "loss": 0.1908, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 0.21270865201950073, | |
| "learning_rate": 6.008032128514057e-05, | |
| "loss": 0.1835, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 0.2123081386089325, | |
| "learning_rate": 5.975903614457831e-05, | |
| "loss": 0.1765, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.3412201404571533, | |
| "learning_rate": 5.943775100401607e-05, | |
| "loss": 0.1931, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 0.2280445396900177, | |
| "learning_rate": 5.911646586345382e-05, | |
| "loss": 0.1893, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 0.2307668924331665, | |
| "learning_rate": 5.8795180722891576e-05, | |
| "loss": 0.1826, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 0.22051069140434265, | |
| "learning_rate": 5.847389558232932e-05, | |
| "loss": 0.1751, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 0.28806522488594055, | |
| "learning_rate": 5.8152610441767076e-05, | |
| "loss": 0.1771, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.19143815338611603, | |
| "learning_rate": 5.7831325301204826e-05, | |
| "loss": 0.1774, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 0.2986365556716919, | |
| "learning_rate": 5.751004016064257e-05, | |
| "loss": 0.1752, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 0.19981278479099274, | |
| "learning_rate": 5.7188755020080326e-05, | |
| "loss": 0.1756, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 0.2546658515930176, | |
| "learning_rate": 5.6867469879518076e-05, | |
| "loss": 0.1741, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 0.1719229370355606, | |
| "learning_rate": 5.6546184738955826e-05, | |
| "loss": 0.174, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.22920982539653778, | |
| "learning_rate": 5.6224899598393576e-05, | |
| "loss": 0.1707, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 0.11245977133512497, | |
| "eval_runtime": 13.3722, | |
| "eval_samples_per_second": 149.565, | |
| "eval_steps_per_second": 6.282, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 0.22950303554534912, | |
| "learning_rate": 5.590361445783133e-05, | |
| "loss": 0.1715, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 0.19281500577926636, | |
| "learning_rate": 5.5582329317269076e-05, | |
| "loss": 0.1719, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 0.3043808043003082, | |
| "learning_rate": 5.526104417670683e-05, | |
| "loss": 0.1709, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 0.2020847350358963, | |
| "learning_rate": 5.493975903614458e-05, | |
| "loss": 0.1687, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.2701270878314972, | |
| "learning_rate": 5.461847389558234e-05, | |
| "loss": 0.1689, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 0.20151746273040771, | |
| "learning_rate": 5.429718875502008e-05, | |
| "loss": 0.1703, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 0.28613436222076416, | |
| "learning_rate": 5.397590361445783e-05, | |
| "loss": 0.1668, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": 0.24828064441680908, | |
| "learning_rate": 5.365461847389559e-05, | |
| "loss": 0.1671, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 0.2424352616071701, | |
| "learning_rate": 5.333333333333333e-05, | |
| "loss": 0.1678, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 0.1992502510547638, | |
| "learning_rate": 5.301204819277109e-05, | |
| "loss": 0.165, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": 0.19437122344970703, | |
| "learning_rate": 5.269076305220884e-05, | |
| "loss": 0.1658, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 0.2736295461654663, | |
| "learning_rate": 5.23694779116466e-05, | |
| "loss": 0.165, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": 0.20108367502689362, | |
| "learning_rate": 5.204819277108434e-05, | |
| "loss": 0.1661, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": 0.17667348682880402, | |
| "learning_rate": 5.17269076305221e-05, | |
| "loss": 0.163, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 0.24210619926452637, | |
| "learning_rate": 5.140562248995984e-05, | |
| "loss": 0.1634, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": 0.2011321634054184, | |
| "learning_rate": 5.108433734939759e-05, | |
| "loss": 0.1634, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 0.1979280710220337, | |
| "learning_rate": 5.076305220883535e-05, | |
| "loss": 0.162, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": 0.23149700462818146, | |
| "learning_rate": 5.044176706827309e-05, | |
| "loss": 0.1605, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 3.008, | |
| "grad_norm": 0.1558334231376648, | |
| "learning_rate": 5.012048192771085e-05, | |
| "loss": 0.1575, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 0.251295268535614, | |
| "learning_rate": 4.97991967871486e-05, | |
| "loss": 0.1518, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 3.072, | |
| "grad_norm": 0.1870911866426468, | |
| "learning_rate": 4.9477911646586354e-05, | |
| "loss": 0.151, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 3.104, | |
| "grad_norm": 0.19055207073688507, | |
| "learning_rate": 4.91566265060241e-05, | |
| "loss": 0.1506, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 3.136, | |
| "grad_norm": 0.19039425253868103, | |
| "learning_rate": 4.8835341365461854e-05, | |
| "loss": 0.1502, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 3.168, | |
| "grad_norm": 0.2273644655942917, | |
| "learning_rate": 4.8514056224899604e-05, | |
| "loss": 0.1515, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.20844458043575287, | |
| "learning_rate": 4.819277108433736e-05, | |
| "loss": 0.15, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_loss": 0.09431243687868118, | |
| "eval_runtime": 13.3841, | |
| "eval_samples_per_second": 149.431, | |
| "eval_steps_per_second": 6.276, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 3.232, | |
| "grad_norm": 0.21888093650341034, | |
| "learning_rate": 4.7871485943775104e-05, | |
| "loss": 0.1514, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 3.2640000000000002, | |
| "grad_norm": 0.1797676682472229, | |
| "learning_rate": 4.7550200803212854e-05, | |
| "loss": 0.1505, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 3.296, | |
| "grad_norm": 0.17472127079963684, | |
| "learning_rate": 4.7228915662650604e-05, | |
| "loss": 0.15, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 3.328, | |
| "grad_norm": 0.1877003014087677, | |
| "learning_rate": 4.6907630522088354e-05, | |
| "loss": 0.1501, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.2039192020893097, | |
| "learning_rate": 4.658634538152611e-05, | |
| "loss": 0.1509, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 3.392, | |
| "grad_norm": 0.19264955818653107, | |
| "learning_rate": 4.6265060240963854e-05, | |
| "loss": 0.147, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 3.424, | |
| "grad_norm": 0.2461443841457367, | |
| "learning_rate": 4.594377510040161e-05, | |
| "loss": 0.1473, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 3.456, | |
| "grad_norm": 0.245579794049263, | |
| "learning_rate": 4.562248995983936e-05, | |
| "loss": 0.148, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 3.488, | |
| "grad_norm": 0.2419605553150177, | |
| "learning_rate": 4.530120481927712e-05, | |
| "loss": 0.1483, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.21280065178871155, | |
| "learning_rate": 4.497991967871486e-05, | |
| "loss": 0.1465, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 3.552, | |
| "grad_norm": 0.19970615208148956, | |
| "learning_rate": 4.465863453815261e-05, | |
| "loss": 0.1479, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 3.584, | |
| "grad_norm": 0.178068608045578, | |
| "learning_rate": 4.433734939759037e-05, | |
| "loss": 0.1469, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 3.616, | |
| "grad_norm": 0.2007550597190857, | |
| "learning_rate": 4.401606425702811e-05, | |
| "loss": 0.1484, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 3.648, | |
| "grad_norm": 0.2703693211078644, | |
| "learning_rate": 4.369477911646587e-05, | |
| "loss": 0.1461, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 0.18838930130004883, | |
| "learning_rate": 4.337349397590362e-05, | |
| "loss": 0.1467, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 3.7119999999999997, | |
| "grad_norm": 0.22705510258674622, | |
| "learning_rate": 4.3052208835341375e-05, | |
| "loss": 0.1471, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 3.7439999999999998, | |
| "grad_norm": 0.18866540491580963, | |
| "learning_rate": 4.273092369477912e-05, | |
| "loss": 0.1464, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 3.776, | |
| "grad_norm": 0.16808009147644043, | |
| "learning_rate": 4.2409638554216875e-05, | |
| "loss": 0.1469, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 3.808, | |
| "grad_norm": 0.20790338516235352, | |
| "learning_rate": 4.208835341365462e-05, | |
| "loss": 0.1455, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 0.20283524692058563, | |
| "learning_rate": 4.1767068273092375e-05, | |
| "loss": 0.1453, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.872, | |
| "grad_norm": 0.18522211909294128, | |
| "learning_rate": 4.1445783132530125e-05, | |
| "loss": 0.1449, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 3.904, | |
| "grad_norm": 0.2298567146062851, | |
| "learning_rate": 4.112449799196787e-05, | |
| "loss": 0.1442, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.936, | |
| "grad_norm": 0.23237478733062744, | |
| "learning_rate": 4.0803212851405625e-05, | |
| "loss": 0.1445, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 3.968, | |
| "grad_norm": 0.18708902597427368, | |
| "learning_rate": 4.0481927710843375e-05, | |
| "loss": 0.1436, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.2235335260629654, | |
| "learning_rate": 4.016064257028113e-05, | |
| "loss": 0.145, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.08151204138994217, | |
| "eval_runtime": 13.1737, | |
| "eval_samples_per_second": 151.818, | |
| "eval_steps_per_second": 6.376, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 4.032, | |
| "grad_norm": 0.27586033940315247, | |
| "learning_rate": 3.9839357429718875e-05, | |
| "loss": 0.1361, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 4.064, | |
| "grad_norm": 0.20399342477321625, | |
| "learning_rate": 3.9518072289156625e-05, | |
| "loss": 0.1359, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 4.096, | |
| "grad_norm": 0.2167077660560608, | |
| "learning_rate": 3.919678714859438e-05, | |
| "loss": 0.1349, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 4.128, | |
| "grad_norm": 0.2217278927564621, | |
| "learning_rate": 3.887550200803213e-05, | |
| "loss": 0.1343, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 0.17411163449287415, | |
| "learning_rate": 3.855421686746988e-05, | |
| "loss": 0.1353, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 4.192, | |
| "grad_norm": 0.2262706607580185, | |
| "learning_rate": 3.823293172690763e-05, | |
| "loss": 0.1355, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 4.224, | |
| "grad_norm": 0.21051813662052155, | |
| "learning_rate": 3.791164658634539e-05, | |
| "loss": 0.1351, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 4.256, | |
| "grad_norm": 0.2202002853155136, | |
| "learning_rate": 3.759036144578314e-05, | |
| "loss": 0.1362, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 4.288, | |
| "grad_norm": 0.1532248854637146, | |
| "learning_rate": 3.726907630522089e-05, | |
| "loss": 0.1338, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 0.25249359011650085, | |
| "learning_rate": 3.694779116465864e-05, | |
| "loss": 0.1334, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 4.352, | |
| "grad_norm": 0.2392909973859787, | |
| "learning_rate": 3.662650602409639e-05, | |
| "loss": 0.1354, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 4.384, | |
| "grad_norm": 0.3180345892906189, | |
| "learning_rate": 3.630522088353414e-05, | |
| "loss": 0.1353, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 4.416, | |
| "grad_norm": 0.27343523502349854, | |
| "learning_rate": 3.598393574297189e-05, | |
| "loss": 0.1354, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 4.448, | |
| "grad_norm": 0.17806372046470642, | |
| "learning_rate": 3.566265060240964e-05, | |
| "loss": 0.1351, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 0.17694541811943054, | |
| "learning_rate": 3.5341365461847396e-05, | |
| "loss": 0.135, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 4.5120000000000005, | |
| "grad_norm": 0.1796264797449112, | |
| "learning_rate": 3.5020080321285146e-05, | |
| "loss": 0.1326, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 4.5440000000000005, | |
| "grad_norm": 0.16896295547485352, | |
| "learning_rate": 3.4698795180722896e-05, | |
| "loss": 0.1327, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 4.576, | |
| "grad_norm": 0.16427506506443024, | |
| "learning_rate": 3.4377510040160646e-05, | |
| "loss": 0.1341, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 4.608, | |
| "grad_norm": 0.15947696566581726, | |
| "learning_rate": 3.4056224899598396e-05, | |
| "loss": 0.1336, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 0.24063943326473236, | |
| "learning_rate": 3.373493975903615e-05, | |
| "loss": 0.1326, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 4.672, | |
| "grad_norm": 0.2784833610057831, | |
| "learning_rate": 3.3413654618473896e-05, | |
| "loss": 0.1313, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 4.704, | |
| "grad_norm": 0.1624738872051239, | |
| "learning_rate": 3.3092369477911646e-05, | |
| "loss": 0.1343, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 4.736, | |
| "grad_norm": 0.163747176527977, | |
| "learning_rate": 3.27710843373494e-05, | |
| "loss": 0.1332, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 4.768, | |
| "grad_norm": 0.19286634027957916, | |
| "learning_rate": 3.244979919678715e-05, | |
| "loss": 0.1314, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 0.2070024311542511, | |
| "learning_rate": 3.21285140562249e-05, | |
| "loss": 0.1323, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "eval_loss": 0.07321055233478546, | |
| "eval_runtime": 13.3287, | |
| "eval_samples_per_second": 150.052, | |
| "eval_steps_per_second": 6.302, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 4.832, | |
| "grad_norm": 0.25665566325187683, | |
| "learning_rate": 3.180722891566265e-05, | |
| "loss": 0.133, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 4.864, | |
| "grad_norm": 0.18933174014091492, | |
| "learning_rate": 3.14859437751004e-05, | |
| "loss": 0.1334, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 4.896, | |
| "grad_norm": 0.20106372237205505, | |
| "learning_rate": 3.116465863453816e-05, | |
| "loss": 0.1318, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 4.928, | |
| "grad_norm": 0.23300665616989136, | |
| "learning_rate": 3.084337349397591e-05, | |
| "loss": 0.1314, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 0.2776864171028137, | |
| "learning_rate": 3.052208835341366e-05, | |
| "loss": 0.1307, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 4.992, | |
| "grad_norm": 0.17158937454223633, | |
| "learning_rate": 3.0200803212851406e-05, | |
| "loss": 0.1328, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 5.024, | |
| "grad_norm": 0.19341541826725006, | |
| "learning_rate": 2.9879518072289156e-05, | |
| "loss": 0.1255, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 5.056, | |
| "grad_norm": 0.1820104718208313, | |
| "learning_rate": 2.955823293172691e-05, | |
| "loss": 0.1239, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 5.088, | |
| "grad_norm": 0.19204023480415344, | |
| "learning_rate": 2.923694779116466e-05, | |
| "loss": 0.1251, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 5.12, | |
| "grad_norm": 0.20374265313148499, | |
| "learning_rate": 2.8915662650602413e-05, | |
| "loss": 0.1238, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 5.152, | |
| "grad_norm": 0.17607811093330383, | |
| "learning_rate": 2.8594377510040163e-05, | |
| "loss": 0.1242, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 5.184, | |
| "grad_norm": 0.18394358456134796, | |
| "learning_rate": 2.8273092369477913e-05, | |
| "loss": 0.1246, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 5.216, | |
| "grad_norm": 0.1720678061246872, | |
| "learning_rate": 2.7951807228915666e-05, | |
| "loss": 0.1234, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 5.248, | |
| "grad_norm": 0.1649816781282425, | |
| "learning_rate": 2.7630522088353417e-05, | |
| "loss": 0.1246, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "grad_norm": 0.24089868366718292, | |
| "learning_rate": 2.730923694779117e-05, | |
| "loss": 0.1236, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 5.312, | |
| "grad_norm": 0.16703809797763824, | |
| "learning_rate": 2.6987951807228917e-05, | |
| "loss": 0.1235, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 5.344, | |
| "grad_norm": 0.2375577837228775, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 0.1255, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 5.376, | |
| "grad_norm": 0.20844422280788422, | |
| "learning_rate": 2.634538152610442e-05, | |
| "loss": 0.124, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 5.408, | |
| "grad_norm": 0.15090999007225037, | |
| "learning_rate": 2.602409638554217e-05, | |
| "loss": 0.1236, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "grad_norm": 0.1982196420431137, | |
| "learning_rate": 2.570281124497992e-05, | |
| "loss": 0.1234, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 5.4719999999999995, | |
| "grad_norm": 0.20548874139785767, | |
| "learning_rate": 2.5381526104417673e-05, | |
| "loss": 0.1237, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 5.504, | |
| "grad_norm": 0.1553628295660019, | |
| "learning_rate": 2.5060240963855423e-05, | |
| "loss": 0.1241, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 5.536, | |
| "grad_norm": 0.20827996730804443, | |
| "learning_rate": 2.4738955823293177e-05, | |
| "loss": 0.1236, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 5.568, | |
| "grad_norm": 0.26525431871414185, | |
| "learning_rate": 2.4417670682730927e-05, | |
| "loss": 0.1244, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 0.16824448108673096, | |
| "learning_rate": 2.409638554216868e-05, | |
| "loss": 0.124, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "eval_loss": 0.06600421667098999, | |
| "eval_runtime": 13.2001, | |
| "eval_samples_per_second": 151.514, | |
| "eval_steps_per_second": 6.364, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 5.632, | |
| "grad_norm": 0.19588659703731537, | |
| "learning_rate": 2.3775100401606427e-05, | |
| "loss": 0.1239, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 5.664, | |
| "grad_norm": 0.20829927921295166, | |
| "learning_rate": 2.3453815261044177e-05, | |
| "loss": 0.1236, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 5.696, | |
| "grad_norm": 0.17067208886146545, | |
| "learning_rate": 2.3132530120481927e-05, | |
| "loss": 0.1246, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 5.728, | |
| "grad_norm": 0.17796407639980316, | |
| "learning_rate": 2.281124497991968e-05, | |
| "loss": 0.123, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "grad_norm": 0.17811580002307892, | |
| "learning_rate": 2.248995983935743e-05, | |
| "loss": 0.1247, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 5.792, | |
| "grad_norm": 0.16586844623088837, | |
| "learning_rate": 2.2168674698795184e-05, | |
| "loss": 0.1226, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 5.824, | |
| "grad_norm": 0.16634885966777802, | |
| "learning_rate": 2.1847389558232934e-05, | |
| "loss": 0.1239, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 5.856, | |
| "grad_norm": 0.1891159564256668, | |
| "learning_rate": 2.1526104417670687e-05, | |
| "loss": 0.1234, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 5.888, | |
| "grad_norm": 0.2127494215965271, | |
| "learning_rate": 2.1204819277108437e-05, | |
| "loss": 0.1227, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "grad_norm": 0.15109600126743317, | |
| "learning_rate": 2.0883534136546187e-05, | |
| "loss": 0.1218, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 5.952, | |
| "grad_norm": 0.14382487535476685, | |
| "learning_rate": 2.0562248995983934e-05, | |
| "loss": 0.1216, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 5.984, | |
| "grad_norm": 0.2122729867696762, | |
| "learning_rate": 2.0240963855421687e-05, | |
| "loss": 0.1248, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 6.016, | |
| "grad_norm": 0.19725599884986877, | |
| "learning_rate": 1.9919678714859437e-05, | |
| "loss": 0.12, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 6.048, | |
| "grad_norm": 0.18302911520004272, | |
| "learning_rate": 1.959839357429719e-05, | |
| "loss": 0.1174, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 0.19952338933944702, | |
| "learning_rate": 1.927710843373494e-05, | |
| "loss": 0.1176, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 6.112, | |
| "grad_norm": 0.16824807226657867, | |
| "learning_rate": 1.8955823293172694e-05, | |
| "loss": 0.1174, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 6.144, | |
| "grad_norm": 0.22110256552696228, | |
| "learning_rate": 1.8634538152610444e-05, | |
| "loss": 0.1172, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 6.176, | |
| "grad_norm": 0.20097705721855164, | |
| "learning_rate": 1.8313253012048194e-05, | |
| "loss": 0.1171, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 6.208, | |
| "grad_norm": 0.1569439023733139, | |
| "learning_rate": 1.7991967871485944e-05, | |
| "loss": 0.1177, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "grad_norm": 0.1618974804878235, | |
| "learning_rate": 1.7670682730923698e-05, | |
| "loss": 0.1169, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 6.272, | |
| "grad_norm": 0.2146083265542984, | |
| "learning_rate": 1.7349397590361448e-05, | |
| "loss": 0.1172, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 6.304, | |
| "grad_norm": 0.16460789740085602, | |
| "learning_rate": 1.7028112449799198e-05, | |
| "loss": 0.1166, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 6.336, | |
| "grad_norm": 0.20428918302059174, | |
| "learning_rate": 1.6706827309236948e-05, | |
| "loss": 0.1177, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 6.368, | |
| "grad_norm": 0.1996203064918518, | |
| "learning_rate": 1.63855421686747e-05, | |
| "loss": 0.1172, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 0.22684213519096375, | |
| "learning_rate": 1.606425702811245e-05, | |
| "loss": 0.118, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "eval_loss": 0.061658285558223724, | |
| "eval_runtime": 13.1578, | |
| "eval_samples_per_second": 152.001, | |
| "eval_steps_per_second": 6.384, | |
| "step": 100000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 125000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 12500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.459712365560463e+18, | |
| "train_batch_size": 24, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |