| { | |
| "best_metric": 0.15788726458515429, | |
| "best_model_checkpoint": "checkpoints/checkpoint-8800", | |
| "epoch": 4.637143519591931, | |
| "eval_steps": 50, | |
| "global_step": 10000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.023185717597959656, | |
| "grad_norm": 0.16052097082138062, | |
| "learning_rate": 5e-05, | |
| "loss": 0.6225, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.023185717597959656, | |
| "eval_loss": 0.1987911110084725, | |
| "eval_runtime": 63.5433, | |
| "eval_samples_per_second": 654.451, | |
| "eval_steps_per_second": 0.645, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.04637143519591931, | |
| "grad_norm": 0.09532159566879272, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1508, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04637143519591931, | |
| "eval_loss": 0.18357936446787168, | |
| "eval_runtime": 60.9844, | |
| "eval_samples_per_second": 681.912, | |
| "eval_steps_per_second": 0.672, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.06955715279387897, | |
| "grad_norm": 0.24056212604045868, | |
| "learning_rate": 9.999370638369377e-05, | |
| "loss": 0.1449, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.06955715279387897, | |
| "eval_loss": 0.17892658896642444, | |
| "eval_runtime": 60.7834, | |
| "eval_samples_per_second": 684.167, | |
| "eval_steps_per_second": 0.675, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.09274287039183862, | |
| "grad_norm": 0.09350813180208206, | |
| "learning_rate": 9.997482711915927e-05, | |
| "loss": 0.1421, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.09274287039183862, | |
| "eval_loss": 0.17624869869175752, | |
| "eval_runtime": 60.3826, | |
| "eval_samples_per_second": 688.708, | |
| "eval_steps_per_second": 0.679, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11592858798979828, | |
| "grad_norm": 0.12230529636144638, | |
| "learning_rate": 9.99433669591504e-05, | |
| "loss": 0.141, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.11592858798979828, | |
| "eval_loss": 0.17641382363047173, | |
| "eval_runtime": 60.4169, | |
| "eval_samples_per_second": 688.317, | |
| "eval_steps_per_second": 0.679, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.13911430558775795, | |
| "grad_norm": 0.14592748880386353, | |
| "learning_rate": 9.989933382359422e-05, | |
| "loss": 0.1397, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.13911430558775795, | |
| "eval_loss": 0.17552215792639078, | |
| "eval_runtime": 61.6101, | |
| "eval_samples_per_second": 674.987, | |
| "eval_steps_per_second": 0.665, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1623000231857176, | |
| "grad_norm": 0.10219988226890564, | |
| "learning_rate": 9.984273879759713e-05, | |
| "loss": 0.1393, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1623000231857176, | |
| "eval_loss": 0.17414749172793012, | |
| "eval_runtime": 61.4962, | |
| "eval_samples_per_second": 676.237, | |
| "eval_steps_per_second": 0.667, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.18548574078367724, | |
| "grad_norm": 0.12338168174028397, | |
| "learning_rate": 9.977359612865423e-05, | |
| "loss": 0.1388, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.18548574078367724, | |
| "eval_loss": 0.17378012638412807, | |
| "eval_runtime": 61.0462, | |
| "eval_samples_per_second": 681.221, | |
| "eval_steps_per_second": 0.672, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.20867145838163692, | |
| "grad_norm": 0.09479879587888718, | |
| "learning_rate": 9.969192322306271e-05, | |
| "loss": 0.1394, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.20867145838163692, | |
| "eval_loss": 0.17252362204688398, | |
| "eval_runtime": 60.963, | |
| "eval_samples_per_second": 682.151, | |
| "eval_steps_per_second": 0.673, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.23185717597959657, | |
| "grad_norm": 0.1108623668551445, | |
| "learning_rate": 9.959774064153977e-05, | |
| "loss": 0.1383, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.23185717597959657, | |
| "eval_loss": 0.17298176916843877, | |
| "eval_runtime": 60.6546, | |
| "eval_samples_per_second": 685.62, | |
| "eval_steps_per_second": 0.676, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2550428935775562, | |
| "grad_norm": 0.0725204199552536, | |
| "learning_rate": 9.949107209404665e-05, | |
| "loss": 0.1376, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.2550428935775562, | |
| "eval_loss": 0.17165218165539878, | |
| "eval_runtime": 59.4888, | |
| "eval_samples_per_second": 699.056, | |
| "eval_steps_per_second": 0.689, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.2782286111755159, | |
| "grad_norm": 0.0955963134765625, | |
| "learning_rate": 9.937194443381972e-05, | |
| "loss": 0.1372, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.2782286111755159, | |
| "eval_loss": 0.17077083113718278, | |
| "eval_runtime": 60.6021, | |
| "eval_samples_per_second": 686.214, | |
| "eval_steps_per_second": 0.677, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3014143287734755, | |
| "grad_norm": 0.18736732006072998, | |
| "learning_rate": 9.924038765061042e-05, | |
| "loss": 0.1361, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3014143287734755, | |
| "eval_loss": 0.1727738813183492, | |
| "eval_runtime": 60.3343, | |
| "eval_samples_per_second": 689.259, | |
| "eval_steps_per_second": 0.68, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3246000463714352, | |
| "grad_norm": 0.09572151303291321, | |
| "learning_rate": 9.909643486313533e-05, | |
| "loss": 0.1362, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.3246000463714352, | |
| "eval_loss": 0.17145407115151273, | |
| "eval_runtime": 60.2732, | |
| "eval_samples_per_second": 689.959, | |
| "eval_steps_per_second": 0.68, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.34778576396939487, | |
| "grad_norm": 0.07214252650737762, | |
| "learning_rate": 9.894012231073894e-05, | |
| "loss": 0.1364, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.34778576396939487, | |
| "eval_loss": 0.17133199408489355, | |
| "eval_runtime": 60.0148, | |
| "eval_samples_per_second": 692.929, | |
| "eval_steps_per_second": 0.683, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.3709714815673545, | |
| "grad_norm": 0.18224318325519562, | |
| "learning_rate": 9.877148934427037e-05, | |
| "loss": 0.1356, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.3709714815673545, | |
| "eval_loss": 0.16949569222888886, | |
| "eval_runtime": 60.1491, | |
| "eval_samples_per_second": 691.382, | |
| "eval_steps_per_second": 0.682, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.39415719916531416, | |
| "grad_norm": 0.06306415796279907, | |
| "learning_rate": 9.859057841617709e-05, | |
| "loss": 0.1353, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.39415719916531416, | |
| "eval_loss": 0.1686952690798172, | |
| "eval_runtime": 60.6447, | |
| "eval_samples_per_second": 685.731, | |
| "eval_steps_per_second": 0.676, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.41734291676327384, | |
| "grad_norm": 0.10090287029743195, | |
| "learning_rate": 9.839743506981782e-05, | |
| "loss": 0.1361, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.41734291676327384, | |
| "eval_loss": 0.17026100034088926, | |
| "eval_runtime": 61.5224, | |
| "eval_samples_per_second": 675.949, | |
| "eval_steps_per_second": 0.666, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.44052863436123346, | |
| "grad_norm": 0.10061236470937729, | |
| "learning_rate": 9.819210792799712e-05, | |
| "loss": 0.1354, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.44052863436123346, | |
| "eval_loss": 0.16971544565694113, | |
| "eval_runtime": 60.488, | |
| "eval_samples_per_second": 687.508, | |
| "eval_steps_per_second": 0.678, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.46371435195919314, | |
| "grad_norm": 0.06525534391403198, | |
| "learning_rate": 9.797464868072488e-05, | |
| "loss": 0.1352, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.46371435195919314, | |
| "eval_loss": 0.16946903195393553, | |
| "eval_runtime": 61.3558, | |
| "eval_samples_per_second": 677.784, | |
| "eval_steps_per_second": 0.668, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.4869000695571528, | |
| "grad_norm": 0.06269507855176926, | |
| "learning_rate": 9.77451120722037e-05, | |
| "loss": 0.1335, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.4869000695571528, | |
| "eval_loss": 0.16825352947444114, | |
| "eval_runtime": 60.1971, | |
| "eval_samples_per_second": 690.831, | |
| "eval_steps_per_second": 0.681, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5100857871551124, | |
| "grad_norm": 0.08187470585107803, | |
| "learning_rate": 9.750355588704727e-05, | |
| "loss": 0.1327, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.5100857871551124, | |
| "eval_loss": 0.16861737282523587, | |
| "eval_runtime": 59.5715, | |
| "eval_samples_per_second": 698.085, | |
| "eval_steps_per_second": 0.688, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.5332715047530721, | |
| "grad_norm": 0.06607680767774582, | |
| "learning_rate": 9.725004093573342e-05, | |
| "loss": 0.1337, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.5332715047530721, | |
| "eval_loss": 0.1692070748762034, | |
| "eval_runtime": 60.0637, | |
| "eval_samples_per_second": 692.364, | |
| "eval_steps_per_second": 0.683, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.5564572223510318, | |
| "grad_norm": 0.09759815782308578, | |
| "learning_rate": 9.698463103929542e-05, | |
| "loss": 0.134, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5564572223510318, | |
| "eval_loss": 0.16649228385381692, | |
| "eval_runtime": 60.4305, | |
| "eval_samples_per_second": 688.162, | |
| "eval_steps_per_second": 0.678, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5796429399489914, | |
| "grad_norm": 0.10353852063417435, | |
| "learning_rate": 9.670739301325534e-05, | |
| "loss": 0.1341, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.5796429399489914, | |
| "eval_loss": 0.16802514322459206, | |
| "eval_runtime": 60.0955, | |
| "eval_samples_per_second": 691.999, | |
| "eval_steps_per_second": 0.682, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.602828657546951, | |
| "grad_norm": 0.11834366619586945, | |
| "learning_rate": 9.641839665080363e-05, | |
| "loss": 0.1347, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.602828657546951, | |
| "eval_loss": 0.1672302417427292, | |
| "eval_runtime": 60.3484, | |
| "eval_samples_per_second": 689.098, | |
| "eval_steps_per_second": 0.679, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.6260143751449108, | |
| "grad_norm": 0.06963012367486954, | |
| "learning_rate": 9.611771470522908e-05, | |
| "loss": 0.1335, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.6260143751449108, | |
| "eval_loss": 0.16607839684977216, | |
| "eval_runtime": 60.3308, | |
| "eval_samples_per_second": 689.3, | |
| "eval_steps_per_second": 0.68, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.6492000927428704, | |
| "grad_norm": 0.06842990219593048, | |
| "learning_rate": 9.580542287160348e-05, | |
| "loss": 0.1338, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6492000927428704, | |
| "eval_loss": 0.16628812684035693, | |
| "eval_runtime": 59.9335, | |
| "eval_samples_per_second": 693.87, | |
| "eval_steps_per_second": 0.684, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.67238581034083, | |
| "grad_norm": 0.07053674757480621, | |
| "learning_rate": 9.548159976772592e-05, | |
| "loss": 0.1335, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.67238581034083, | |
| "eval_loss": 0.16696060882262428, | |
| "eval_runtime": 59.8079, | |
| "eval_samples_per_second": 695.326, | |
| "eval_steps_per_second": 0.686, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.6955715279387897, | |
| "grad_norm": 0.09175281971693039, | |
| "learning_rate": 9.514632691433107e-05, | |
| "loss": 0.1332, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6955715279387897, | |
| "eval_loss": 0.16521949465081834, | |
| "eval_runtime": 60.1856, | |
| "eval_samples_per_second": 690.963, | |
| "eval_steps_per_second": 0.681, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.7187572455367494, | |
| "grad_norm": 0.05836635082960129, | |
| "learning_rate": 9.479968871456679e-05, | |
| "loss": 0.1336, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.7187572455367494, | |
| "eval_loss": 0.16626366255041727, | |
| "eval_runtime": 60.6256, | |
| "eval_samples_per_second": 685.948, | |
| "eval_steps_per_second": 0.676, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.741942963134709, | |
| "grad_norm": 0.07249301671981812, | |
| "learning_rate": 9.444177243274618e-05, | |
| "loss": 0.133, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.741942963134709, | |
| "eval_loss": 0.1655649439629329, | |
| "eval_runtime": 60.2447, | |
| "eval_samples_per_second": 690.285, | |
| "eval_steps_per_second": 0.681, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7651286807326687, | |
| "grad_norm": 0.07509302347898483, | |
| "learning_rate": 9.407266817237911e-05, | |
| "loss": 0.1332, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.7651286807326687, | |
| "eval_loss": 0.16605371203296967, | |
| "eval_runtime": 59.8196, | |
| "eval_samples_per_second": 695.191, | |
| "eval_steps_per_second": 0.685, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.7883143983306283, | |
| "grad_norm": 0.07540406286716461, | |
| "learning_rate": 9.369246885348926e-05, | |
| "loss": 0.1327, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7883143983306283, | |
| "eval_loss": 0.16555590021301406, | |
| "eval_runtime": 60.4119, | |
| "eval_samples_per_second": 688.374, | |
| "eval_steps_per_second": 0.679, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.811500115928588, | |
| "grad_norm": 0.06061087176203728, | |
| "learning_rate": 9.330127018922194e-05, | |
| "loss": 0.1318, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.811500115928588, | |
| "eval_loss": 0.16623179673527624, | |
| "eval_runtime": 59.7807, | |
| "eval_samples_per_second": 695.643, | |
| "eval_steps_per_second": 0.686, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.8346858335265477, | |
| "grad_norm": 0.05577518790960312, | |
| "learning_rate": 9.289917066174886e-05, | |
| "loss": 0.1319, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.8346858335265477, | |
| "eval_loss": 0.16519989030959317, | |
| "eval_runtime": 60.1508, | |
| "eval_samples_per_second": 691.363, | |
| "eval_steps_per_second": 0.682, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.8578715511245073, | |
| "grad_norm": 0.06929640471935272, | |
| "learning_rate": 9.248627149747573e-05, | |
| "loss": 0.1337, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.8578715511245073, | |
| "eval_loss": 0.16394849849125304, | |
| "eval_runtime": 60.1044, | |
| "eval_samples_per_second": 691.896, | |
| "eval_steps_per_second": 0.682, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.8810572687224669, | |
| "grad_norm": 0.07941466569900513, | |
| "learning_rate": 9.206267664155907e-05, | |
| "loss": 0.1324, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.8810572687224669, | |
| "eval_loss": 0.1648257591054525, | |
| "eval_runtime": 59.9818, | |
| "eval_samples_per_second": 693.31, | |
| "eval_steps_per_second": 0.684, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.9042429863204267, | |
| "grad_norm": 0.09700328856706619, | |
| "learning_rate": 9.162849273173857e-05, | |
| "loss": 0.1334, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.9042429863204267, | |
| "eval_loss": 0.16508159820082235, | |
| "eval_runtime": 60.0956, | |
| "eval_samples_per_second": 691.997, | |
| "eval_steps_per_second": 0.682, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.9274287039183863, | |
| "grad_norm": 0.09397923946380615, | |
| "learning_rate": 9.118382907149165e-05, | |
| "loss": 0.1317, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.9274287039183863, | |
| "eval_loss": 0.16377645660046958, | |
| "eval_runtime": 60.471, | |
| "eval_samples_per_second": 687.701, | |
| "eval_steps_per_second": 0.678, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.9506144215163459, | |
| "grad_norm": 0.08097202330827713, | |
| "learning_rate": 9.072879760251679e-05, | |
| "loss": 0.1324, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.9506144215163459, | |
| "eval_loss": 0.16491611914973717, | |
| "eval_runtime": 60.6678, | |
| "eval_samples_per_second": 685.471, | |
| "eval_steps_per_second": 0.676, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.9738001391143056, | |
| "grad_norm": 0.08455361425876617, | |
| "learning_rate": 9.026351287655294e-05, | |
| "loss": 0.1326, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9738001391143056, | |
| "eval_loss": 0.16602741997032858, | |
| "eval_runtime": 60.5593, | |
| "eval_samples_per_second": 686.698, | |
| "eval_steps_per_second": 0.677, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9969858567122653, | |
| "grad_norm": 0.056316621601581573, | |
| "learning_rate": 8.978809202654162e-05, | |
| "loss": 0.1326, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.9969858567122653, | |
| "eval_loss": 0.1640188218462461, | |
| "eval_runtime": 60.9228, | |
| "eval_samples_per_second": 682.602, | |
| "eval_steps_per_second": 0.673, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.0201715743102249, | |
| "grad_norm": 0.06686601787805557, | |
| "learning_rate": 8.930265473713938e-05, | |
| "loss": 0.132, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.0201715743102249, | |
| "eval_loss": 0.1652621257294944, | |
| "eval_runtime": 60.883, | |
| "eval_samples_per_second": 683.048, | |
| "eval_steps_per_second": 0.673, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.0433572919081846, | |
| "grad_norm": 0.040202509611845016, | |
| "learning_rate": 8.880732321458784e-05, | |
| "loss": 0.1319, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.0433572919081846, | |
| "eval_loss": 0.1655291575008717, | |
| "eval_runtime": 60.3109, | |
| "eval_samples_per_second": 689.527, | |
| "eval_steps_per_second": 0.68, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.0665430095061441, | |
| "grad_norm": 0.0656428411602974, | |
| "learning_rate": 8.83022221559489e-05, | |
| "loss": 0.1326, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.0665430095061441, | |
| "eval_loss": 0.16431572407087935, | |
| "eval_runtime": 60.1036, | |
| "eval_samples_per_second": 691.906, | |
| "eval_steps_per_second": 0.682, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.0897287271041038, | |
| "grad_norm": 0.06945247948169708, | |
| "learning_rate": 8.778747871771292e-05, | |
| "loss": 0.1321, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.0897287271041038, | |
| "eval_loss": 0.16585482329987242, | |
| "eval_runtime": 60.6263, | |
| "eval_samples_per_second": 685.94, | |
| "eval_steps_per_second": 0.676, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.1129144447020636, | |
| "grad_norm": 0.0523492731153965, | |
| "learning_rate": 8.726322248378775e-05, | |
| "loss": 0.1317, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.1129144447020636, | |
| "eval_loss": 0.16438524923736036, | |
| "eval_runtime": 60.317, | |
| "eval_samples_per_second": 689.457, | |
| "eval_steps_per_second": 0.68, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.136100162300023, | |
| "grad_norm": 0.07777334004640579, | |
| "learning_rate": 8.672958543287666e-05, | |
| "loss": 0.1322, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.136100162300023, | |
| "eval_loss": 0.16509696053644565, | |
| "eval_runtime": 60.26, | |
| "eval_samples_per_second": 690.109, | |
| "eval_steps_per_second": 0.68, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.1592858798979828, | |
| "grad_norm": 0.06430637836456299, | |
| "learning_rate": 8.618670190525352e-05, | |
| "loss": 0.1325, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.1592858798979828, | |
| "eval_loss": 0.1639541608445008, | |
| "eval_runtime": 60.5314, | |
| "eval_samples_per_second": 687.015, | |
| "eval_steps_per_second": 0.677, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.1824715974959426, | |
| "grad_norm": 0.11194106936454773, | |
| "learning_rate": 8.563470856894316e-05, | |
| "loss": 0.1311, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.1824715974959426, | |
| "eval_loss": 0.16260699934317355, | |
| "eval_runtime": 60.3659, | |
| "eval_samples_per_second": 688.899, | |
| "eval_steps_per_second": 0.679, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.205657315093902, | |
| "grad_norm": 0.06165901944041252, | |
| "learning_rate": 8.507374438531607e-05, | |
| "loss": 0.1323, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.205657315093902, | |
| "eval_loss": 0.1626319663130242, | |
| "eval_runtime": 59.9516, | |
| "eval_samples_per_second": 693.66, | |
| "eval_steps_per_second": 0.684, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.2288430326918618, | |
| "grad_norm": 0.10654885321855545, | |
| "learning_rate": 8.450395057410561e-05, | |
| "loss": 0.1316, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.2288430326918618, | |
| "eval_loss": 0.16393000041041636, | |
| "eval_runtime": 59.576, | |
| "eval_samples_per_second": 698.032, | |
| "eval_steps_per_second": 0.688, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.2520287502898215, | |
| "grad_norm": 0.04848140478134155, | |
| "learning_rate": 8.392547057785661e-05, | |
| "loss": 0.1314, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.2520287502898215, | |
| "eval_loss": 0.16348152455768114, | |
| "eval_runtime": 60.098, | |
| "eval_samples_per_second": 691.97, | |
| "eval_steps_per_second": 0.682, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.275214467887781, | |
| "grad_norm": 0.0573604516685009, | |
| "learning_rate": 8.333845002581458e-05, | |
| "loss": 0.1314, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.275214467887781, | |
| "eval_loss": 0.16364089140116167, | |
| "eval_runtime": 60.1364, | |
| "eval_samples_per_second": 691.528, | |
| "eval_steps_per_second": 0.682, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.2984001854857408, | |
| "grad_norm": 0.053159259259700775, | |
| "learning_rate": 8.274303669726426e-05, | |
| "loss": 0.131, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.2984001854857408, | |
| "eval_loss": 0.16257415365129801, | |
| "eval_runtime": 60.0025, | |
| "eval_samples_per_second": 693.071, | |
| "eval_steps_per_second": 0.683, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.3215859030837005, | |
| "grad_norm": 0.09136148542165756, | |
| "learning_rate": 8.213938048432697e-05, | |
| "loss": 0.1313, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.3215859030837005, | |
| "eval_loss": 0.16324665471619784, | |
| "eval_runtime": 59.8429, | |
| "eval_samples_per_second": 694.92, | |
| "eval_steps_per_second": 0.685, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.34477162068166, | |
| "grad_norm": 0.05825324356555939, | |
| "learning_rate": 8.152763335422613e-05, | |
| "loss": 0.1312, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.34477162068166, | |
| "eval_loss": 0.16367374608121235, | |
| "eval_runtime": 60.219, | |
| "eval_samples_per_second": 690.579, | |
| "eval_steps_per_second": 0.681, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.3679573382796197, | |
| "grad_norm": 0.06379790604114532, | |
| "learning_rate": 8.090794931103026e-05, | |
| "loss": 0.1317, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.3679573382796197, | |
| "eval_loss": 0.16400733758786312, | |
| "eval_runtime": 59.9641, | |
| "eval_samples_per_second": 693.515, | |
| "eval_steps_per_second": 0.684, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.3911430558775795, | |
| "grad_norm": 0.05361103266477585, | |
| "learning_rate": 8.028048435688333e-05, | |
| "loss": 0.1311, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.3911430558775795, | |
| "eval_loss": 0.16210626991928834, | |
| "eval_runtime": 59.5858, | |
| "eval_samples_per_second": 697.919, | |
| "eval_steps_per_second": 0.688, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.414328773475539, | |
| "grad_norm": 0.04593402519822121, | |
| "learning_rate": 7.964539645273204e-05, | |
| "loss": 0.1304, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.414328773475539, | |
| "eval_loss": 0.163067463275087, | |
| "eval_runtime": 60.098, | |
| "eval_samples_per_second": 691.97, | |
| "eval_steps_per_second": 0.682, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.4375144910734987, | |
| "grad_norm": 0.057480327785015106, | |
| "learning_rate": 7.900284547855991e-05, | |
| "loss": 0.1307, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.4375144910734987, | |
| "eval_loss": 0.16243572043734797, | |
| "eval_runtime": 59.5674, | |
| "eval_samples_per_second": 698.133, | |
| "eval_steps_per_second": 0.688, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.4607002086714584, | |
| "grad_norm": 0.08223798871040344, | |
| "learning_rate": 7.835299319313853e-05, | |
| "loss": 0.1315, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.4607002086714584, | |
| "eval_loss": 0.1641944734489707, | |
| "eval_runtime": 59.5423, | |
| "eval_samples_per_second": 698.428, | |
| "eval_steps_per_second": 0.689, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.483885926269418, | |
| "grad_norm": 0.09742949903011322, | |
| "learning_rate": 7.769600319330552e-05, | |
| "loss": 0.1303, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.483885926269418, | |
| "eval_loss": 0.16355698856626613, | |
| "eval_runtime": 60.1946, | |
| "eval_samples_per_second": 690.859, | |
| "eval_steps_per_second": 0.681, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.5070716438673777, | |
| "grad_norm": 0.06401767581701279, | |
| "learning_rate": 7.703204087277988e-05, | |
| "loss": 0.1315, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.5070716438673777, | |
| "eval_loss": 0.16215006705140952, | |
| "eval_runtime": 59.7822, | |
| "eval_samples_per_second": 695.625, | |
| "eval_steps_per_second": 0.686, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.5302573614653374, | |
| "grad_norm": 0.07916898280382156, | |
| "learning_rate": 7.636127338052512e-05, | |
| "loss": 0.1315, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.5302573614653374, | |
| "eval_loss": 0.16288597734760557, | |
| "eval_runtime": 59.2757, | |
| "eval_samples_per_second": 701.57, | |
| "eval_steps_per_second": 0.692, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.553443079063297, | |
| "grad_norm": 0.06549016386270523, | |
| "learning_rate": 7.568386957867033e-05, | |
| "loss": 0.1303, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.553443079063297, | |
| "eval_loss": 0.16416664097655873, | |
| "eval_runtime": 59.84, | |
| "eval_samples_per_second": 694.953, | |
| "eval_steps_per_second": 0.685, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.5766287966612567, | |
| "grad_norm": 0.0709395632147789, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.1309, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.5766287966612567, | |
| "eval_loss": 0.16179194486424098, | |
| "eval_runtime": 59.8634, | |
| "eval_samples_per_second": 694.682, | |
| "eval_steps_per_second": 0.685, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.5998145142592164, | |
| "grad_norm": 0.05671363323926926, | |
| "learning_rate": 7.430983680502344e-05, | |
| "loss": 0.1307, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.5998145142592164, | |
| "eval_loss": 0.16309191886303373, | |
| "eval_runtime": 59.618, | |
| "eval_samples_per_second": 697.541, | |
| "eval_steps_per_second": 0.688, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.623000231857176, | |
| "grad_norm": 0.04889162629842758, | |
| "learning_rate": 7.361355373863414e-05, | |
| "loss": 0.1314, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.623000231857176, | |
| "eval_loss": 0.16290782983414598, | |
| "eval_runtime": 60.3904, | |
| "eval_samples_per_second": 688.619, | |
| "eval_steps_per_second": 0.679, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.6461859494551356, | |
| "grad_norm": 0.0970933735370636, | |
| "learning_rate": 7.291132608637052e-05, | |
| "loss": 0.1314, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.6461859494551356, | |
| "eval_loss": 0.16278222993823557, | |
| "eval_runtime": 59.8666, | |
| "eval_samples_per_second": 694.644, | |
| "eval_steps_per_second": 0.685, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.6693716670530954, | |
| "grad_norm": 0.056557025760412216, | |
| "learning_rate": 7.220333063028872e-05, | |
| "loss": 0.1312, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.6693716670530954, | |
| "eval_loss": 0.16313205291311117, | |
| "eval_runtime": 60.0092, | |
| "eval_samples_per_second": 692.993, | |
| "eval_steps_per_second": 0.683, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.6925573846510549, | |
| "grad_norm": 0.04870522394776344, | |
| "learning_rate": 7.148974560445859e-05, | |
| "loss": 0.1299, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.6925573846510549, | |
| "eval_loss": 0.1617941082289122, | |
| "eval_runtime": 60.1721, | |
| "eval_samples_per_second": 691.117, | |
| "eval_steps_per_second": 0.681, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.7157431022490146, | |
| "grad_norm": 0.0681833028793335, | |
| "learning_rate": 7.077075065009433e-05, | |
| "loss": 0.1304, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.7157431022490146, | |
| "eval_loss": 0.16243406602519425, | |
| "eval_runtime": 59.3626, | |
| "eval_samples_per_second": 700.542, | |
| "eval_steps_per_second": 0.691, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.7389288198469743, | |
| "grad_norm": 0.06506156921386719, | |
| "learning_rate": 7.004652677033068e-05, | |
| "loss": 0.1299, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.7389288198469743, | |
| "eval_loss": 0.16324780134312317, | |
| "eval_runtime": 59.6022, | |
| "eval_samples_per_second": 697.726, | |
| "eval_steps_per_second": 0.688, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.7621145374449338, | |
| "grad_norm": 0.06188170611858368, | |
| "learning_rate": 6.931725628465643e-05, | |
| "loss": 0.1309, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.7621145374449338, | |
| "eval_loss": 0.1623115342294882, | |
| "eval_runtime": 59.7694, | |
| "eval_samples_per_second": 695.774, | |
| "eval_steps_per_second": 0.686, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.7853002550428936, | |
| "grad_norm": 0.05675831064581871, | |
| "learning_rate": 6.858312278301637e-05, | |
| "loss": 0.1303, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.7853002550428936, | |
| "eval_loss": 0.1630547638293529, | |
| "eval_runtime": 59.779, | |
| "eval_samples_per_second": 695.662, | |
| "eval_steps_per_second": 0.686, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.8084859726408533, | |
| "grad_norm": 0.04727062210440636, | |
| "learning_rate": 6.784431107959359e-05, | |
| "loss": 0.1312, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.8084859726408533, | |
| "eval_loss": 0.1616409071893626, | |
| "eval_runtime": 59.6005, | |
| "eval_samples_per_second": 697.746, | |
| "eval_steps_per_second": 0.688, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.8316716902388128, | |
| "grad_norm": 0.06378892064094543, | |
| "learning_rate": 6.710100716628344e-05, | |
| "loss": 0.1303, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.8316716902388128, | |
| "eval_loss": 0.1622395658739077, | |
| "eval_runtime": 60.1499, | |
| "eval_samples_per_second": 691.373, | |
| "eval_steps_per_second": 0.682, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.8548574078367726, | |
| "grad_norm": 0.05470576509833336, | |
| "learning_rate": 6.635339816587109e-05, | |
| "loss": 0.1308, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.8548574078367726, | |
| "eval_loss": 0.16317236170181762, | |
| "eval_runtime": 60.014, | |
| "eval_samples_per_second": 692.939, | |
| "eval_steps_per_second": 0.683, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.8780431254347323, | |
| "grad_norm": 0.053886763751506805, | |
| "learning_rate": 6.560167228492436e-05, | |
| "loss": 0.1297, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.8780431254347323, | |
| "eval_loss": 0.16198886262197804, | |
| "eval_runtime": 60.8262, | |
| "eval_samples_per_second": 683.685, | |
| "eval_steps_per_second": 0.674, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.9012288430326918, | |
| "grad_norm": 0.054583676159381866, | |
| "learning_rate": 6.484601876641375e-05, | |
| "loss": 0.1301, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.9012288430326918, | |
| "eval_loss": 0.1616550050294764, | |
| "eval_runtime": 59.7779, | |
| "eval_samples_per_second": 695.675, | |
| "eval_steps_per_second": 0.686, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.9244145606306515, | |
| "grad_norm": 0.071171335875988, | |
| "learning_rate": 6.408662784207149e-05, | |
| "loss": 0.131, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.9244145606306515, | |
| "eval_loss": 0.15968682813566223, | |
| "eval_runtime": 60.227, | |
| "eval_samples_per_second": 690.487, | |
| "eval_steps_per_second": 0.681, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.9476002782286113, | |
| "grad_norm": 0.05775531381368637, | |
| "learning_rate": 6.332369068450174e-05, | |
| "loss": 0.1296, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.9476002782286113, | |
| "eval_loss": 0.16262199212265846, | |
| "eval_runtime": 60.3405, | |
| "eval_samples_per_second": 689.189, | |
| "eval_steps_per_second": 0.679, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.9707859958265708, | |
| "grad_norm": 0.06425776332616806, | |
| "learning_rate": 6.255739935905396e-05, | |
| "loss": 0.1299, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.9707859958265708, | |
| "eval_loss": 0.16324524366491053, | |
| "eval_runtime": 61.417, | |
| "eval_samples_per_second": 677.109, | |
| "eval_steps_per_second": 0.668, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.9939717134245305, | |
| "grad_norm": 0.045762140303850174, | |
| "learning_rate": 6.178794677547137e-05, | |
| "loss": 0.1299, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.9939717134245305, | |
| "eval_loss": 0.16053301797614244, | |
| "eval_runtime": 61.0801, | |
| "eval_samples_per_second": 680.844, | |
| "eval_steps_per_second": 0.671, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.0171574310224902, | |
| "grad_norm": 0.07060451060533524, | |
| "learning_rate": 6.1015526639327035e-05, | |
| "loss": 0.1296, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.0171574310224902, | |
| "eval_loss": 0.1620254674138633, | |
| "eval_runtime": 61.0829, | |
| "eval_samples_per_second": 680.812, | |
| "eval_steps_per_second": 0.671, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.0403431486204497, | |
| "grad_norm": 0.059919316321611404, | |
| "learning_rate": 6.024033340325954e-05, | |
| "loss": 0.1302, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.0403431486204497, | |
| "eval_loss": 0.16284223807997533, | |
| "eval_runtime": 61.5789, | |
| "eval_samples_per_second": 675.328, | |
| "eval_steps_per_second": 0.666, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.0635288662184093, | |
| "grad_norm": 0.07983385026454926, | |
| "learning_rate": 5.946256221802051e-05, | |
| "loss": 0.13, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.0635288662184093, | |
| "eval_loss": 0.16209282393788932, | |
| "eval_runtime": 61.6584, | |
| "eval_samples_per_second": 674.458, | |
| "eval_steps_per_second": 0.665, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.086714583816369, | |
| "grad_norm": 0.07582173496484756, | |
| "learning_rate": 5.868240888334653e-05, | |
| "loss": 0.1296, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.086714583816369, | |
| "eval_loss": 0.16158196377974565, | |
| "eval_runtime": 61.1826, | |
| "eval_samples_per_second": 679.703, | |
| "eval_steps_per_second": 0.67, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.1099003014143287, | |
| "grad_norm": 0.06049995869398117, | |
| "learning_rate": 5.79000697986675e-05, | |
| "loss": 0.1298, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.1099003014143287, | |
| "eval_loss": 0.16130609279963956, | |
| "eval_runtime": 61.0153, | |
| "eval_samples_per_second": 681.567, | |
| "eval_steps_per_second": 0.672, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.1330860190122882, | |
| "grad_norm": 0.0440148264169693, | |
| "learning_rate": 5.7115741913664264e-05, | |
| "loss": 0.1299, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.1330860190122882, | |
| "eval_loss": 0.16027799763638953, | |
| "eval_runtime": 61.1993, | |
| "eval_samples_per_second": 679.517, | |
| "eval_steps_per_second": 0.67, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.156271736610248, | |
| "grad_norm": 0.05254065990447998, | |
| "learning_rate": 5.6329622678687463e-05, | |
| "loss": 0.1299, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.156271736610248, | |
| "eval_loss": 0.16206274484291652, | |
| "eval_runtime": 61.4415, | |
| "eval_samples_per_second": 676.839, | |
| "eval_steps_per_second": 0.667, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.1794574542082077, | |
| "grad_norm": 0.06294432282447815, | |
| "learning_rate": 5.5541909995050554e-05, | |
| "loss": 0.1306, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.1794574542082077, | |
| "eval_loss": 0.16140170723024802, | |
| "eval_runtime": 60.8861, | |
| "eval_samples_per_second": 683.013, | |
| "eval_steps_per_second": 0.673, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.202643171806167, | |
| "grad_norm": 0.06710942089557648, | |
| "learning_rate": 5.475280216520913e-05, | |
| "loss": 0.1303, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.202643171806167, | |
| "eval_loss": 0.16245448075670843, | |
| "eval_runtime": 61.2839, | |
| "eval_samples_per_second": 678.58, | |
| "eval_steps_per_second": 0.669, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.225828889404127, | |
| "grad_norm": 0.05298132076859474, | |
| "learning_rate": 5.396249784283942e-05, | |
| "loss": 0.13, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.225828889404127, | |
| "eval_loss": 0.1623738898660767, | |
| "eval_runtime": 61.1531, | |
| "eval_samples_per_second": 680.031, | |
| "eval_steps_per_second": 0.67, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.2490146070020867, | |
| "grad_norm": 0.04066763445734978, | |
| "learning_rate": 5.317119598282823e-05, | |
| "loss": 0.1295, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.2490146070020867, | |
| "eval_loss": 0.1627438727811327, | |
| "eval_runtime": 61.0414, | |
| "eval_samples_per_second": 681.275, | |
| "eval_steps_per_second": 0.672, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.272200324600046, | |
| "grad_norm": 0.061821240931749344, | |
| "learning_rate": 5.2379095791187124e-05, | |
| "loss": 0.1299, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.272200324600046, | |
| "eval_loss": 0.16086717177928397, | |
| "eval_runtime": 60.7945, | |
| "eval_samples_per_second": 684.042, | |
| "eval_steps_per_second": 0.674, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.295386042198006, | |
| "grad_norm": 0.08038394153118134, | |
| "learning_rate": 5.158639667490339e-05, | |
| "loss": 0.13, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.295386042198006, | |
| "eval_loss": 0.16221664317086187, | |
| "eval_runtime": 61.6742, | |
| "eval_samples_per_second": 674.285, | |
| "eval_steps_per_second": 0.665, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.3185717597959656, | |
| "grad_norm": 0.0556926503777504, | |
| "learning_rate": 5.0793298191740404e-05, | |
| "loss": 0.1311, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.3185717597959656, | |
| "eval_loss": 0.16015339844546791, | |
| "eval_runtime": 61.3657, | |
| "eval_samples_per_second": 677.675, | |
| "eval_steps_per_second": 0.668, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.3417574773939256, | |
| "grad_norm": 0.06645477563142776, | |
| "learning_rate": 5e-05, | |
| "loss": 0.1284, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.3417574773939256, | |
| "eval_loss": 0.16160674186313023, | |
| "eval_runtime": 61.4737, | |
| "eval_samples_per_second": 676.484, | |
| "eval_steps_per_second": 0.667, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.364943194991885, | |
| "grad_norm": 0.05365500971674919, | |
| "learning_rate": 4.92067018082596e-05, | |
| "loss": 0.13, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.364943194991885, | |
| "eval_loss": 0.16016484459556096, | |
| "eval_runtime": 61.4058, | |
| "eval_samples_per_second": 677.232, | |
| "eval_steps_per_second": 0.668, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.3881289125898446, | |
| "grad_norm": 0.0499204620718956, | |
| "learning_rate": 4.841360332509663e-05, | |
| "loss": 0.129, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.3881289125898446, | |
| "eval_loss": 0.16054727464378063, | |
| "eval_runtime": 61.1539, | |
| "eval_samples_per_second": 680.023, | |
| "eval_steps_per_second": 0.67, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.411314630187804, | |
| "grad_norm": 0.07284457236528397, | |
| "learning_rate": 4.762090420881289e-05, | |
| "loss": 0.129, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.411314630187804, | |
| "eval_loss": 0.16057287778830004, | |
| "eval_runtime": 60.5785, | |
| "eval_samples_per_second": 686.481, | |
| "eval_steps_per_second": 0.677, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.434500347785764, | |
| "grad_norm": 0.06511891633272171, | |
| "learning_rate": 4.6828804017171776e-05, | |
| "loss": 0.1297, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 2.434500347785764, | |
| "eval_loss": 0.16202011190836896, | |
| "eval_runtime": 61.4053, | |
| "eval_samples_per_second": 677.238, | |
| "eval_steps_per_second": 0.668, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 2.4576860653837236, | |
| "grad_norm": 0.05936937406659126, | |
| "learning_rate": 4.603750215716057e-05, | |
| "loss": 0.1293, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.4576860653837236, | |
| "eval_loss": 0.16067086041480225, | |
| "eval_runtime": 60.4469, | |
| "eval_samples_per_second": 687.976, | |
| "eval_steps_per_second": 0.678, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.480871782981683, | |
| "grad_norm": 0.039836496114730835, | |
| "learning_rate": 4.5247197834790876e-05, | |
| "loss": 0.1288, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 2.480871782981683, | |
| "eval_loss": 0.1614640227625451, | |
| "eval_runtime": 60.9513, | |
| "eval_samples_per_second": 682.283, | |
| "eval_steps_per_second": 0.673, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 2.504057500579643, | |
| "grad_norm": 0.04305760934948921, | |
| "learning_rate": 4.445809000494946e-05, | |
| "loss": 0.1294, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.504057500579643, | |
| "eval_loss": 0.16139181990447046, | |
| "eval_runtime": 60.6766, | |
| "eval_samples_per_second": 685.371, | |
| "eval_steps_per_second": 0.676, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.5272432181776026, | |
| "grad_norm": 0.06780368089675903, | |
| "learning_rate": 4.3670377321312535e-05, | |
| "loss": 0.1285, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 2.5272432181776026, | |
| "eval_loss": 0.1619736397134425, | |
| "eval_runtime": 60.7281, | |
| "eval_samples_per_second": 684.79, | |
| "eval_steps_per_second": 0.675, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 2.550428935775562, | |
| "grad_norm": 0.052273835986852646, | |
| "learning_rate": 4.288425808633575e-05, | |
| "loss": 0.1303, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.550428935775562, | |
| "eval_loss": 0.16178818674979198, | |
| "eval_runtime": 60.8875, | |
| "eval_samples_per_second": 682.997, | |
| "eval_steps_per_second": 0.673, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.573614653373522, | |
| "grad_norm": 0.045574627816677094, | |
| "learning_rate": 4.20999302013325e-05, | |
| "loss": 0.1291, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 2.573614653373522, | |
| "eval_loss": 0.16034006952877458, | |
| "eval_runtime": 60.7378, | |
| "eval_samples_per_second": 684.681, | |
| "eval_steps_per_second": 0.675, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 2.5968003709714815, | |
| "grad_norm": 0.044092051684856415, | |
| "learning_rate": 4.131759111665349e-05, | |
| "loss": 0.1298, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.5968003709714815, | |
| "eval_loss": 0.16090484909780667, | |
| "eval_runtime": 60.4675, | |
| "eval_samples_per_second": 687.741, | |
| "eval_steps_per_second": 0.678, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.6199860885694415, | |
| "grad_norm": 0.05473971739411354, | |
| "learning_rate": 4.0537437781979506e-05, | |
| "loss": 0.1288, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 2.6199860885694415, | |
| "eval_loss": 0.1604315377337276, | |
| "eval_runtime": 62.8239, | |
| "eval_samples_per_second": 661.946, | |
| "eval_steps_per_second": 0.653, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 2.643171806167401, | |
| "grad_norm": 0.07100555300712585, | |
| "learning_rate": 3.9759666596740476e-05, | |
| "loss": 0.129, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.643171806167401, | |
| "eval_loss": 0.15997494100305837, | |
| "eval_runtime": 61.3008, | |
| "eval_samples_per_second": 678.392, | |
| "eval_steps_per_second": 0.669, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.6663575237653605, | |
| "grad_norm": 0.04020215570926666, | |
| "learning_rate": 3.898447336067297e-05, | |
| "loss": 0.1291, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.6663575237653605, | |
| "eval_loss": 0.1596748490832133, | |
| "eval_runtime": 60.6148, | |
| "eval_samples_per_second": 686.07, | |
| "eval_steps_per_second": 0.676, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.68954324136332, | |
| "grad_norm": 0.05526584014296532, | |
| "learning_rate": 3.821205322452863e-05, | |
| "loss": 0.1291, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.68954324136332, | |
| "eval_loss": 0.16091962633426782, | |
| "eval_runtime": 60.1717, | |
| "eval_samples_per_second": 691.122, | |
| "eval_steps_per_second": 0.681, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.71272895896128, | |
| "grad_norm": 0.052167922258377075, | |
| "learning_rate": 3.744260064094604e-05, | |
| "loss": 0.129, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.71272895896128, | |
| "eval_loss": 0.16112806362615253, | |
| "eval_runtime": 60.1273, | |
| "eval_samples_per_second": 691.633, | |
| "eval_steps_per_second": 0.682, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.7359146765592395, | |
| "grad_norm": 0.054320793598890305, | |
| "learning_rate": 3.6676309315498256e-05, | |
| "loss": 0.13, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.7359146765592395, | |
| "eval_loss": 0.15996250695505343, | |
| "eval_runtime": 60.655, | |
| "eval_samples_per_second": 685.616, | |
| "eval_steps_per_second": 0.676, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.7591003941571994, | |
| "grad_norm": 0.05470626428723335, | |
| "learning_rate": 3.591337215792852e-05, | |
| "loss": 0.1296, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.7591003941571994, | |
| "eval_loss": 0.16025288890609335, | |
| "eval_runtime": 60.826, | |
| "eval_samples_per_second": 683.688, | |
| "eval_steps_per_second": 0.674, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.782286111755159, | |
| "grad_norm": 0.04805810749530792, | |
| "learning_rate": 3.515398123358627e-05, | |
| "loss": 0.1294, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.782286111755159, | |
| "eval_loss": 0.15918263724182835, | |
| "eval_runtime": 60.2321, | |
| "eval_samples_per_second": 690.429, | |
| "eval_steps_per_second": 0.681, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.8054718293531185, | |
| "grad_norm": 0.04185302183032036, | |
| "learning_rate": 3.439832771507565e-05, | |
| "loss": 0.1283, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.8054718293531185, | |
| "eval_loss": 0.16179385240233157, | |
| "eval_runtime": 60.9176, | |
| "eval_samples_per_second": 682.66, | |
| "eval_steps_per_second": 0.673, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.828657546951078, | |
| "grad_norm": 0.04609336704015732, | |
| "learning_rate": 3.364660183412892e-05, | |
| "loss": 0.1292, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.828657546951078, | |
| "eval_loss": 0.1611929898635588, | |
| "eval_runtime": 60.5916, | |
| "eval_samples_per_second": 686.333, | |
| "eval_steps_per_second": 0.677, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.851843264549038, | |
| "grad_norm": 0.05404876172542572, | |
| "learning_rate": 3.289899283371657e-05, | |
| "loss": 0.128, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.851843264549038, | |
| "eval_loss": 0.16039360794951976, | |
| "eval_runtime": 60.5961, | |
| "eval_samples_per_second": 686.282, | |
| "eval_steps_per_second": 0.677, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.8750289821469974, | |
| "grad_norm": 0.06787659227848053, | |
| "learning_rate": 3.215568892040641e-05, | |
| "loss": 0.1288, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.8750289821469974, | |
| "eval_loss": 0.16113480515361805, | |
| "eval_runtime": 60.2775, | |
| "eval_samples_per_second": 689.909, | |
| "eval_steps_per_second": 0.68, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.8982146997449574, | |
| "grad_norm": 0.06937435269355774, | |
| "learning_rate": 3.141687721698363e-05, | |
| "loss": 0.1283, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.8982146997449574, | |
| "eval_loss": 0.16087572214972407, | |
| "eval_runtime": 60.6789, | |
| "eval_samples_per_second": 685.345, | |
| "eval_steps_per_second": 0.676, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.921400417342917, | |
| "grad_norm": 0.08074232190847397, | |
| "learning_rate": 3.0682743715343564e-05, | |
| "loss": 0.1292, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.921400417342917, | |
| "eval_loss": 0.16049740787316144, | |
| "eval_runtime": 60.3194, | |
| "eval_samples_per_second": 689.43, | |
| "eval_steps_per_second": 0.68, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.9445861349408764, | |
| "grad_norm": 0.03976515680551529, | |
| "learning_rate": 2.9953473229669328e-05, | |
| "loss": 0.1302, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.9445861349408764, | |
| "eval_loss": 0.16023700059761273, | |
| "eval_runtime": 60.8537, | |
| "eval_samples_per_second": 683.377, | |
| "eval_steps_per_second": 0.674, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.967771852538836, | |
| "grad_norm": 0.05303976684808731, | |
| "learning_rate": 2.9229249349905684e-05, | |
| "loss": 0.1285, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.967771852538836, | |
| "eval_loss": 0.1601465398516622, | |
| "eval_runtime": 60.6472, | |
| "eval_samples_per_second": 685.703, | |
| "eval_steps_per_second": 0.676, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.990957570136796, | |
| "grad_norm": 0.0519745759665966, | |
| "learning_rate": 2.851025439554142e-05, | |
| "loss": 0.1286, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.990957570136796, | |
| "eval_loss": 0.16085429229133483, | |
| "eval_runtime": 60.2507, | |
| "eval_samples_per_second": 690.216, | |
| "eval_steps_per_second": 0.68, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 3.0141432877347554, | |
| "grad_norm": 0.050518251955509186, | |
| "learning_rate": 2.7796669369711294e-05, | |
| "loss": 0.1301, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.0141432877347554, | |
| "eval_loss": 0.16015394660421692, | |
| "eval_runtime": 60.5015, | |
| "eval_samples_per_second": 687.355, | |
| "eval_steps_per_second": 0.678, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.037329005332715, | |
| "grad_norm": 0.04253960773348808, | |
| "learning_rate": 2.708867391362948e-05, | |
| "loss": 0.1296, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 3.037329005332715, | |
| "eval_loss": 0.1597283595131218, | |
| "eval_runtime": 60.13, | |
| "eval_samples_per_second": 691.601, | |
| "eval_steps_per_second": 0.682, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 3.060514722930675, | |
| "grad_norm": 0.06899340450763702, | |
| "learning_rate": 2.638644626136587e-05, | |
| "loss": 0.1291, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.060514722930675, | |
| "eval_loss": 0.1604277250117246, | |
| "eval_runtime": 60.4618, | |
| "eval_samples_per_second": 687.806, | |
| "eval_steps_per_second": 0.678, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.0837004405286343, | |
| "grad_norm": 0.06556117534637451, | |
| "learning_rate": 2.5690163194976575e-05, | |
| "loss": 0.1288, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 3.0837004405286343, | |
| "eval_loss": 0.15953636330193482, | |
| "eval_runtime": 60.2757, | |
| "eval_samples_per_second": 689.93, | |
| "eval_steps_per_second": 0.68, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 3.106886158126594, | |
| "grad_norm": 0.03685734421014786, | |
| "learning_rate": 2.500000000000001e-05, | |
| "loss": 0.129, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.106886158126594, | |
| "eval_loss": 0.159308270335797, | |
| "eval_runtime": 60.624, | |
| "eval_samples_per_second": 685.966, | |
| "eval_steps_per_second": 0.676, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.130071875724554, | |
| "grad_norm": 0.0451020672917366, | |
| "learning_rate": 2.4316130421329697e-05, | |
| "loss": 0.1286, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.130071875724554, | |
| "eval_loss": 0.15995884031774596, | |
| "eval_runtime": 60.3654, | |
| "eval_samples_per_second": 688.905, | |
| "eval_steps_per_second": 0.679, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.1532575933225133, | |
| "grad_norm": 0.0495733842253685, | |
| "learning_rate": 2.363872661947488e-05, | |
| "loss": 0.1293, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.1532575933225133, | |
| "eval_loss": 0.15987331824692497, | |
| "eval_runtime": 60.4636, | |
| "eval_samples_per_second": 687.786, | |
| "eval_steps_per_second": 0.678, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.176443310920473, | |
| "grad_norm": 0.05756652355194092, | |
| "learning_rate": 2.296795912722014e-05, | |
| "loss": 0.1289, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 3.176443310920473, | |
| "eval_loss": 0.15986134614331013, | |
| "eval_runtime": 61.0063, | |
| "eval_samples_per_second": 681.667, | |
| "eval_steps_per_second": 0.672, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 3.199629028518433, | |
| "grad_norm": 0.0467820018529892, | |
| "learning_rate": 2.2303996806694488e-05, | |
| "loss": 0.1295, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 3.199629028518433, | |
| "eval_loss": 0.16011030076900337, | |
| "eval_runtime": 60.1041, | |
| "eval_samples_per_second": 691.9, | |
| "eval_steps_per_second": 0.682, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 3.2228147461163923, | |
| "grad_norm": 0.04179982468485832, | |
| "learning_rate": 2.164700680686147e-05, | |
| "loss": 0.1287, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 3.2228147461163923, | |
| "eval_loss": 0.15917751068552838, | |
| "eval_runtime": 60.5321, | |
| "eval_samples_per_second": 687.007, | |
| "eval_steps_per_second": 0.677, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 3.246000463714352, | |
| "grad_norm": 0.053910572081804276, | |
| "learning_rate": 2.09971545214401e-05, | |
| "loss": 0.1286, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.246000463714352, | |
| "eval_loss": 0.15998092838627764, | |
| "eval_runtime": 60.4067, | |
| "eval_samples_per_second": 688.434, | |
| "eval_steps_per_second": 0.679, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.2691861813123118, | |
| "grad_norm": 0.04404950886964798, | |
| "learning_rate": 2.0354603547267985e-05, | |
| "loss": 0.1283, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 3.2691861813123118, | |
| "eval_loss": 0.1597617331551387, | |
| "eval_runtime": 60.4218, | |
| "eval_samples_per_second": 688.262, | |
| "eval_steps_per_second": 0.679, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 3.2923718989102713, | |
| "grad_norm": 0.04763752967119217, | |
| "learning_rate": 1.9719515643116674e-05, | |
| "loss": 0.1288, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 3.2923718989102713, | |
| "eval_loss": 0.16116006530852447, | |
| "eval_runtime": 60.2132, | |
| "eval_samples_per_second": 690.646, | |
| "eval_steps_per_second": 0.681, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 3.3155576165082308, | |
| "grad_norm": 0.049567196518182755, | |
| "learning_rate": 1.9092050688969738e-05, | |
| "loss": 0.1298, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 3.3155576165082308, | |
| "eval_loss": 0.15965543804361845, | |
| "eval_runtime": 60.3928, | |
| "eval_samples_per_second": 688.592, | |
| "eval_steps_per_second": 0.679, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 3.3387433341061907, | |
| "grad_norm": 0.05488676205277443, | |
| "learning_rate": 1.847236664577389e-05, | |
| "loss": 0.1284, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 3.3387433341061907, | |
| "eval_loss": 0.16050384662882064, | |
| "eval_runtime": 60.121, | |
| "eval_samples_per_second": 691.705, | |
| "eval_steps_per_second": 0.682, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 3.3619290517041502, | |
| "grad_norm": 0.04124298691749573, | |
| "learning_rate": 1.7860619515673033e-05, | |
| "loss": 0.1289, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 3.3619290517041502, | |
| "eval_loss": 0.16054145931691394, | |
| "eval_runtime": 60.2046, | |
| "eval_samples_per_second": 690.745, | |
| "eval_steps_per_second": 0.681, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 3.3851147693021097, | |
| "grad_norm": 0.04400424286723137, | |
| "learning_rate": 1.725696330273575e-05, | |
| "loss": 0.1289, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 3.3851147693021097, | |
| "eval_loss": 0.15999099129576416, | |
| "eval_runtime": 60.4869, | |
| "eval_samples_per_second": 687.52, | |
| "eval_steps_per_second": 0.678, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 3.4083004869000697, | |
| "grad_norm": 0.05488509312272072, | |
| "learning_rate": 1.6661549974185424e-05, | |
| "loss": 0.1285, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 3.4083004869000697, | |
| "eval_loss": 0.16051823730892306, | |
| "eval_runtime": 60.1981, | |
| "eval_samples_per_second": 690.819, | |
| "eval_steps_per_second": 0.681, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 3.431486204498029, | |
| "grad_norm": 0.06722457706928253, | |
| "learning_rate": 1.60745294221434e-05, | |
| "loss": 0.1286, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 3.431486204498029, | |
| "eval_loss": 0.1610307768591294, | |
| "eval_runtime": 60.7755, | |
| "eval_samples_per_second": 684.256, | |
| "eval_steps_per_second": 0.675, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 3.4546719220959887, | |
| "grad_norm": 0.04814394935965538, | |
| "learning_rate": 1.549604942589441e-05, | |
| "loss": 0.1278, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 3.4546719220959887, | |
| "eval_loss": 0.1598065741965525, | |
| "eval_runtime": 59.9968, | |
| "eval_samples_per_second": 693.136, | |
| "eval_steps_per_second": 0.683, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 3.4778576396939487, | |
| "grad_norm": 0.04934167116880417, | |
| "learning_rate": 1.4926255614683932e-05, | |
| "loss": 0.1274, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.4778576396939487, | |
| "eval_loss": 0.15982454893723042, | |
| "eval_runtime": 60.201, | |
| "eval_samples_per_second": 690.786, | |
| "eval_steps_per_second": 0.681, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.501043357291908, | |
| "grad_norm": 0.04529615864157677, | |
| "learning_rate": 1.4365291431056871e-05, | |
| "loss": 0.1297, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 3.501043357291908, | |
| "eval_loss": 0.15986133524024926, | |
| "eval_runtime": 59.95, | |
| "eval_samples_per_second": 693.678, | |
| "eval_steps_per_second": 0.684, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 3.5242290748898677, | |
| "grad_norm": 0.0399620421230793, | |
| "learning_rate": 1.3813298094746491e-05, | |
| "loss": 0.1288, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 3.5242290748898677, | |
| "eval_loss": 0.15905609221590689, | |
| "eval_runtime": 61.181, | |
| "eval_samples_per_second": 679.72, | |
| "eval_steps_per_second": 0.67, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 3.5474147924878277, | |
| "grad_norm": 0.05973295867443085, | |
| "learning_rate": 1.327041456712334e-05, | |
| "loss": 0.1281, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 3.5474147924878277, | |
| "eval_loss": 0.15981091550942805, | |
| "eval_runtime": 60.5605, | |
| "eval_samples_per_second": 686.685, | |
| "eval_steps_per_second": 0.677, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 3.570600510085787, | |
| "grad_norm": 0.04896661266684532, | |
| "learning_rate": 1.2736777516212266e-05, | |
| "loss": 0.1288, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 3.570600510085787, | |
| "eval_loss": 0.1599924400443614, | |
| "eval_runtime": 60.486, | |
| "eval_samples_per_second": 687.531, | |
| "eval_steps_per_second": 0.678, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 3.5937862276837467, | |
| "grad_norm": 0.07458525151014328, | |
| "learning_rate": 1.2212521282287092e-05, | |
| "loss": 0.128, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 3.5937862276837467, | |
| "eval_loss": 0.15936126278835275, | |
| "eval_runtime": 60.9341, | |
| "eval_samples_per_second": 682.475, | |
| "eval_steps_per_second": 0.673, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 3.6169719452817066, | |
| "grad_norm": 0.04200127348303795, | |
| "learning_rate": 1.1697777844051105e-05, | |
| "loss": 0.1287, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 3.6169719452817066, | |
| "eval_loss": 0.1603394617678833, | |
| "eval_runtime": 60.5155, | |
| "eval_samples_per_second": 687.195, | |
| "eval_steps_per_second": 0.678, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 3.640157662879666, | |
| "grad_norm": 0.06712640821933746, | |
| "learning_rate": 1.1192676785412154e-05, | |
| "loss": 0.1291, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 3.640157662879666, | |
| "eval_loss": 0.15920067938345067, | |
| "eval_runtime": 60.0225, | |
| "eval_samples_per_second": 692.84, | |
| "eval_steps_per_second": 0.683, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 3.6633433804776256, | |
| "grad_norm": 0.049462996423244476, | |
| "learning_rate": 1.0697345262860636e-05, | |
| "loss": 0.1287, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 3.6633433804776256, | |
| "eval_loss": 0.15964593569874527, | |
| "eval_runtime": 60.1965, | |
| "eval_samples_per_second": 690.837, | |
| "eval_steps_per_second": 0.681, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 3.6865290980755856, | |
| "grad_norm": 0.05148932337760925, | |
| "learning_rate": 1.021190797345839e-05, | |
| "loss": 0.1283, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 3.6865290980755856, | |
| "eval_loss": 0.15903354419354673, | |
| "eval_runtime": 60.0507, | |
| "eval_samples_per_second": 692.515, | |
| "eval_steps_per_second": 0.683, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 3.709714815673545, | |
| "grad_norm": 0.05164024233818054, | |
| "learning_rate": 9.73648712344707e-06, | |
| "loss": 0.128, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.709714815673545, | |
| "eval_loss": 0.15835035051131605, | |
| "eval_runtime": 60.5739, | |
| "eval_samples_per_second": 686.533, | |
| "eval_steps_per_second": 0.677, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.7329005332715046, | |
| "grad_norm": 0.04926716163754463, | |
| "learning_rate": 9.271202397483215e-06, | |
| "loss": 0.1276, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 3.7329005332715046, | |
| "eval_loss": 0.160225615529793, | |
| "eval_runtime": 60.4555, | |
| "eval_samples_per_second": 687.878, | |
| "eval_steps_per_second": 0.678, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 3.7560862508694646, | |
| "grad_norm": 0.04355842247605324, | |
| "learning_rate": 8.816170928508365e-06, | |
| "loss": 0.1287, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 3.7560862508694646, | |
| "eval_loss": 0.1601867779420742, | |
| "eval_runtime": 60.7386, | |
| "eval_samples_per_second": 684.672, | |
| "eval_steps_per_second": 0.675, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 3.779271968467424, | |
| "grad_norm": 0.039105553179979324, | |
| "learning_rate": 8.371507268261437e-06, | |
| "loss": 0.1306, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 3.779271968467424, | |
| "eval_loss": 0.15946348937187382, | |
| "eval_runtime": 60.9253, | |
| "eval_samples_per_second": 682.574, | |
| "eval_steps_per_second": 0.673, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 3.8024576860653836, | |
| "grad_norm": 0.04452899843454361, | |
| "learning_rate": 7.937323358440935e-06, | |
| "loss": 0.1286, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 3.8024576860653836, | |
| "eval_loss": 0.15871429728364056, | |
| "eval_runtime": 60.2776, | |
| "eval_samples_per_second": 689.908, | |
| "eval_steps_per_second": 0.68, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 3.8256434036633435, | |
| "grad_norm": 0.043075498193502426, | |
| "learning_rate": 7.513728502524286e-06, | |
| "loss": 0.1292, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 3.8256434036633435, | |
| "eval_loss": 0.1592580359542711, | |
| "eval_runtime": 60.7244, | |
| "eval_samples_per_second": 684.832, | |
| "eval_steps_per_second": 0.675, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 3.848829121261303, | |
| "grad_norm": 0.05848800390958786, | |
| "learning_rate": 7.100829338251147e-06, | |
| "loss": 0.1275, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 3.848829121261303, | |
| "eval_loss": 0.15895083163665807, | |
| "eval_runtime": 60.3677, | |
| "eval_samples_per_second": 688.878, | |
| "eval_steps_per_second": 0.679, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 3.8720148388592626, | |
| "grad_norm": 0.04980336129665375, | |
| "learning_rate": 6.698729810778065e-06, | |
| "loss": 0.1277, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 3.8720148388592626, | |
| "eval_loss": 0.16002303550437, | |
| "eval_runtime": 60.2742, | |
| "eval_samples_per_second": 689.947, | |
| "eval_steps_per_second": 0.68, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 3.8952005564572225, | |
| "grad_norm": 0.057385146617889404, | |
| "learning_rate": 6.3075311465107535e-06, | |
| "loss": 0.129, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 3.8952005564572225, | |
| "eval_loss": 0.1601535826416112, | |
| "eval_runtime": 60.4053, | |
| "eval_samples_per_second": 688.45, | |
| "eval_steps_per_second": 0.679, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 3.918386274055182, | |
| "grad_norm": 0.045788682997226715, | |
| "learning_rate": 5.927331827620903e-06, | |
| "loss": 0.1286, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 3.918386274055182, | |
| "eval_loss": 0.15926720973175468, | |
| "eval_runtime": 60.6783, | |
| "eval_samples_per_second": 685.352, | |
| "eval_steps_per_second": 0.676, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 3.9415719916531415, | |
| "grad_norm": 0.045575451105833054, | |
| "learning_rate": 5.558227567253832e-06, | |
| "loss": 0.1281, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.9415719916531415, | |
| "eval_loss": 0.16032033338606583, | |
| "eval_runtime": 60.4563, | |
| "eval_samples_per_second": 687.868, | |
| "eval_steps_per_second": 0.678, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.964757709251101, | |
| "grad_norm": 0.034972067922353745, | |
| "learning_rate": 5.200311285433213e-06, | |
| "loss": 0.1285, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 3.964757709251101, | |
| "eval_loss": 0.1590997571686103, | |
| "eval_runtime": 60.7642, | |
| "eval_samples_per_second": 684.384, | |
| "eval_steps_per_second": 0.675, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 3.987943426849061, | |
| "grad_norm": 0.05060684680938721, | |
| "learning_rate": 4.853673085668947e-06, | |
| "loss": 0.1293, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 3.987943426849061, | |
| "eval_loss": 0.15924322809570868, | |
| "eval_runtime": 60.0799, | |
| "eval_samples_per_second": 692.178, | |
| "eval_steps_per_second": 0.682, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 4.011129144447021, | |
| "grad_norm": 0.04898017644882202, | |
| "learning_rate": 4.5184002322740785e-06, | |
| "loss": 0.1283, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 4.011129144447021, | |
| "eval_loss": 0.1587491140112498, | |
| "eval_runtime": 60.6393, | |
| "eval_samples_per_second": 685.793, | |
| "eval_steps_per_second": 0.676, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 4.0343148620449805, | |
| "grad_norm": 0.058361586183309555, | |
| "learning_rate": 4.19457712839652e-06, | |
| "loss": 0.1277, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 4.0343148620449805, | |
| "eval_loss": 0.1597737118597627, | |
| "eval_runtime": 61.5486, | |
| "eval_samples_per_second": 675.661, | |
| "eval_steps_per_second": 0.666, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 4.05750057964294, | |
| "grad_norm": 0.05138258635997772, | |
| "learning_rate": 3.8822852947709375e-06, | |
| "loss": 0.1283, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 4.05750057964294, | |
| "eval_loss": 0.15985116115580386, | |
| "eval_runtime": 60.5634, | |
| "eval_samples_per_second": 686.652, | |
| "eval_steps_per_second": 0.677, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 4.0806862972408995, | |
| "grad_norm": 0.0461881086230278, | |
| "learning_rate": 3.581603349196372e-06, | |
| "loss": 0.1288, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 4.0806862972408995, | |
| "eval_loss": 0.15788726458515429, | |
| "eval_runtime": 60.6057, | |
| "eval_samples_per_second": 686.173, | |
| "eval_steps_per_second": 0.677, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 4.103872014838859, | |
| "grad_norm": 0.0618111789226532, | |
| "learning_rate": 3.2926069867446675e-06, | |
| "loss": 0.1287, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 4.103872014838859, | |
| "eval_loss": 0.15881183094974458, | |
| "eval_runtime": 60.3747, | |
| "eval_samples_per_second": 688.799, | |
| "eval_steps_per_second": 0.679, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 4.1270577324368185, | |
| "grad_norm": 0.04804789274930954, | |
| "learning_rate": 3.0153689607045845e-06, | |
| "loss": 0.1294, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 4.1270577324368185, | |
| "eval_loss": 0.1607356553004913, | |
| "eval_runtime": 60.6979, | |
| "eval_samples_per_second": 685.131, | |
| "eval_steps_per_second": 0.675, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 4.150243450034779, | |
| "grad_norm": 0.04835003986954689, | |
| "learning_rate": 2.7499590642665774e-06, | |
| "loss": 0.1277, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 4.150243450034779, | |
| "eval_loss": 0.1598761189516689, | |
| "eval_runtime": 61.2003, | |
| "eval_samples_per_second": 679.507, | |
| "eval_steps_per_second": 0.67, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 4.173429167632738, | |
| "grad_norm": 0.05750919133424759, | |
| "learning_rate": 2.496444112952734e-06, | |
| "loss": 0.1285, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.173429167632738, | |
| "eval_loss": 0.15946166188972705, | |
| "eval_runtime": 60.6795, | |
| "eval_samples_per_second": 685.339, | |
| "eval_steps_per_second": 0.676, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.196614885230698, | |
| "grad_norm": 0.06801807135343552, | |
| "learning_rate": 2.2548879277963064e-06, | |
| "loss": 0.1289, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 4.196614885230698, | |
| "eval_loss": 0.1609577221237089, | |
| "eval_runtime": 61.0186, | |
| "eval_samples_per_second": 681.53, | |
| "eval_steps_per_second": 0.672, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 4.219800602828657, | |
| "grad_norm": 0.04383298382163048, | |
| "learning_rate": 2.0253513192751373e-06, | |
| "loss": 0.1289, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 4.219800602828657, | |
| "eval_loss": 0.1598739506376352, | |
| "eval_runtime": 60.6256, | |
| "eval_samples_per_second": 685.948, | |
| "eval_steps_per_second": 0.676, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 4.242986320426617, | |
| "grad_norm": 0.044339120388031006, | |
| "learning_rate": 1.807892072002898e-06, | |
| "loss": 0.1283, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 4.242986320426617, | |
| "eval_loss": 0.158920794598519, | |
| "eval_runtime": 60.5454, | |
| "eval_samples_per_second": 686.856, | |
| "eval_steps_per_second": 0.677, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 4.2661720380245765, | |
| "grad_norm": 0.04090524837374687, | |
| "learning_rate": 1.6025649301821876e-06, | |
| "loss": 0.1282, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 4.2661720380245765, | |
| "eval_loss": 0.1596859048948022, | |
| "eval_runtime": 60.7716, | |
| "eval_samples_per_second": 684.3, | |
| "eval_steps_per_second": 0.675, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 4.289357755622537, | |
| "grad_norm": 0.042642634361982346, | |
| "learning_rate": 1.4094215838229176e-06, | |
| "loss": 0.1286, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 4.289357755622537, | |
| "eval_loss": 0.16079005979316527, | |
| "eval_runtime": 60.6239, | |
| "eval_samples_per_second": 685.967, | |
| "eval_steps_per_second": 0.676, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 4.312543473220496, | |
| "grad_norm": 0.04924129322171211, | |
| "learning_rate": 1.2285106557296477e-06, | |
| "loss": 0.1287, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 4.312543473220496, | |
| "eval_loss": 0.16084020796323667, | |
| "eval_runtime": 60.2581, | |
| "eval_samples_per_second": 690.131, | |
| "eval_steps_per_second": 0.68, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 4.335729190818456, | |
| "grad_norm": 0.04222133755683899, | |
| "learning_rate": 1.0598776892610685e-06, | |
| "loss": 0.1287, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 4.335729190818456, | |
| "eval_loss": 0.16020395921618655, | |
| "eval_runtime": 60.5816, | |
| "eval_samples_per_second": 686.446, | |
| "eval_steps_per_second": 0.677, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 4.358914908416415, | |
| "grad_norm": 0.05593874678015709, | |
| "learning_rate": 9.035651368646648e-07, | |
| "loss": 0.1286, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 4.358914908416415, | |
| "eval_loss": 0.15957607987630548, | |
| "eval_runtime": 60.8726, | |
| "eval_samples_per_second": 683.164, | |
| "eval_steps_per_second": 0.674, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 4.382100626014375, | |
| "grad_norm": 0.059049129486083984, | |
| "learning_rate": 7.596123493895991e-07, | |
| "loss": 0.1289, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 4.382100626014375, | |
| "eval_loss": 0.15975197211451994, | |
| "eval_runtime": 60.6704, | |
| "eval_samples_per_second": 685.441, | |
| "eval_steps_per_second": 0.676, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 4.405286343612334, | |
| "grad_norm": 0.053555767983198166, | |
| "learning_rate": 6.280555661802856e-07, | |
| "loss": 0.1286, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 4.405286343612334, | |
| "eval_loss": 0.16117730336557945, | |
| "eval_runtime": 61.9214, | |
| "eval_samples_per_second": 671.593, | |
| "eval_steps_per_second": 0.662, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 4.428472061210295, | |
| "grad_norm": 0.04488294571638107, | |
| "learning_rate": 5.089279059533658e-07, | |
| "loss": 0.1281, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 4.428472061210295, | |
| "eval_loss": 0.15896389365558133, | |
| "eval_runtime": 62.0889, | |
| "eval_samples_per_second": 669.782, | |
| "eval_steps_per_second": 0.66, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 4.451657778808254, | |
| "grad_norm": 0.044143371284008026, | |
| "learning_rate": 4.02259358460233e-07, | |
| "loss": 0.1276, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 4.451657778808254, | |
| "eval_loss": 0.15880485748262804, | |
| "eval_runtime": 61.9007, | |
| "eval_samples_per_second": 671.818, | |
| "eval_steps_per_second": 0.662, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 4.474843496406214, | |
| "grad_norm": 0.054890409111976624, | |
| "learning_rate": 3.080767769372939e-07, | |
| "loss": 0.1289, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 4.474843496406214, | |
| "eval_loss": 0.15899979579394047, | |
| "eval_runtime": 61.7264, | |
| "eval_samples_per_second": 673.714, | |
| "eval_steps_per_second": 0.664, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 4.498029214004173, | |
| "grad_norm": 0.04276006668806076, | |
| "learning_rate": 2.2640387134577058e-07, | |
| "loss": 0.1284, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 4.498029214004173, | |
| "eval_loss": 0.1587265635928511, | |
| "eval_runtime": 61.254, | |
| "eval_samples_per_second": 678.911, | |
| "eval_steps_per_second": 0.669, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 4.521214931602133, | |
| "grad_norm": 0.04374442994594574, | |
| "learning_rate": 1.5726120240288634e-07, | |
| "loss": 0.1284, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 4.521214931602133, | |
| "eval_loss": 0.1596641951113874, | |
| "eval_runtime": 61.5999, | |
| "eval_samples_per_second": 675.099, | |
| "eval_steps_per_second": 0.666, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 4.544400649200092, | |
| "grad_norm": 0.039518803358078, | |
| "learning_rate": 1.0066617640578368e-07, | |
| "loss": 0.1297, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 4.544400649200092, | |
| "eval_loss": 0.15941302591091938, | |
| "eval_runtime": 61.7763, | |
| "eval_samples_per_second": 673.17, | |
| "eval_steps_per_second": 0.664, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 4.567586366798053, | |
| "grad_norm": 0.037454187870025635, | |
| "learning_rate": 5.663304084960186e-08, | |
| "loss": 0.1276, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 4.567586366798053, | |
| "eval_loss": 0.15932704533345807, | |
| "eval_runtime": 61.0673, | |
| "eval_samples_per_second": 680.987, | |
| "eval_steps_per_second": 0.671, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 4.590772084396012, | |
| "grad_norm": 0.05642937496304512, | |
| "learning_rate": 2.5172880840745873e-08, | |
| "loss": 0.129, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 4.590772084396012, | |
| "eval_loss": 0.15923822751292327, | |
| "eval_runtime": 61.7949, | |
| "eval_samples_per_second": 672.968, | |
| "eval_steps_per_second": 0.663, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 4.613957801993972, | |
| "grad_norm": 0.03662274032831192, | |
| "learning_rate": 6.293616306246586e-09, | |
| "loss": 0.1285, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 4.613957801993972, | |
| "eval_loss": 0.160277338388973, | |
| "eval_runtime": 62.2438, | |
| "eval_samples_per_second": 668.115, | |
| "eval_steps_per_second": 0.659, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 4.637143519591931, | |
| "grad_norm": 0.0563049279153347, | |
| "learning_rate": 0.0, | |
| "loss": 0.1282, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 4.637143519591931, | |
| "eval_loss": 0.16006394581914293, | |
| "eval_runtime": 61.4917, | |
| "eval_samples_per_second": 676.286, | |
| "eval_steps_per_second": 0.667, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 4.637143519591931, | |
| "step": 10000, | |
| "total_flos": 2.3231400526217216e+17, | |
| "train_loss": 0.13326009378433226, | |
| "train_runtime": 41368.3669, | |
| "train_samples_per_second": 495.064, | |
| "train_steps_per_second": 0.242 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 50, | |
| "total_flos": 2.3231400526217216e+17, | |
| "train_batch_size": 1024, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |