{ "best_metric": 0.15788726458515429, "best_model_checkpoint": "checkpoints/checkpoint-8800", "epoch": 4.637143519591931, "eval_steps": 50, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023185717597959656, "grad_norm": 0.16052097082138062, "learning_rate": 5e-05, "loss": 0.6225, "step": 50 }, { "epoch": 0.023185717597959656, "eval_loss": 0.1987911110084725, "eval_runtime": 63.5433, "eval_samples_per_second": 654.451, "eval_steps_per_second": 0.645, "step": 50 }, { "epoch": 0.04637143519591931, "grad_norm": 0.09532159566879272, "learning_rate": 0.0001, "loss": 0.1508, "step": 100 }, { "epoch": 0.04637143519591931, "eval_loss": 0.18357936446787168, "eval_runtime": 60.9844, "eval_samples_per_second": 681.912, "eval_steps_per_second": 0.672, "step": 100 }, { "epoch": 0.06955715279387897, "grad_norm": 0.24056212604045868, "learning_rate": 9.999370638369377e-05, "loss": 0.1449, "step": 150 }, { "epoch": 0.06955715279387897, "eval_loss": 0.17892658896642444, "eval_runtime": 60.7834, "eval_samples_per_second": 684.167, "eval_steps_per_second": 0.675, "step": 150 }, { "epoch": 0.09274287039183862, "grad_norm": 0.09350813180208206, "learning_rate": 9.997482711915927e-05, "loss": 0.1421, "step": 200 }, { "epoch": 0.09274287039183862, "eval_loss": 0.17624869869175752, "eval_runtime": 60.3826, "eval_samples_per_second": 688.708, "eval_steps_per_second": 0.679, "step": 200 }, { "epoch": 0.11592858798979828, "grad_norm": 0.12230529636144638, "learning_rate": 9.99433669591504e-05, "loss": 0.141, "step": 250 }, { "epoch": 0.11592858798979828, "eval_loss": 0.17641382363047173, "eval_runtime": 60.4169, "eval_samples_per_second": 688.317, "eval_steps_per_second": 0.679, "step": 250 }, { "epoch": 0.13911430558775795, "grad_norm": 0.14592748880386353, "learning_rate": 9.989933382359422e-05, "loss": 0.1397, "step": 300 }, { "epoch": 0.13911430558775795, "eval_loss": 0.17552215792639078, "eval_runtime": 61.6101, "eval_samples_per_second": 674.987, "eval_steps_per_second": 0.665, "step": 300 }, { "epoch": 0.1623000231857176, "grad_norm": 0.10219988226890564, "learning_rate": 9.984273879759713e-05, "loss": 0.1393, "step": 350 }, { "epoch": 0.1623000231857176, "eval_loss": 0.17414749172793012, "eval_runtime": 61.4962, "eval_samples_per_second": 676.237, "eval_steps_per_second": 0.667, "step": 350 }, { "epoch": 0.18548574078367724, "grad_norm": 0.12338168174028397, "learning_rate": 9.977359612865423e-05, "loss": 0.1388, "step": 400 }, { "epoch": 0.18548574078367724, "eval_loss": 0.17378012638412807, "eval_runtime": 61.0462, "eval_samples_per_second": 681.221, "eval_steps_per_second": 0.672, "step": 400 }, { "epoch": 0.20867145838163692, "grad_norm": 0.09479879587888718, "learning_rate": 9.969192322306271e-05, "loss": 0.1394, "step": 450 }, { "epoch": 0.20867145838163692, "eval_loss": 0.17252362204688398, "eval_runtime": 60.963, "eval_samples_per_second": 682.151, "eval_steps_per_second": 0.673, "step": 450 }, { "epoch": 0.23185717597959657, "grad_norm": 0.1108623668551445, "learning_rate": 9.959774064153977e-05, "loss": 0.1383, "step": 500 }, { "epoch": 0.23185717597959657, "eval_loss": 0.17298176916843877, "eval_runtime": 60.6546, "eval_samples_per_second": 685.62, "eval_steps_per_second": 0.676, "step": 500 }, { "epoch": 0.2550428935775562, "grad_norm": 0.0725204199552536, "learning_rate": 9.949107209404665e-05, "loss": 0.1376, "step": 550 }, { "epoch": 0.2550428935775562, "eval_loss": 0.17165218165539878, "eval_runtime": 59.4888, "eval_samples_per_second": 699.056, "eval_steps_per_second": 0.689, "step": 550 }, { "epoch": 0.2782286111755159, "grad_norm": 0.0955963134765625, "learning_rate": 9.937194443381972e-05, "loss": 0.1372, "step": 600 }, { "epoch": 0.2782286111755159, "eval_loss": 0.17077083113718278, "eval_runtime": 60.6021, "eval_samples_per_second": 686.214, "eval_steps_per_second": 0.677, "step": 600 }, { "epoch": 0.3014143287734755, "grad_norm": 0.18736732006072998, "learning_rate": 9.924038765061042e-05, "loss": 0.1361, "step": 650 }, { "epoch": 0.3014143287734755, "eval_loss": 0.1727738813183492, "eval_runtime": 60.3343, "eval_samples_per_second": 689.259, "eval_steps_per_second": 0.68, "step": 650 }, { "epoch": 0.3246000463714352, "grad_norm": 0.09572151303291321, "learning_rate": 9.909643486313533e-05, "loss": 0.1362, "step": 700 }, { "epoch": 0.3246000463714352, "eval_loss": 0.17145407115151273, "eval_runtime": 60.2732, "eval_samples_per_second": 689.959, "eval_steps_per_second": 0.68, "step": 700 }, { "epoch": 0.34778576396939487, "grad_norm": 0.07214252650737762, "learning_rate": 9.894012231073894e-05, "loss": 0.1364, "step": 750 }, { "epoch": 0.34778576396939487, "eval_loss": 0.17133199408489355, "eval_runtime": 60.0148, "eval_samples_per_second": 692.929, "eval_steps_per_second": 0.683, "step": 750 }, { "epoch": 0.3709714815673545, "grad_norm": 0.18224318325519562, "learning_rate": 9.877148934427037e-05, "loss": 0.1356, "step": 800 }, { "epoch": 0.3709714815673545, "eval_loss": 0.16949569222888886, "eval_runtime": 60.1491, "eval_samples_per_second": 691.382, "eval_steps_per_second": 0.682, "step": 800 }, { "epoch": 0.39415719916531416, "grad_norm": 0.06306415796279907, "learning_rate": 9.859057841617709e-05, "loss": 0.1353, "step": 850 }, { "epoch": 0.39415719916531416, "eval_loss": 0.1686952690798172, "eval_runtime": 60.6447, "eval_samples_per_second": 685.731, "eval_steps_per_second": 0.676, "step": 850 }, { "epoch": 0.41734291676327384, "grad_norm": 0.10090287029743195, "learning_rate": 9.839743506981782e-05, "loss": 0.1361, "step": 900 }, { "epoch": 0.41734291676327384, "eval_loss": 0.17026100034088926, "eval_runtime": 61.5224, "eval_samples_per_second": 675.949, "eval_steps_per_second": 0.666, "step": 900 }, { "epoch": 0.44052863436123346, "grad_norm": 0.10061236470937729, "learning_rate": 9.819210792799712e-05, "loss": 0.1354, "step": 950 }, { "epoch": 0.44052863436123346, "eval_loss": 0.16971544565694113, "eval_runtime": 60.488, "eval_samples_per_second": 687.508, "eval_steps_per_second": 0.678, "step": 950 }, { "epoch": 0.46371435195919314, "grad_norm": 0.06525534391403198, "learning_rate": 9.797464868072488e-05, "loss": 0.1352, "step": 1000 }, { "epoch": 0.46371435195919314, "eval_loss": 0.16946903195393553, "eval_runtime": 61.3558, "eval_samples_per_second": 677.784, "eval_steps_per_second": 0.668, "step": 1000 }, { "epoch": 0.4869000695571528, "grad_norm": 0.06269507855176926, "learning_rate": 9.77451120722037e-05, "loss": 0.1335, "step": 1050 }, { "epoch": 0.4869000695571528, "eval_loss": 0.16825352947444114, "eval_runtime": 60.1971, "eval_samples_per_second": 690.831, "eval_steps_per_second": 0.681, "step": 1050 }, { "epoch": 0.5100857871551124, "grad_norm": 0.08187470585107803, "learning_rate": 9.750355588704727e-05, "loss": 0.1327, "step": 1100 }, { "epoch": 0.5100857871551124, "eval_loss": 0.16861737282523587, "eval_runtime": 59.5715, "eval_samples_per_second": 698.085, "eval_steps_per_second": 0.688, "step": 1100 }, { "epoch": 0.5332715047530721, "grad_norm": 0.06607680767774582, "learning_rate": 9.725004093573342e-05, "loss": 0.1337, "step": 1150 }, { "epoch": 0.5332715047530721, "eval_loss": 0.1692070748762034, "eval_runtime": 60.0637, "eval_samples_per_second": 692.364, "eval_steps_per_second": 0.683, "step": 1150 }, { "epoch": 0.5564572223510318, "grad_norm": 0.09759815782308578, "learning_rate": 9.698463103929542e-05, "loss": 0.134, "step": 1200 }, { "epoch": 0.5564572223510318, "eval_loss": 0.16649228385381692, "eval_runtime": 60.4305, "eval_samples_per_second": 688.162, "eval_steps_per_second": 0.678, "step": 1200 }, { "epoch": 0.5796429399489914, "grad_norm": 0.10353852063417435, "learning_rate": 9.670739301325534e-05, "loss": 0.1341, "step": 1250 }, { "epoch": 0.5796429399489914, "eval_loss": 0.16802514322459206, "eval_runtime": 60.0955, "eval_samples_per_second": 691.999, "eval_steps_per_second": 0.682, "step": 1250 }, { "epoch": 0.602828657546951, "grad_norm": 0.11834366619586945, "learning_rate": 9.641839665080363e-05, "loss": 0.1347, "step": 1300 }, { "epoch": 0.602828657546951, "eval_loss": 0.1672302417427292, "eval_runtime": 60.3484, "eval_samples_per_second": 689.098, "eval_steps_per_second": 0.679, "step": 1300 }, { "epoch": 0.6260143751449108, "grad_norm": 0.06963012367486954, "learning_rate": 9.611771470522908e-05, "loss": 0.1335, "step": 1350 }, { "epoch": 0.6260143751449108, "eval_loss": 0.16607839684977216, "eval_runtime": 60.3308, "eval_samples_per_second": 689.3, "eval_steps_per_second": 0.68, "step": 1350 }, { "epoch": 0.6492000927428704, "grad_norm": 0.06842990219593048, "learning_rate": 9.580542287160348e-05, "loss": 0.1338, "step": 1400 }, { "epoch": 0.6492000927428704, "eval_loss": 0.16628812684035693, "eval_runtime": 59.9335, "eval_samples_per_second": 693.87, "eval_steps_per_second": 0.684, "step": 1400 }, { "epoch": 0.67238581034083, "grad_norm": 0.07053674757480621, "learning_rate": 9.548159976772592e-05, "loss": 0.1335, "step": 1450 }, { "epoch": 0.67238581034083, "eval_loss": 0.16696060882262428, "eval_runtime": 59.8079, "eval_samples_per_second": 695.326, "eval_steps_per_second": 0.686, "step": 1450 }, { "epoch": 0.6955715279387897, "grad_norm": 0.09175281971693039, "learning_rate": 9.514632691433107e-05, "loss": 0.1332, "step": 1500 }, { "epoch": 0.6955715279387897, "eval_loss": 0.16521949465081834, "eval_runtime": 60.1856, "eval_samples_per_second": 690.963, "eval_steps_per_second": 0.681, "step": 1500 }, { "epoch": 0.7187572455367494, "grad_norm": 0.05836635082960129, "learning_rate": 9.479968871456679e-05, "loss": 0.1336, "step": 1550 }, { "epoch": 0.7187572455367494, "eval_loss": 0.16626366255041727, "eval_runtime": 60.6256, "eval_samples_per_second": 685.948, "eval_steps_per_second": 0.676, "step": 1550 }, { "epoch": 0.741942963134709, "grad_norm": 0.07249301671981812, "learning_rate": 9.444177243274618e-05, "loss": 0.133, "step": 1600 }, { "epoch": 0.741942963134709, "eval_loss": 0.1655649439629329, "eval_runtime": 60.2447, "eval_samples_per_second": 690.285, "eval_steps_per_second": 0.681, "step": 1600 }, { "epoch": 0.7651286807326687, "grad_norm": 0.07509302347898483, "learning_rate": 9.407266817237911e-05, "loss": 0.1332, "step": 1650 }, { "epoch": 0.7651286807326687, "eval_loss": 0.16605371203296967, "eval_runtime": 59.8196, "eval_samples_per_second": 695.191, "eval_steps_per_second": 0.685, "step": 1650 }, { "epoch": 0.7883143983306283, "grad_norm": 0.07540406286716461, "learning_rate": 9.369246885348926e-05, "loss": 0.1327, "step": 1700 }, { "epoch": 0.7883143983306283, "eval_loss": 0.16555590021301406, "eval_runtime": 60.4119, "eval_samples_per_second": 688.374, "eval_steps_per_second": 0.679, "step": 1700 }, { "epoch": 0.811500115928588, "grad_norm": 0.06061087176203728, "learning_rate": 9.330127018922194e-05, "loss": 0.1318, "step": 1750 }, { "epoch": 0.811500115928588, "eval_loss": 0.16623179673527624, "eval_runtime": 59.7807, "eval_samples_per_second": 695.643, "eval_steps_per_second": 0.686, "step": 1750 }, { "epoch": 0.8346858335265477, "grad_norm": 0.05577518790960312, "learning_rate": 9.289917066174886e-05, "loss": 0.1319, "step": 1800 }, { "epoch": 0.8346858335265477, "eval_loss": 0.16519989030959317, "eval_runtime": 60.1508, "eval_samples_per_second": 691.363, "eval_steps_per_second": 0.682, "step": 1800 }, { "epoch": 0.8578715511245073, "grad_norm": 0.06929640471935272, "learning_rate": 9.248627149747573e-05, "loss": 0.1337, "step": 1850 }, { "epoch": 0.8578715511245073, "eval_loss": 0.16394849849125304, "eval_runtime": 60.1044, "eval_samples_per_second": 691.896, "eval_steps_per_second": 0.682, "step": 1850 }, { "epoch": 0.8810572687224669, "grad_norm": 0.07941466569900513, "learning_rate": 9.206267664155907e-05, "loss": 0.1324, "step": 1900 }, { "epoch": 0.8810572687224669, "eval_loss": 0.1648257591054525, "eval_runtime": 59.9818, "eval_samples_per_second": 693.31, "eval_steps_per_second": 0.684, "step": 1900 }, { "epoch": 0.9042429863204267, "grad_norm": 0.09700328856706619, "learning_rate": 9.162849273173857e-05, "loss": 0.1334, "step": 1950 }, { "epoch": 0.9042429863204267, "eval_loss": 0.16508159820082235, "eval_runtime": 60.0956, "eval_samples_per_second": 691.997, "eval_steps_per_second": 0.682, "step": 1950 }, { "epoch": 0.9274287039183863, "grad_norm": 0.09397923946380615, "learning_rate": 9.118382907149165e-05, "loss": 0.1317, "step": 2000 }, { "epoch": 0.9274287039183863, "eval_loss": 0.16377645660046958, "eval_runtime": 60.471, "eval_samples_per_second": 687.701, "eval_steps_per_second": 0.678, "step": 2000 }, { "epoch": 0.9506144215163459, "grad_norm": 0.08097202330827713, "learning_rate": 9.072879760251679e-05, "loss": 0.1324, "step": 2050 }, { "epoch": 0.9506144215163459, "eval_loss": 0.16491611914973717, "eval_runtime": 60.6678, "eval_samples_per_second": 685.471, "eval_steps_per_second": 0.676, "step": 2050 }, { "epoch": 0.9738001391143056, "grad_norm": 0.08455361425876617, "learning_rate": 9.026351287655294e-05, "loss": 0.1326, "step": 2100 }, { "epoch": 0.9738001391143056, "eval_loss": 0.16602741997032858, "eval_runtime": 60.5593, "eval_samples_per_second": 686.698, "eval_steps_per_second": 0.677, "step": 2100 }, { "epoch": 0.9969858567122653, "grad_norm": 0.056316621601581573, "learning_rate": 8.978809202654162e-05, "loss": 0.1326, "step": 2150 }, { "epoch": 0.9969858567122653, "eval_loss": 0.1640188218462461, "eval_runtime": 60.9228, "eval_samples_per_second": 682.602, "eval_steps_per_second": 0.673, "step": 2150 }, { "epoch": 1.0201715743102249, "grad_norm": 0.06686601787805557, "learning_rate": 8.930265473713938e-05, "loss": 0.132, "step": 2200 }, { "epoch": 1.0201715743102249, "eval_loss": 0.1652621257294944, "eval_runtime": 60.883, "eval_samples_per_second": 683.048, "eval_steps_per_second": 0.673, "step": 2200 }, { "epoch": 1.0433572919081846, "grad_norm": 0.040202509611845016, "learning_rate": 8.880732321458784e-05, "loss": 0.1319, "step": 2250 }, { "epoch": 1.0433572919081846, "eval_loss": 0.1655291575008717, "eval_runtime": 60.3109, "eval_samples_per_second": 689.527, "eval_steps_per_second": 0.68, "step": 2250 }, { "epoch": 1.0665430095061441, "grad_norm": 0.0656428411602974, "learning_rate": 8.83022221559489e-05, "loss": 0.1326, "step": 2300 }, { "epoch": 1.0665430095061441, "eval_loss": 0.16431572407087935, "eval_runtime": 60.1036, "eval_samples_per_second": 691.906, "eval_steps_per_second": 0.682, "step": 2300 }, { "epoch": 1.0897287271041038, "grad_norm": 0.06945247948169708, "learning_rate": 8.778747871771292e-05, "loss": 0.1321, "step": 2350 }, { "epoch": 1.0897287271041038, "eval_loss": 0.16585482329987242, "eval_runtime": 60.6263, "eval_samples_per_second": 685.94, "eval_steps_per_second": 0.676, "step": 2350 }, { "epoch": 1.1129144447020636, "grad_norm": 0.0523492731153965, "learning_rate": 8.726322248378775e-05, "loss": 0.1317, "step": 2400 }, { "epoch": 1.1129144447020636, "eval_loss": 0.16438524923736036, "eval_runtime": 60.317, "eval_samples_per_second": 689.457, "eval_steps_per_second": 0.68, "step": 2400 }, { "epoch": 1.136100162300023, "grad_norm": 0.07777334004640579, "learning_rate": 8.672958543287666e-05, "loss": 0.1322, "step": 2450 }, { "epoch": 1.136100162300023, "eval_loss": 0.16509696053644565, "eval_runtime": 60.26, "eval_samples_per_second": 690.109, "eval_steps_per_second": 0.68, "step": 2450 }, { "epoch": 1.1592858798979828, "grad_norm": 0.06430637836456299, "learning_rate": 8.618670190525352e-05, "loss": 0.1325, "step": 2500 }, { "epoch": 1.1592858798979828, "eval_loss": 0.1639541608445008, "eval_runtime": 60.5314, "eval_samples_per_second": 687.015, "eval_steps_per_second": 0.677, "step": 2500 }, { "epoch": 1.1824715974959426, "grad_norm": 0.11194106936454773, "learning_rate": 8.563470856894316e-05, "loss": 0.1311, "step": 2550 }, { "epoch": 1.1824715974959426, "eval_loss": 0.16260699934317355, "eval_runtime": 60.3659, "eval_samples_per_second": 688.899, "eval_steps_per_second": 0.679, "step": 2550 }, { "epoch": 1.205657315093902, "grad_norm": 0.06165901944041252, "learning_rate": 8.507374438531607e-05, "loss": 0.1323, "step": 2600 }, { "epoch": 1.205657315093902, "eval_loss": 0.1626319663130242, "eval_runtime": 59.9516, "eval_samples_per_second": 693.66, "eval_steps_per_second": 0.684, "step": 2600 }, { "epoch": 1.2288430326918618, "grad_norm": 0.10654885321855545, "learning_rate": 8.450395057410561e-05, "loss": 0.1316, "step": 2650 }, { "epoch": 1.2288430326918618, "eval_loss": 0.16393000041041636, "eval_runtime": 59.576, "eval_samples_per_second": 698.032, "eval_steps_per_second": 0.688, "step": 2650 }, { "epoch": 1.2520287502898215, "grad_norm": 0.04848140478134155, "learning_rate": 8.392547057785661e-05, "loss": 0.1314, "step": 2700 }, { "epoch": 1.2520287502898215, "eval_loss": 0.16348152455768114, "eval_runtime": 60.098, "eval_samples_per_second": 691.97, "eval_steps_per_second": 0.682, "step": 2700 }, { "epoch": 1.275214467887781, "grad_norm": 0.0573604516685009, "learning_rate": 8.333845002581458e-05, "loss": 0.1314, "step": 2750 }, { "epoch": 1.275214467887781, "eval_loss": 0.16364089140116167, "eval_runtime": 60.1364, "eval_samples_per_second": 691.528, "eval_steps_per_second": 0.682, "step": 2750 }, { "epoch": 1.2984001854857408, "grad_norm": 0.053159259259700775, "learning_rate": 8.274303669726426e-05, "loss": 0.131, "step": 2800 }, { "epoch": 1.2984001854857408, "eval_loss": 0.16257415365129801, "eval_runtime": 60.0025, "eval_samples_per_second": 693.071, "eval_steps_per_second": 0.683, "step": 2800 }, { "epoch": 1.3215859030837005, "grad_norm": 0.09136148542165756, "learning_rate": 8.213938048432697e-05, "loss": 0.1313, "step": 2850 }, { "epoch": 1.3215859030837005, "eval_loss": 0.16324665471619784, "eval_runtime": 59.8429, "eval_samples_per_second": 694.92, "eval_steps_per_second": 0.685, "step": 2850 }, { "epoch": 1.34477162068166, "grad_norm": 0.05825324356555939, "learning_rate": 8.152763335422613e-05, "loss": 0.1312, "step": 2900 }, { "epoch": 1.34477162068166, "eval_loss": 0.16367374608121235, "eval_runtime": 60.219, "eval_samples_per_second": 690.579, "eval_steps_per_second": 0.681, "step": 2900 }, { "epoch": 1.3679573382796197, "grad_norm": 0.06379790604114532, "learning_rate": 8.090794931103026e-05, "loss": 0.1317, "step": 2950 }, { "epoch": 1.3679573382796197, "eval_loss": 0.16400733758786312, "eval_runtime": 59.9641, "eval_samples_per_second": 693.515, "eval_steps_per_second": 0.684, "step": 2950 }, { "epoch": 1.3911430558775795, "grad_norm": 0.05361103266477585, "learning_rate": 8.028048435688333e-05, "loss": 0.1311, "step": 3000 }, { "epoch": 1.3911430558775795, "eval_loss": 0.16210626991928834, "eval_runtime": 59.5858, "eval_samples_per_second": 697.919, "eval_steps_per_second": 0.688, "step": 3000 }, { "epoch": 1.414328773475539, "grad_norm": 0.04593402519822121, "learning_rate": 7.964539645273204e-05, "loss": 0.1304, "step": 3050 }, { "epoch": 1.414328773475539, "eval_loss": 0.163067463275087, "eval_runtime": 60.098, "eval_samples_per_second": 691.97, "eval_steps_per_second": 0.682, "step": 3050 }, { "epoch": 1.4375144910734987, "grad_norm": 0.057480327785015106, "learning_rate": 7.900284547855991e-05, "loss": 0.1307, "step": 3100 }, { "epoch": 1.4375144910734987, "eval_loss": 0.16243572043734797, "eval_runtime": 59.5674, "eval_samples_per_second": 698.133, "eval_steps_per_second": 0.688, "step": 3100 }, { "epoch": 1.4607002086714584, "grad_norm": 0.08223798871040344, "learning_rate": 7.835299319313853e-05, "loss": 0.1315, "step": 3150 }, { "epoch": 1.4607002086714584, "eval_loss": 0.1641944734489707, "eval_runtime": 59.5423, "eval_samples_per_second": 698.428, "eval_steps_per_second": 0.689, "step": 3150 }, { "epoch": 1.483885926269418, "grad_norm": 0.09742949903011322, "learning_rate": 7.769600319330552e-05, "loss": 0.1303, "step": 3200 }, { "epoch": 1.483885926269418, "eval_loss": 0.16355698856626613, "eval_runtime": 60.1946, "eval_samples_per_second": 690.859, "eval_steps_per_second": 0.681, "step": 3200 }, { "epoch": 1.5070716438673777, "grad_norm": 0.06401767581701279, "learning_rate": 7.703204087277988e-05, "loss": 0.1315, "step": 3250 }, { "epoch": 1.5070716438673777, "eval_loss": 0.16215006705140952, "eval_runtime": 59.7822, "eval_samples_per_second": 695.625, "eval_steps_per_second": 0.686, "step": 3250 }, { "epoch": 1.5302573614653374, "grad_norm": 0.07916898280382156, "learning_rate": 7.636127338052512e-05, "loss": 0.1315, "step": 3300 }, { "epoch": 1.5302573614653374, "eval_loss": 0.16288597734760557, "eval_runtime": 59.2757, "eval_samples_per_second": 701.57, "eval_steps_per_second": 0.692, "step": 3300 }, { "epoch": 1.553443079063297, "grad_norm": 0.06549016386270523, "learning_rate": 7.568386957867033e-05, "loss": 0.1303, "step": 3350 }, { "epoch": 1.553443079063297, "eval_loss": 0.16416664097655873, "eval_runtime": 59.84, "eval_samples_per_second": 694.953, "eval_steps_per_second": 0.685, "step": 3350 }, { "epoch": 1.5766287966612567, "grad_norm": 0.0709395632147789, "learning_rate": 7.500000000000001e-05, "loss": 0.1309, "step": 3400 }, { "epoch": 1.5766287966612567, "eval_loss": 0.16179194486424098, "eval_runtime": 59.8634, "eval_samples_per_second": 694.682, "eval_steps_per_second": 0.685, "step": 3400 }, { "epoch": 1.5998145142592164, "grad_norm": 0.05671363323926926, "learning_rate": 7.430983680502344e-05, "loss": 0.1307, "step": 3450 }, { "epoch": 1.5998145142592164, "eval_loss": 0.16309191886303373, "eval_runtime": 59.618, "eval_samples_per_second": 697.541, "eval_steps_per_second": 0.688, "step": 3450 }, { "epoch": 1.623000231857176, "grad_norm": 0.04889162629842758, "learning_rate": 7.361355373863414e-05, "loss": 0.1314, "step": 3500 }, { "epoch": 1.623000231857176, "eval_loss": 0.16290782983414598, "eval_runtime": 60.3904, "eval_samples_per_second": 688.619, "eval_steps_per_second": 0.679, "step": 3500 }, { "epoch": 1.6461859494551356, "grad_norm": 0.0970933735370636, "learning_rate": 7.291132608637052e-05, "loss": 0.1314, "step": 3550 }, { "epoch": 1.6461859494551356, "eval_loss": 0.16278222993823557, "eval_runtime": 59.8666, "eval_samples_per_second": 694.644, "eval_steps_per_second": 0.685, "step": 3550 }, { "epoch": 1.6693716670530954, "grad_norm": 0.056557025760412216, "learning_rate": 7.220333063028872e-05, "loss": 0.1312, "step": 3600 }, { "epoch": 1.6693716670530954, "eval_loss": 0.16313205291311117, "eval_runtime": 60.0092, "eval_samples_per_second": 692.993, "eval_steps_per_second": 0.683, "step": 3600 }, { "epoch": 1.6925573846510549, "grad_norm": 0.04870522394776344, "learning_rate": 7.148974560445859e-05, "loss": 0.1299, "step": 3650 }, { "epoch": 1.6925573846510549, "eval_loss": 0.1617941082289122, "eval_runtime": 60.1721, "eval_samples_per_second": 691.117, "eval_steps_per_second": 0.681, "step": 3650 }, { "epoch": 1.7157431022490146, "grad_norm": 0.0681833028793335, "learning_rate": 7.077075065009433e-05, "loss": 0.1304, "step": 3700 }, { "epoch": 1.7157431022490146, "eval_loss": 0.16243406602519425, "eval_runtime": 59.3626, "eval_samples_per_second": 700.542, "eval_steps_per_second": 0.691, "step": 3700 }, { "epoch": 1.7389288198469743, "grad_norm": 0.06506156921386719, "learning_rate": 7.004652677033068e-05, "loss": 0.1299, "step": 3750 }, { "epoch": 1.7389288198469743, "eval_loss": 0.16324780134312317, "eval_runtime": 59.6022, "eval_samples_per_second": 697.726, "eval_steps_per_second": 0.688, "step": 3750 }, { "epoch": 1.7621145374449338, "grad_norm": 0.06188170611858368, "learning_rate": 6.931725628465643e-05, "loss": 0.1309, "step": 3800 }, { "epoch": 1.7621145374449338, "eval_loss": 0.1623115342294882, "eval_runtime": 59.7694, "eval_samples_per_second": 695.774, "eval_steps_per_second": 0.686, "step": 3800 }, { "epoch": 1.7853002550428936, "grad_norm": 0.05675831064581871, "learning_rate": 6.858312278301637e-05, "loss": 0.1303, "step": 3850 }, { "epoch": 1.7853002550428936, "eval_loss": 0.1630547638293529, "eval_runtime": 59.779, "eval_samples_per_second": 695.662, "eval_steps_per_second": 0.686, "step": 3850 }, { "epoch": 1.8084859726408533, "grad_norm": 0.04727062210440636, "learning_rate": 6.784431107959359e-05, "loss": 0.1312, "step": 3900 }, { "epoch": 1.8084859726408533, "eval_loss": 0.1616409071893626, "eval_runtime": 59.6005, "eval_samples_per_second": 697.746, "eval_steps_per_second": 0.688, "step": 3900 }, { "epoch": 1.8316716902388128, "grad_norm": 0.06378892064094543, "learning_rate": 6.710100716628344e-05, "loss": 0.1303, "step": 3950 }, { "epoch": 1.8316716902388128, "eval_loss": 0.1622395658739077, "eval_runtime": 60.1499, "eval_samples_per_second": 691.373, "eval_steps_per_second": 0.682, "step": 3950 }, { "epoch": 1.8548574078367726, "grad_norm": 0.05470576509833336, "learning_rate": 6.635339816587109e-05, "loss": 0.1308, "step": 4000 }, { "epoch": 1.8548574078367726, "eval_loss": 0.16317236170181762, "eval_runtime": 60.014, "eval_samples_per_second": 692.939, "eval_steps_per_second": 0.683, "step": 4000 }, { "epoch": 1.8780431254347323, "grad_norm": 0.053886763751506805, "learning_rate": 6.560167228492436e-05, "loss": 0.1297, "step": 4050 }, { "epoch": 1.8780431254347323, "eval_loss": 0.16198886262197804, "eval_runtime": 60.8262, "eval_samples_per_second": 683.685, "eval_steps_per_second": 0.674, "step": 4050 }, { "epoch": 1.9012288430326918, "grad_norm": 0.054583676159381866, "learning_rate": 6.484601876641375e-05, "loss": 0.1301, "step": 4100 }, { "epoch": 1.9012288430326918, "eval_loss": 0.1616550050294764, "eval_runtime": 59.7779, "eval_samples_per_second": 695.675, "eval_steps_per_second": 0.686, "step": 4100 }, { "epoch": 1.9244145606306515, "grad_norm": 0.071171335875988, "learning_rate": 6.408662784207149e-05, "loss": 0.131, "step": 4150 }, { "epoch": 1.9244145606306515, "eval_loss": 0.15968682813566223, "eval_runtime": 60.227, "eval_samples_per_second": 690.487, "eval_steps_per_second": 0.681, "step": 4150 }, { "epoch": 1.9476002782286113, "grad_norm": 0.05775531381368637, "learning_rate": 6.332369068450174e-05, "loss": 0.1296, "step": 4200 }, { "epoch": 1.9476002782286113, "eval_loss": 0.16262199212265846, "eval_runtime": 60.3405, "eval_samples_per_second": 689.189, "eval_steps_per_second": 0.679, "step": 4200 }, { "epoch": 1.9707859958265708, "grad_norm": 0.06425776332616806, "learning_rate": 6.255739935905396e-05, "loss": 0.1299, "step": 4250 }, { "epoch": 1.9707859958265708, "eval_loss": 0.16324524366491053, "eval_runtime": 61.417, "eval_samples_per_second": 677.109, "eval_steps_per_second": 0.668, "step": 4250 }, { "epoch": 1.9939717134245305, "grad_norm": 0.045762140303850174, "learning_rate": 6.178794677547137e-05, "loss": 0.1299, "step": 4300 }, { "epoch": 1.9939717134245305, "eval_loss": 0.16053301797614244, "eval_runtime": 61.0801, "eval_samples_per_second": 680.844, "eval_steps_per_second": 0.671, "step": 4300 }, { "epoch": 2.0171574310224902, "grad_norm": 0.07060451060533524, "learning_rate": 6.1015526639327035e-05, "loss": 0.1296, "step": 4350 }, { "epoch": 2.0171574310224902, "eval_loss": 0.1620254674138633, "eval_runtime": 61.0829, "eval_samples_per_second": 680.812, "eval_steps_per_second": 0.671, "step": 4350 }, { "epoch": 2.0403431486204497, "grad_norm": 0.059919316321611404, "learning_rate": 6.024033340325954e-05, "loss": 0.1302, "step": 4400 }, { "epoch": 2.0403431486204497, "eval_loss": 0.16284223807997533, "eval_runtime": 61.5789, "eval_samples_per_second": 675.328, "eval_steps_per_second": 0.666, "step": 4400 }, { "epoch": 2.0635288662184093, "grad_norm": 0.07983385026454926, "learning_rate": 5.946256221802051e-05, "loss": 0.13, "step": 4450 }, { "epoch": 2.0635288662184093, "eval_loss": 0.16209282393788932, "eval_runtime": 61.6584, "eval_samples_per_second": 674.458, "eval_steps_per_second": 0.665, "step": 4450 }, { "epoch": 2.086714583816369, "grad_norm": 0.07582173496484756, "learning_rate": 5.868240888334653e-05, "loss": 0.1296, "step": 4500 }, { "epoch": 2.086714583816369, "eval_loss": 0.16158196377974565, "eval_runtime": 61.1826, "eval_samples_per_second": 679.703, "eval_steps_per_second": 0.67, "step": 4500 }, { "epoch": 2.1099003014143287, "grad_norm": 0.06049995869398117, "learning_rate": 5.79000697986675e-05, "loss": 0.1298, "step": 4550 }, { "epoch": 2.1099003014143287, "eval_loss": 0.16130609279963956, "eval_runtime": 61.0153, "eval_samples_per_second": 681.567, "eval_steps_per_second": 0.672, "step": 4550 }, { "epoch": 2.1330860190122882, "grad_norm": 0.0440148264169693, "learning_rate": 5.7115741913664264e-05, "loss": 0.1299, "step": 4600 }, { "epoch": 2.1330860190122882, "eval_loss": 0.16027799763638953, "eval_runtime": 61.1993, "eval_samples_per_second": 679.517, "eval_steps_per_second": 0.67, "step": 4600 }, { "epoch": 2.156271736610248, "grad_norm": 0.05254065990447998, "learning_rate": 5.6329622678687463e-05, "loss": 0.1299, "step": 4650 }, { "epoch": 2.156271736610248, "eval_loss": 0.16206274484291652, "eval_runtime": 61.4415, "eval_samples_per_second": 676.839, "eval_steps_per_second": 0.667, "step": 4650 }, { "epoch": 2.1794574542082077, "grad_norm": 0.06294432282447815, "learning_rate": 5.5541909995050554e-05, "loss": 0.1306, "step": 4700 }, { "epoch": 2.1794574542082077, "eval_loss": 0.16140170723024802, "eval_runtime": 60.8861, "eval_samples_per_second": 683.013, "eval_steps_per_second": 0.673, "step": 4700 }, { "epoch": 2.202643171806167, "grad_norm": 0.06710942089557648, "learning_rate": 5.475280216520913e-05, "loss": 0.1303, "step": 4750 }, { "epoch": 2.202643171806167, "eval_loss": 0.16245448075670843, "eval_runtime": 61.2839, "eval_samples_per_second": 678.58, "eval_steps_per_second": 0.669, "step": 4750 }, { "epoch": 2.225828889404127, "grad_norm": 0.05298132076859474, "learning_rate": 5.396249784283942e-05, "loss": 0.13, "step": 4800 }, { "epoch": 2.225828889404127, "eval_loss": 0.1623738898660767, "eval_runtime": 61.1531, "eval_samples_per_second": 680.031, "eval_steps_per_second": 0.67, "step": 4800 }, { "epoch": 2.2490146070020867, "grad_norm": 0.04066763445734978, "learning_rate": 5.317119598282823e-05, "loss": 0.1295, "step": 4850 }, { "epoch": 2.2490146070020867, "eval_loss": 0.1627438727811327, "eval_runtime": 61.0414, "eval_samples_per_second": 681.275, "eval_steps_per_second": 0.672, "step": 4850 }, { "epoch": 2.272200324600046, "grad_norm": 0.061821240931749344, "learning_rate": 5.2379095791187124e-05, "loss": 0.1299, "step": 4900 }, { "epoch": 2.272200324600046, "eval_loss": 0.16086717177928397, "eval_runtime": 60.7945, "eval_samples_per_second": 684.042, "eval_steps_per_second": 0.674, "step": 4900 }, { "epoch": 2.295386042198006, "grad_norm": 0.08038394153118134, "learning_rate": 5.158639667490339e-05, "loss": 0.13, "step": 4950 }, { "epoch": 2.295386042198006, "eval_loss": 0.16221664317086187, "eval_runtime": 61.6742, "eval_samples_per_second": 674.285, "eval_steps_per_second": 0.665, "step": 4950 }, { "epoch": 2.3185717597959656, "grad_norm": 0.0556926503777504, "learning_rate": 5.0793298191740404e-05, "loss": 0.1311, "step": 5000 }, { "epoch": 2.3185717597959656, "eval_loss": 0.16015339844546791, "eval_runtime": 61.3657, "eval_samples_per_second": 677.675, "eval_steps_per_second": 0.668, "step": 5000 }, { "epoch": 2.3417574773939256, "grad_norm": 0.06645477563142776, "learning_rate": 5e-05, "loss": 0.1284, "step": 5050 }, { "epoch": 2.3417574773939256, "eval_loss": 0.16160674186313023, "eval_runtime": 61.4737, "eval_samples_per_second": 676.484, "eval_steps_per_second": 0.667, "step": 5050 }, { "epoch": 2.364943194991885, "grad_norm": 0.05365500971674919, "learning_rate": 4.92067018082596e-05, "loss": 0.13, "step": 5100 }, { "epoch": 2.364943194991885, "eval_loss": 0.16016484459556096, "eval_runtime": 61.4058, "eval_samples_per_second": 677.232, "eval_steps_per_second": 0.668, "step": 5100 }, { "epoch": 2.3881289125898446, "grad_norm": 0.0499204620718956, "learning_rate": 4.841360332509663e-05, "loss": 0.129, "step": 5150 }, { "epoch": 2.3881289125898446, "eval_loss": 0.16054727464378063, "eval_runtime": 61.1539, "eval_samples_per_second": 680.023, "eval_steps_per_second": 0.67, "step": 5150 }, { "epoch": 2.411314630187804, "grad_norm": 0.07284457236528397, "learning_rate": 4.762090420881289e-05, "loss": 0.129, "step": 5200 }, { "epoch": 2.411314630187804, "eval_loss": 0.16057287778830004, "eval_runtime": 60.5785, "eval_samples_per_second": 686.481, "eval_steps_per_second": 0.677, "step": 5200 }, { "epoch": 2.434500347785764, "grad_norm": 0.06511891633272171, "learning_rate": 4.6828804017171776e-05, "loss": 0.1297, "step": 5250 }, { "epoch": 2.434500347785764, "eval_loss": 0.16202011190836896, "eval_runtime": 61.4053, "eval_samples_per_second": 677.238, "eval_steps_per_second": 0.668, "step": 5250 }, { "epoch": 2.4576860653837236, "grad_norm": 0.05936937406659126, "learning_rate": 4.603750215716057e-05, "loss": 0.1293, "step": 5300 }, { "epoch": 2.4576860653837236, "eval_loss": 0.16067086041480225, "eval_runtime": 60.4469, "eval_samples_per_second": 687.976, "eval_steps_per_second": 0.678, "step": 5300 }, { "epoch": 2.480871782981683, "grad_norm": 0.039836496114730835, "learning_rate": 4.5247197834790876e-05, "loss": 0.1288, "step": 5350 }, { "epoch": 2.480871782981683, "eval_loss": 0.1614640227625451, "eval_runtime": 60.9513, "eval_samples_per_second": 682.283, "eval_steps_per_second": 0.673, "step": 5350 }, { "epoch": 2.504057500579643, "grad_norm": 0.04305760934948921, "learning_rate": 4.445809000494946e-05, "loss": 0.1294, "step": 5400 }, { "epoch": 2.504057500579643, "eval_loss": 0.16139181990447046, "eval_runtime": 60.6766, "eval_samples_per_second": 685.371, "eval_steps_per_second": 0.676, "step": 5400 }, { "epoch": 2.5272432181776026, "grad_norm": 0.06780368089675903, "learning_rate": 4.3670377321312535e-05, "loss": 0.1285, "step": 5450 }, { "epoch": 2.5272432181776026, "eval_loss": 0.1619736397134425, "eval_runtime": 60.7281, "eval_samples_per_second": 684.79, "eval_steps_per_second": 0.675, "step": 5450 }, { "epoch": 2.550428935775562, "grad_norm": 0.052273835986852646, "learning_rate": 4.288425808633575e-05, "loss": 0.1303, "step": 5500 }, { "epoch": 2.550428935775562, "eval_loss": 0.16178818674979198, "eval_runtime": 60.8875, "eval_samples_per_second": 682.997, "eval_steps_per_second": 0.673, "step": 5500 }, { "epoch": 2.573614653373522, "grad_norm": 0.045574627816677094, "learning_rate": 4.20999302013325e-05, "loss": 0.1291, "step": 5550 }, { "epoch": 2.573614653373522, "eval_loss": 0.16034006952877458, "eval_runtime": 60.7378, "eval_samples_per_second": 684.681, "eval_steps_per_second": 0.675, "step": 5550 }, { "epoch": 2.5968003709714815, "grad_norm": 0.044092051684856415, "learning_rate": 4.131759111665349e-05, "loss": 0.1298, "step": 5600 }, { "epoch": 2.5968003709714815, "eval_loss": 0.16090484909780667, "eval_runtime": 60.4675, "eval_samples_per_second": 687.741, "eval_steps_per_second": 0.678, "step": 5600 }, { "epoch": 2.6199860885694415, "grad_norm": 0.05473971739411354, "learning_rate": 4.0537437781979506e-05, "loss": 0.1288, "step": 5650 }, { "epoch": 2.6199860885694415, "eval_loss": 0.1604315377337276, "eval_runtime": 62.8239, "eval_samples_per_second": 661.946, "eval_steps_per_second": 0.653, "step": 5650 }, { "epoch": 2.643171806167401, "grad_norm": 0.07100555300712585, "learning_rate": 3.9759666596740476e-05, "loss": 0.129, "step": 5700 }, { "epoch": 2.643171806167401, "eval_loss": 0.15997494100305837, "eval_runtime": 61.3008, "eval_samples_per_second": 678.392, "eval_steps_per_second": 0.669, "step": 5700 }, { "epoch": 2.6663575237653605, "grad_norm": 0.04020215570926666, "learning_rate": 3.898447336067297e-05, "loss": 0.1291, "step": 5750 }, { "epoch": 2.6663575237653605, "eval_loss": 0.1596748490832133, "eval_runtime": 60.6148, "eval_samples_per_second": 686.07, "eval_steps_per_second": 0.676, "step": 5750 }, { "epoch": 2.68954324136332, "grad_norm": 0.05526584014296532, "learning_rate": 3.821205322452863e-05, "loss": 0.1291, "step": 5800 }, { "epoch": 2.68954324136332, "eval_loss": 0.16091962633426782, "eval_runtime": 60.1717, "eval_samples_per_second": 691.122, "eval_steps_per_second": 0.681, "step": 5800 }, { "epoch": 2.71272895896128, "grad_norm": 0.052167922258377075, "learning_rate": 3.744260064094604e-05, "loss": 0.129, "step": 5850 }, { "epoch": 2.71272895896128, "eval_loss": 0.16112806362615253, "eval_runtime": 60.1273, "eval_samples_per_second": 691.633, "eval_steps_per_second": 0.682, "step": 5850 }, { "epoch": 2.7359146765592395, "grad_norm": 0.054320793598890305, "learning_rate": 3.6676309315498256e-05, "loss": 0.13, "step": 5900 }, { "epoch": 2.7359146765592395, "eval_loss": 0.15996250695505343, "eval_runtime": 60.655, "eval_samples_per_second": 685.616, "eval_steps_per_second": 0.676, "step": 5900 }, { "epoch": 2.7591003941571994, "grad_norm": 0.05470626428723335, "learning_rate": 3.591337215792852e-05, "loss": 0.1296, "step": 5950 }, { "epoch": 2.7591003941571994, "eval_loss": 0.16025288890609335, "eval_runtime": 60.826, "eval_samples_per_second": 683.688, "eval_steps_per_second": 0.674, "step": 5950 }, { "epoch": 2.782286111755159, "grad_norm": 0.04805810749530792, "learning_rate": 3.515398123358627e-05, "loss": 0.1294, "step": 6000 }, { "epoch": 2.782286111755159, "eval_loss": 0.15918263724182835, "eval_runtime": 60.2321, "eval_samples_per_second": 690.429, "eval_steps_per_second": 0.681, "step": 6000 }, { "epoch": 2.8054718293531185, "grad_norm": 0.04185302183032036, "learning_rate": 3.439832771507565e-05, "loss": 0.1283, "step": 6050 }, { "epoch": 2.8054718293531185, "eval_loss": 0.16179385240233157, "eval_runtime": 60.9176, "eval_samples_per_second": 682.66, "eval_steps_per_second": 0.673, "step": 6050 }, { "epoch": 2.828657546951078, "grad_norm": 0.04609336704015732, "learning_rate": 3.364660183412892e-05, "loss": 0.1292, "step": 6100 }, { "epoch": 2.828657546951078, "eval_loss": 0.1611929898635588, "eval_runtime": 60.5916, "eval_samples_per_second": 686.333, "eval_steps_per_second": 0.677, "step": 6100 }, { "epoch": 2.851843264549038, "grad_norm": 0.05404876172542572, "learning_rate": 3.289899283371657e-05, "loss": 0.128, "step": 6150 }, { "epoch": 2.851843264549038, "eval_loss": 0.16039360794951976, "eval_runtime": 60.5961, "eval_samples_per_second": 686.282, "eval_steps_per_second": 0.677, "step": 6150 }, { "epoch": 2.8750289821469974, "grad_norm": 0.06787659227848053, "learning_rate": 3.215568892040641e-05, "loss": 0.1288, "step": 6200 }, { "epoch": 2.8750289821469974, "eval_loss": 0.16113480515361805, "eval_runtime": 60.2775, "eval_samples_per_second": 689.909, "eval_steps_per_second": 0.68, "step": 6200 }, { "epoch": 2.8982146997449574, "grad_norm": 0.06937435269355774, "learning_rate": 3.141687721698363e-05, "loss": 0.1283, "step": 6250 }, { "epoch": 2.8982146997449574, "eval_loss": 0.16087572214972407, "eval_runtime": 60.6789, "eval_samples_per_second": 685.345, "eval_steps_per_second": 0.676, "step": 6250 }, { "epoch": 2.921400417342917, "grad_norm": 0.08074232190847397, "learning_rate": 3.0682743715343564e-05, "loss": 0.1292, "step": 6300 }, { "epoch": 2.921400417342917, "eval_loss": 0.16049740787316144, "eval_runtime": 60.3194, "eval_samples_per_second": 689.43, "eval_steps_per_second": 0.68, "step": 6300 }, { "epoch": 2.9445861349408764, "grad_norm": 0.03976515680551529, "learning_rate": 2.9953473229669328e-05, "loss": 0.1302, "step": 6350 }, { "epoch": 2.9445861349408764, "eval_loss": 0.16023700059761273, "eval_runtime": 60.8537, "eval_samples_per_second": 683.377, "eval_steps_per_second": 0.674, "step": 6350 }, { "epoch": 2.967771852538836, "grad_norm": 0.05303976684808731, "learning_rate": 2.9229249349905684e-05, "loss": 0.1285, "step": 6400 }, { "epoch": 2.967771852538836, "eval_loss": 0.1601465398516622, "eval_runtime": 60.6472, "eval_samples_per_second": 685.703, "eval_steps_per_second": 0.676, "step": 6400 }, { "epoch": 2.990957570136796, "grad_norm": 0.0519745759665966, "learning_rate": 2.851025439554142e-05, "loss": 0.1286, "step": 6450 }, { "epoch": 2.990957570136796, "eval_loss": 0.16085429229133483, "eval_runtime": 60.2507, "eval_samples_per_second": 690.216, "eval_steps_per_second": 0.68, "step": 6450 }, { "epoch": 3.0141432877347554, "grad_norm": 0.050518251955509186, "learning_rate": 2.7796669369711294e-05, "loss": 0.1301, "step": 6500 }, { "epoch": 3.0141432877347554, "eval_loss": 0.16015394660421692, "eval_runtime": 60.5015, "eval_samples_per_second": 687.355, "eval_steps_per_second": 0.678, "step": 6500 }, { "epoch": 3.037329005332715, "grad_norm": 0.04253960773348808, "learning_rate": 2.708867391362948e-05, "loss": 0.1296, "step": 6550 }, { "epoch": 3.037329005332715, "eval_loss": 0.1597283595131218, "eval_runtime": 60.13, "eval_samples_per_second": 691.601, "eval_steps_per_second": 0.682, "step": 6550 }, { "epoch": 3.060514722930675, "grad_norm": 0.06899340450763702, "learning_rate": 2.638644626136587e-05, "loss": 0.1291, "step": 6600 }, { "epoch": 3.060514722930675, "eval_loss": 0.1604277250117246, "eval_runtime": 60.4618, "eval_samples_per_second": 687.806, "eval_steps_per_second": 0.678, "step": 6600 }, { "epoch": 3.0837004405286343, "grad_norm": 0.06556117534637451, "learning_rate": 2.5690163194976575e-05, "loss": 0.1288, "step": 6650 }, { "epoch": 3.0837004405286343, "eval_loss": 0.15953636330193482, "eval_runtime": 60.2757, "eval_samples_per_second": 689.93, "eval_steps_per_second": 0.68, "step": 6650 }, { "epoch": 3.106886158126594, "grad_norm": 0.03685734421014786, "learning_rate": 2.500000000000001e-05, "loss": 0.129, "step": 6700 }, { "epoch": 3.106886158126594, "eval_loss": 0.159308270335797, "eval_runtime": 60.624, "eval_samples_per_second": 685.966, "eval_steps_per_second": 0.676, "step": 6700 }, { "epoch": 3.130071875724554, "grad_norm": 0.0451020672917366, "learning_rate": 2.4316130421329697e-05, "loss": 0.1286, "step": 6750 }, { "epoch": 3.130071875724554, "eval_loss": 0.15995884031774596, "eval_runtime": 60.3654, "eval_samples_per_second": 688.905, "eval_steps_per_second": 0.679, "step": 6750 }, { "epoch": 3.1532575933225133, "grad_norm": 0.0495733842253685, "learning_rate": 2.363872661947488e-05, "loss": 0.1293, "step": 6800 }, { "epoch": 3.1532575933225133, "eval_loss": 0.15987331824692497, "eval_runtime": 60.4636, "eval_samples_per_second": 687.786, "eval_steps_per_second": 0.678, "step": 6800 }, { "epoch": 3.176443310920473, "grad_norm": 0.05756652355194092, "learning_rate": 2.296795912722014e-05, "loss": 0.1289, "step": 6850 }, { "epoch": 3.176443310920473, "eval_loss": 0.15986134614331013, "eval_runtime": 61.0063, "eval_samples_per_second": 681.667, "eval_steps_per_second": 0.672, "step": 6850 }, { "epoch": 3.199629028518433, "grad_norm": 0.0467820018529892, "learning_rate": 2.2303996806694488e-05, "loss": 0.1295, "step": 6900 }, { "epoch": 3.199629028518433, "eval_loss": 0.16011030076900337, "eval_runtime": 60.1041, "eval_samples_per_second": 691.9, "eval_steps_per_second": 0.682, "step": 6900 }, { "epoch": 3.2228147461163923, "grad_norm": 0.04179982468485832, "learning_rate": 2.164700680686147e-05, "loss": 0.1287, "step": 6950 }, { "epoch": 3.2228147461163923, "eval_loss": 0.15917751068552838, "eval_runtime": 60.5321, "eval_samples_per_second": 687.007, "eval_steps_per_second": 0.677, "step": 6950 }, { "epoch": 3.246000463714352, "grad_norm": 0.053910572081804276, "learning_rate": 2.09971545214401e-05, "loss": 0.1286, "step": 7000 }, { "epoch": 3.246000463714352, "eval_loss": 0.15998092838627764, "eval_runtime": 60.4067, "eval_samples_per_second": 688.434, "eval_steps_per_second": 0.679, "step": 7000 }, { "epoch": 3.2691861813123118, "grad_norm": 0.04404950886964798, "learning_rate": 2.0354603547267985e-05, "loss": 0.1283, "step": 7050 }, { "epoch": 3.2691861813123118, "eval_loss": 0.1597617331551387, "eval_runtime": 60.4218, "eval_samples_per_second": 688.262, "eval_steps_per_second": 0.679, "step": 7050 }, { "epoch": 3.2923718989102713, "grad_norm": 0.04763752967119217, "learning_rate": 1.9719515643116674e-05, "loss": 0.1288, "step": 7100 }, { "epoch": 3.2923718989102713, "eval_loss": 0.16116006530852447, "eval_runtime": 60.2132, "eval_samples_per_second": 690.646, "eval_steps_per_second": 0.681, "step": 7100 }, { "epoch": 3.3155576165082308, "grad_norm": 0.049567196518182755, "learning_rate": 1.9092050688969738e-05, "loss": 0.1298, "step": 7150 }, { "epoch": 3.3155576165082308, "eval_loss": 0.15965543804361845, "eval_runtime": 60.3928, "eval_samples_per_second": 688.592, "eval_steps_per_second": 0.679, "step": 7150 }, { "epoch": 3.3387433341061907, "grad_norm": 0.05488676205277443, "learning_rate": 1.847236664577389e-05, "loss": 0.1284, "step": 7200 }, { "epoch": 3.3387433341061907, "eval_loss": 0.16050384662882064, "eval_runtime": 60.121, "eval_samples_per_second": 691.705, "eval_steps_per_second": 0.682, "step": 7200 }, { "epoch": 3.3619290517041502, "grad_norm": 0.04124298691749573, "learning_rate": 1.7860619515673033e-05, "loss": 0.1289, "step": 7250 }, { "epoch": 3.3619290517041502, "eval_loss": 0.16054145931691394, "eval_runtime": 60.2046, "eval_samples_per_second": 690.745, "eval_steps_per_second": 0.681, "step": 7250 }, { "epoch": 3.3851147693021097, "grad_norm": 0.04400424286723137, "learning_rate": 1.725696330273575e-05, "loss": 0.1289, "step": 7300 }, { "epoch": 3.3851147693021097, "eval_loss": 0.15999099129576416, "eval_runtime": 60.4869, "eval_samples_per_second": 687.52, "eval_steps_per_second": 0.678, "step": 7300 }, { "epoch": 3.4083004869000697, "grad_norm": 0.05488509312272072, "learning_rate": 1.6661549974185424e-05, "loss": 0.1285, "step": 7350 }, { "epoch": 3.4083004869000697, "eval_loss": 0.16051823730892306, "eval_runtime": 60.1981, "eval_samples_per_second": 690.819, "eval_steps_per_second": 0.681, "step": 7350 }, { "epoch": 3.431486204498029, "grad_norm": 0.06722457706928253, "learning_rate": 1.60745294221434e-05, "loss": 0.1286, "step": 7400 }, { "epoch": 3.431486204498029, "eval_loss": 0.1610307768591294, "eval_runtime": 60.7755, "eval_samples_per_second": 684.256, "eval_steps_per_second": 0.675, "step": 7400 }, { "epoch": 3.4546719220959887, "grad_norm": 0.04814394935965538, "learning_rate": 1.549604942589441e-05, "loss": 0.1278, "step": 7450 }, { "epoch": 3.4546719220959887, "eval_loss": 0.1598065741965525, "eval_runtime": 59.9968, "eval_samples_per_second": 693.136, "eval_steps_per_second": 0.683, "step": 7450 }, { "epoch": 3.4778576396939487, "grad_norm": 0.04934167116880417, "learning_rate": 1.4926255614683932e-05, "loss": 0.1274, "step": 7500 }, { "epoch": 3.4778576396939487, "eval_loss": 0.15982454893723042, "eval_runtime": 60.201, "eval_samples_per_second": 690.786, "eval_steps_per_second": 0.681, "step": 7500 }, { "epoch": 3.501043357291908, "grad_norm": 0.04529615864157677, "learning_rate": 1.4365291431056871e-05, "loss": 0.1297, "step": 7550 }, { "epoch": 3.501043357291908, "eval_loss": 0.15986133524024926, "eval_runtime": 59.95, "eval_samples_per_second": 693.678, "eval_steps_per_second": 0.684, "step": 7550 }, { "epoch": 3.5242290748898677, "grad_norm": 0.0399620421230793, "learning_rate": 1.3813298094746491e-05, "loss": 0.1288, "step": 7600 }, { "epoch": 3.5242290748898677, "eval_loss": 0.15905609221590689, "eval_runtime": 61.181, "eval_samples_per_second": 679.72, "eval_steps_per_second": 0.67, "step": 7600 }, { "epoch": 3.5474147924878277, "grad_norm": 0.05973295867443085, "learning_rate": 1.327041456712334e-05, "loss": 0.1281, "step": 7650 }, { "epoch": 3.5474147924878277, "eval_loss": 0.15981091550942805, "eval_runtime": 60.5605, "eval_samples_per_second": 686.685, "eval_steps_per_second": 0.677, "step": 7650 }, { "epoch": 3.570600510085787, "grad_norm": 0.04896661266684532, "learning_rate": 1.2736777516212266e-05, "loss": 0.1288, "step": 7700 }, { "epoch": 3.570600510085787, "eval_loss": 0.1599924400443614, "eval_runtime": 60.486, "eval_samples_per_second": 687.531, "eval_steps_per_second": 0.678, "step": 7700 }, { "epoch": 3.5937862276837467, "grad_norm": 0.07458525151014328, "learning_rate": 1.2212521282287092e-05, "loss": 0.128, "step": 7750 }, { "epoch": 3.5937862276837467, "eval_loss": 0.15936126278835275, "eval_runtime": 60.9341, "eval_samples_per_second": 682.475, "eval_steps_per_second": 0.673, "step": 7750 }, { "epoch": 3.6169719452817066, "grad_norm": 0.04200127348303795, "learning_rate": 1.1697777844051105e-05, "loss": 0.1287, "step": 7800 }, { "epoch": 3.6169719452817066, "eval_loss": 0.1603394617678833, "eval_runtime": 60.5155, "eval_samples_per_second": 687.195, "eval_steps_per_second": 0.678, "step": 7800 }, { "epoch": 3.640157662879666, "grad_norm": 0.06712640821933746, "learning_rate": 1.1192676785412154e-05, "loss": 0.1291, "step": 7850 }, { "epoch": 3.640157662879666, "eval_loss": 0.15920067938345067, "eval_runtime": 60.0225, "eval_samples_per_second": 692.84, "eval_steps_per_second": 0.683, "step": 7850 }, { "epoch": 3.6633433804776256, "grad_norm": 0.049462996423244476, "learning_rate": 1.0697345262860636e-05, "loss": 0.1287, "step": 7900 }, { "epoch": 3.6633433804776256, "eval_loss": 0.15964593569874527, "eval_runtime": 60.1965, "eval_samples_per_second": 690.837, "eval_steps_per_second": 0.681, "step": 7900 }, { "epoch": 3.6865290980755856, "grad_norm": 0.05148932337760925, "learning_rate": 1.021190797345839e-05, "loss": 0.1283, "step": 7950 }, { "epoch": 3.6865290980755856, "eval_loss": 0.15903354419354673, "eval_runtime": 60.0507, "eval_samples_per_second": 692.515, "eval_steps_per_second": 0.683, "step": 7950 }, { "epoch": 3.709714815673545, "grad_norm": 0.05164024233818054, "learning_rate": 9.73648712344707e-06, "loss": 0.128, "step": 8000 }, { "epoch": 3.709714815673545, "eval_loss": 0.15835035051131605, "eval_runtime": 60.5739, "eval_samples_per_second": 686.533, "eval_steps_per_second": 0.677, "step": 8000 }, { "epoch": 3.7329005332715046, "grad_norm": 0.04926716163754463, "learning_rate": 9.271202397483215e-06, "loss": 0.1276, "step": 8050 }, { "epoch": 3.7329005332715046, "eval_loss": 0.160225615529793, "eval_runtime": 60.4555, "eval_samples_per_second": 687.878, "eval_steps_per_second": 0.678, "step": 8050 }, { "epoch": 3.7560862508694646, "grad_norm": 0.04355842247605324, "learning_rate": 8.816170928508365e-06, "loss": 0.1287, "step": 8100 }, { "epoch": 3.7560862508694646, "eval_loss": 0.1601867779420742, "eval_runtime": 60.7386, "eval_samples_per_second": 684.672, "eval_steps_per_second": 0.675, "step": 8100 }, { "epoch": 3.779271968467424, "grad_norm": 0.039105553179979324, "learning_rate": 8.371507268261437e-06, "loss": 0.1306, "step": 8150 }, { "epoch": 3.779271968467424, "eval_loss": 0.15946348937187382, "eval_runtime": 60.9253, "eval_samples_per_second": 682.574, "eval_steps_per_second": 0.673, "step": 8150 }, { "epoch": 3.8024576860653836, "grad_norm": 0.04452899843454361, "learning_rate": 7.937323358440935e-06, "loss": 0.1286, "step": 8200 }, { "epoch": 3.8024576860653836, "eval_loss": 0.15871429728364056, "eval_runtime": 60.2776, "eval_samples_per_second": 689.908, "eval_steps_per_second": 0.68, "step": 8200 }, { "epoch": 3.8256434036633435, "grad_norm": 0.043075498193502426, "learning_rate": 7.513728502524286e-06, "loss": 0.1292, "step": 8250 }, { "epoch": 3.8256434036633435, "eval_loss": 0.1592580359542711, "eval_runtime": 60.7244, "eval_samples_per_second": 684.832, "eval_steps_per_second": 0.675, "step": 8250 }, { "epoch": 3.848829121261303, "grad_norm": 0.05848800390958786, "learning_rate": 7.100829338251147e-06, "loss": 0.1275, "step": 8300 }, { "epoch": 3.848829121261303, "eval_loss": 0.15895083163665807, "eval_runtime": 60.3677, "eval_samples_per_second": 688.878, "eval_steps_per_second": 0.679, "step": 8300 }, { "epoch": 3.8720148388592626, "grad_norm": 0.04980336129665375, "learning_rate": 6.698729810778065e-06, "loss": 0.1277, "step": 8350 }, { "epoch": 3.8720148388592626, "eval_loss": 0.16002303550437, "eval_runtime": 60.2742, "eval_samples_per_second": 689.947, "eval_steps_per_second": 0.68, "step": 8350 }, { "epoch": 3.8952005564572225, "grad_norm": 0.057385146617889404, "learning_rate": 6.3075311465107535e-06, "loss": 0.129, "step": 8400 }, { "epoch": 3.8952005564572225, "eval_loss": 0.1601535826416112, "eval_runtime": 60.4053, "eval_samples_per_second": 688.45, "eval_steps_per_second": 0.679, "step": 8400 }, { "epoch": 3.918386274055182, "grad_norm": 0.045788682997226715, "learning_rate": 5.927331827620903e-06, "loss": 0.1286, "step": 8450 }, { "epoch": 3.918386274055182, "eval_loss": 0.15926720973175468, "eval_runtime": 60.6783, "eval_samples_per_second": 685.352, "eval_steps_per_second": 0.676, "step": 8450 }, { "epoch": 3.9415719916531415, "grad_norm": 0.045575451105833054, "learning_rate": 5.558227567253832e-06, "loss": 0.1281, "step": 8500 }, { "epoch": 3.9415719916531415, "eval_loss": 0.16032033338606583, "eval_runtime": 60.4563, "eval_samples_per_second": 687.868, "eval_steps_per_second": 0.678, "step": 8500 }, { "epoch": 3.964757709251101, "grad_norm": 0.034972067922353745, "learning_rate": 5.200311285433213e-06, "loss": 0.1285, "step": 8550 }, { "epoch": 3.964757709251101, "eval_loss": 0.1590997571686103, "eval_runtime": 60.7642, "eval_samples_per_second": 684.384, "eval_steps_per_second": 0.675, "step": 8550 }, { "epoch": 3.987943426849061, "grad_norm": 0.05060684680938721, "learning_rate": 4.853673085668947e-06, "loss": 0.1293, "step": 8600 }, { "epoch": 3.987943426849061, "eval_loss": 0.15924322809570868, "eval_runtime": 60.0799, "eval_samples_per_second": 692.178, "eval_steps_per_second": 0.682, "step": 8600 }, { "epoch": 4.011129144447021, "grad_norm": 0.04898017644882202, "learning_rate": 4.5184002322740785e-06, "loss": 0.1283, "step": 8650 }, { "epoch": 4.011129144447021, "eval_loss": 0.1587491140112498, "eval_runtime": 60.6393, "eval_samples_per_second": 685.793, "eval_steps_per_second": 0.676, "step": 8650 }, { "epoch": 4.0343148620449805, "grad_norm": 0.058361586183309555, "learning_rate": 4.19457712839652e-06, "loss": 0.1277, "step": 8700 }, { "epoch": 4.0343148620449805, "eval_loss": 0.1597737118597627, "eval_runtime": 61.5486, "eval_samples_per_second": 675.661, "eval_steps_per_second": 0.666, "step": 8700 }, { "epoch": 4.05750057964294, "grad_norm": 0.05138258635997772, "learning_rate": 3.8822852947709375e-06, "loss": 0.1283, "step": 8750 }, { "epoch": 4.05750057964294, "eval_loss": 0.15985116115580386, "eval_runtime": 60.5634, "eval_samples_per_second": 686.652, "eval_steps_per_second": 0.677, "step": 8750 }, { "epoch": 4.0806862972408995, "grad_norm": 0.0461881086230278, "learning_rate": 3.581603349196372e-06, "loss": 0.1288, "step": 8800 }, { "epoch": 4.0806862972408995, "eval_loss": 0.15788726458515429, "eval_runtime": 60.6057, "eval_samples_per_second": 686.173, "eval_steps_per_second": 0.677, "step": 8800 }, { "epoch": 4.103872014838859, "grad_norm": 0.0618111789226532, "learning_rate": 3.2926069867446675e-06, "loss": 0.1287, "step": 8850 }, { "epoch": 4.103872014838859, "eval_loss": 0.15881183094974458, "eval_runtime": 60.3747, "eval_samples_per_second": 688.799, "eval_steps_per_second": 0.679, "step": 8850 }, { "epoch": 4.1270577324368185, "grad_norm": 0.04804789274930954, "learning_rate": 3.0153689607045845e-06, "loss": 0.1294, "step": 8900 }, { "epoch": 4.1270577324368185, "eval_loss": 0.1607356553004913, "eval_runtime": 60.6979, "eval_samples_per_second": 685.131, "eval_steps_per_second": 0.675, "step": 8900 }, { "epoch": 4.150243450034779, "grad_norm": 0.04835003986954689, "learning_rate": 2.7499590642665774e-06, "loss": 0.1277, "step": 8950 }, { "epoch": 4.150243450034779, "eval_loss": 0.1598761189516689, "eval_runtime": 61.2003, "eval_samples_per_second": 679.507, "eval_steps_per_second": 0.67, "step": 8950 }, { "epoch": 4.173429167632738, "grad_norm": 0.05750919133424759, "learning_rate": 2.496444112952734e-06, "loss": 0.1285, "step": 9000 }, { "epoch": 4.173429167632738, "eval_loss": 0.15946166188972705, "eval_runtime": 60.6795, "eval_samples_per_second": 685.339, "eval_steps_per_second": 0.676, "step": 9000 }, { "epoch": 4.196614885230698, "grad_norm": 0.06801807135343552, "learning_rate": 2.2548879277963064e-06, "loss": 0.1289, "step": 9050 }, { "epoch": 4.196614885230698, "eval_loss": 0.1609577221237089, "eval_runtime": 61.0186, "eval_samples_per_second": 681.53, "eval_steps_per_second": 0.672, "step": 9050 }, { "epoch": 4.219800602828657, "grad_norm": 0.04383298382163048, "learning_rate": 2.0253513192751373e-06, "loss": 0.1289, "step": 9100 }, { "epoch": 4.219800602828657, "eval_loss": 0.1598739506376352, "eval_runtime": 60.6256, "eval_samples_per_second": 685.948, "eval_steps_per_second": 0.676, "step": 9100 }, { "epoch": 4.242986320426617, "grad_norm": 0.044339120388031006, "learning_rate": 1.807892072002898e-06, "loss": 0.1283, "step": 9150 }, { "epoch": 4.242986320426617, "eval_loss": 0.158920794598519, "eval_runtime": 60.5454, "eval_samples_per_second": 686.856, "eval_steps_per_second": 0.677, "step": 9150 }, { "epoch": 4.2661720380245765, "grad_norm": 0.04090524837374687, "learning_rate": 1.6025649301821876e-06, "loss": 0.1282, "step": 9200 }, { "epoch": 4.2661720380245765, "eval_loss": 0.1596859048948022, "eval_runtime": 60.7716, "eval_samples_per_second": 684.3, "eval_steps_per_second": 0.675, "step": 9200 }, { "epoch": 4.289357755622537, "grad_norm": 0.042642634361982346, "learning_rate": 1.4094215838229176e-06, "loss": 0.1286, "step": 9250 }, { "epoch": 4.289357755622537, "eval_loss": 0.16079005979316527, "eval_runtime": 60.6239, "eval_samples_per_second": 685.967, "eval_steps_per_second": 0.676, "step": 9250 }, { "epoch": 4.312543473220496, "grad_norm": 0.04924129322171211, "learning_rate": 1.2285106557296477e-06, "loss": 0.1287, "step": 9300 }, { "epoch": 4.312543473220496, "eval_loss": 0.16084020796323667, "eval_runtime": 60.2581, "eval_samples_per_second": 690.131, "eval_steps_per_second": 0.68, "step": 9300 }, { "epoch": 4.335729190818456, "grad_norm": 0.04222133755683899, "learning_rate": 1.0598776892610685e-06, "loss": 0.1287, "step": 9350 }, { "epoch": 4.335729190818456, "eval_loss": 0.16020395921618655, "eval_runtime": 60.5816, "eval_samples_per_second": 686.446, "eval_steps_per_second": 0.677, "step": 9350 }, { "epoch": 4.358914908416415, "grad_norm": 0.05593874678015709, "learning_rate": 9.035651368646648e-07, "loss": 0.1286, "step": 9400 }, { "epoch": 4.358914908416415, "eval_loss": 0.15957607987630548, "eval_runtime": 60.8726, "eval_samples_per_second": 683.164, "eval_steps_per_second": 0.674, "step": 9400 }, { "epoch": 4.382100626014375, "grad_norm": 0.059049129486083984, "learning_rate": 7.596123493895991e-07, "loss": 0.1289, "step": 9450 }, { "epoch": 4.382100626014375, "eval_loss": 0.15975197211451994, "eval_runtime": 60.6704, "eval_samples_per_second": 685.441, "eval_steps_per_second": 0.676, "step": 9450 }, { "epoch": 4.405286343612334, "grad_norm": 0.053555767983198166, "learning_rate": 6.280555661802856e-07, "loss": 0.1286, "step": 9500 }, { "epoch": 4.405286343612334, "eval_loss": 0.16117730336557945, "eval_runtime": 61.9214, "eval_samples_per_second": 671.593, "eval_steps_per_second": 0.662, "step": 9500 }, { "epoch": 4.428472061210295, "grad_norm": 0.04488294571638107, "learning_rate": 5.089279059533658e-07, "loss": 0.1281, "step": 9550 }, { "epoch": 4.428472061210295, "eval_loss": 0.15896389365558133, "eval_runtime": 62.0889, "eval_samples_per_second": 669.782, "eval_steps_per_second": 0.66, "step": 9550 }, { "epoch": 4.451657778808254, "grad_norm": 0.044143371284008026, "learning_rate": 4.02259358460233e-07, "loss": 0.1276, "step": 9600 }, { "epoch": 4.451657778808254, "eval_loss": 0.15880485748262804, "eval_runtime": 61.9007, "eval_samples_per_second": 671.818, "eval_steps_per_second": 0.662, "step": 9600 }, { "epoch": 4.474843496406214, "grad_norm": 0.054890409111976624, "learning_rate": 3.080767769372939e-07, "loss": 0.1289, "step": 9650 }, { "epoch": 4.474843496406214, "eval_loss": 0.15899979579394047, "eval_runtime": 61.7264, "eval_samples_per_second": 673.714, "eval_steps_per_second": 0.664, "step": 9650 }, { "epoch": 4.498029214004173, "grad_norm": 0.04276006668806076, "learning_rate": 2.2640387134577058e-07, "loss": 0.1284, "step": 9700 }, { "epoch": 4.498029214004173, "eval_loss": 0.1587265635928511, "eval_runtime": 61.254, "eval_samples_per_second": 678.911, "eval_steps_per_second": 0.669, "step": 9700 }, { "epoch": 4.521214931602133, "grad_norm": 0.04374442994594574, "learning_rate": 1.5726120240288634e-07, "loss": 0.1284, "step": 9750 }, { "epoch": 4.521214931602133, "eval_loss": 0.1596641951113874, "eval_runtime": 61.5999, "eval_samples_per_second": 675.099, "eval_steps_per_second": 0.666, "step": 9750 }, { "epoch": 4.544400649200092, "grad_norm": 0.039518803358078, "learning_rate": 1.0066617640578368e-07, "loss": 0.1297, "step": 9800 }, { "epoch": 4.544400649200092, "eval_loss": 0.15941302591091938, "eval_runtime": 61.7763, "eval_samples_per_second": 673.17, "eval_steps_per_second": 0.664, "step": 9800 }, { "epoch": 4.567586366798053, "grad_norm": 0.037454187870025635, "learning_rate": 5.663304084960186e-08, "loss": 0.1276, "step": 9850 }, { "epoch": 4.567586366798053, "eval_loss": 0.15932704533345807, "eval_runtime": 61.0673, "eval_samples_per_second": 680.987, "eval_steps_per_second": 0.671, "step": 9850 }, { "epoch": 4.590772084396012, "grad_norm": 0.05642937496304512, "learning_rate": 2.5172880840745873e-08, "loss": 0.129, "step": 9900 }, { "epoch": 4.590772084396012, "eval_loss": 0.15923822751292327, "eval_runtime": 61.7949, "eval_samples_per_second": 672.968, "eval_steps_per_second": 0.663, "step": 9900 }, { "epoch": 4.613957801993972, "grad_norm": 0.03662274032831192, "learning_rate": 6.293616306246586e-09, "loss": 0.1285, "step": 9950 }, { "epoch": 4.613957801993972, "eval_loss": 0.160277338388973, "eval_runtime": 62.2438, "eval_samples_per_second": 668.115, "eval_steps_per_second": 0.659, "step": 9950 }, { "epoch": 4.637143519591931, "grad_norm": 0.0563049279153347, "learning_rate": 0.0, "loss": 0.1282, "step": 10000 }, { "epoch": 4.637143519591931, "eval_loss": 0.16006394581914293, "eval_runtime": 61.4917, "eval_samples_per_second": 676.286, "eval_steps_per_second": 0.667, "step": 10000 }, { "epoch": 4.637143519591931, "step": 10000, "total_flos": 2.3231400526217216e+17, "train_loss": 0.13326009378433226, "train_runtime": 41368.3669, "train_samples_per_second": 495.064, "train_steps_per_second": 0.242 } ], "logging_steps": 50, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "total_flos": 2.3231400526217216e+17, "train_batch_size": 1024, "trial_name": null, "trial_params": null }