{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 3, "global_step": 592, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04054054054054054, "grad_norm": 5.807250022888184, "learning_rate": 5e-05, "loss": 3.1119, "step": 3 }, { "epoch": 0.04054054054054054, "eval_loss": 3.1016640663146973, "eval_runtime": 1.0551, "eval_samples_per_second": 15.164, "eval_steps_per_second": 3.791, "step": 3 }, { "epoch": 0.08108108108108109, "grad_norm": 4.004100322723389, "learning_rate": 0.0001, "loss": 2.8734, "step": 6 }, { "epoch": 0.08108108108108109, "eval_loss": 2.6094236373901367, "eval_runtime": 1.0592, "eval_samples_per_second": 15.106, "eval_steps_per_second": 3.777, "step": 6 }, { "epoch": 0.12162162162162163, "grad_norm": 3.935053586959839, "learning_rate": 9.999353337510526e-05, "loss": 2.4188, "step": 9 }, { "epoch": 0.12162162162162163, "eval_loss": 2.1545872688293457, "eval_runtime": 1.0511, "eval_samples_per_second": 15.222, "eval_steps_per_second": 3.805, "step": 9 }, { "epoch": 0.16216216216216217, "grad_norm": 5.741048812866211, "learning_rate": 9.997413517311055e-05, "loss": 1.9335, "step": 12 }, { "epoch": 0.16216216216216217, "eval_loss": 1.786160945892334, "eval_runtime": 1.0532, "eval_samples_per_second": 15.192, "eval_steps_per_second": 3.798, "step": 12 }, { "epoch": 0.20270270270270271, "grad_norm": 4.155601978302002, "learning_rate": 9.99418104116517e-05, "loss": 1.5361, "step": 15 }, { "epoch": 0.20270270270270271, "eval_loss": 1.4731855392456055, "eval_runtime": 1.0511, "eval_samples_per_second": 15.222, "eval_steps_per_second": 3.805, "step": 15 }, { "epoch": 0.24324324324324326, "grad_norm": 2.4831109046936035, "learning_rate": 9.989656745201298e-05, "loss": 1.314, "step": 18 }, { "epoch": 0.24324324324324326, "eval_loss": 1.2790606021881104, "eval_runtime": 1.0553, "eval_samples_per_second": 15.161, "eval_steps_per_second": 3.79, "step": 18 }, { "epoch": 0.28378378378378377, "grad_norm": 1.9509971141815186, "learning_rate": 9.983841799696438e-05, "loss": 1.1747, "step": 21 }, { "epoch": 0.28378378378378377, "eval_loss": 1.1653475761413574, "eval_runtime": 1.0512, "eval_samples_per_second": 15.221, "eval_steps_per_second": 3.805, "step": 21 }, { "epoch": 0.32432432432432434, "grad_norm": 2.245741367340088, "learning_rate": 9.976737708773445e-05, "loss": 1.1407, "step": 24 }, { "epoch": 0.32432432432432434, "eval_loss": 1.110356092453003, "eval_runtime": 1.0534, "eval_samples_per_second": 15.188, "eval_steps_per_second": 3.797, "step": 24 }, { "epoch": 0.36486486486486486, "grad_norm": 2.0690531730651855, "learning_rate": 9.968346310011964e-05, "loss": 1.1734, "step": 27 }, { "epoch": 0.36486486486486486, "eval_loss": 1.088733434677124, "eval_runtime": 1.0508, "eval_samples_per_second": 15.226, "eval_steps_per_second": 3.806, "step": 27 }, { "epoch": 0.40540540540540543, "grad_norm": 1.8963656425476074, "learning_rate": 9.958669773973123e-05, "loss": 1.0495, "step": 30 }, { "epoch": 0.40540540540540543, "eval_loss": 1.0401344299316406, "eval_runtime": 1.0528, "eval_samples_per_second": 15.197, "eval_steps_per_second": 3.799, "step": 30 }, { "epoch": 0.44594594594594594, "grad_norm": 1.753909945487976, "learning_rate": 9.947710603638078e-05, "loss": 1.0401, "step": 33 }, { "epoch": 0.44594594594594594, "eval_loss": 0.990611732006073, "eval_runtime": 1.0507, "eval_samples_per_second": 15.227, "eval_steps_per_second": 3.807, "step": 33 }, { "epoch": 0.4864864864864865, "grad_norm": 2.1073760986328125, "learning_rate": 9.935471633760573e-05, "loss": 1.0623, "step": 36 }, { "epoch": 0.4864864864864865, "eval_loss": 0.9593618512153625, "eval_runtime": 1.0535, "eval_samples_per_second": 15.188, "eval_steps_per_second": 3.797, "step": 36 }, { "epoch": 0.527027027027027, "grad_norm": 1.5675249099731445, "learning_rate": 9.921956030133701e-05, "loss": 0.8152, "step": 39 }, { "epoch": 0.527027027027027, "eval_loss": 0.9366932511329651, "eval_runtime": 1.0514, "eval_samples_per_second": 15.218, "eval_steps_per_second": 3.805, "step": 39 }, { "epoch": 0.5675675675675675, "grad_norm": 2.219888210296631, "learning_rate": 9.907167288771019e-05, "loss": 0.9261, "step": 42 }, { "epoch": 0.5675675675675675, "eval_loss": 0.9247606992721558, "eval_runtime": 1.0532, "eval_samples_per_second": 15.192, "eval_steps_per_second": 3.798, "step": 42 }, { "epoch": 0.6081081081081081, "grad_norm": 1.6866446733474731, "learning_rate": 9.891109235002249e-05, "loss": 0.9469, "step": 45 }, { "epoch": 0.6081081081081081, "eval_loss": 0.9134540557861328, "eval_runtime": 1.0562, "eval_samples_per_second": 15.149, "eval_steps_per_second": 3.787, "step": 45 }, { "epoch": 0.6486486486486487, "grad_norm": 1.7272800207138062, "learning_rate": 9.8737860224838e-05, "loss": 0.8381, "step": 48 }, { "epoch": 0.6486486486486487, "eval_loss": 0.8871217370033264, "eval_runtime": 1.0527, "eval_samples_per_second": 15.199, "eval_steps_per_second": 3.8, "step": 48 }, { "epoch": 0.6891891891891891, "grad_norm": 2.6152303218841553, "learning_rate": 9.855202132124365e-05, "loss": 0.8456, "step": 51 }, { "epoch": 0.6891891891891891, "eval_loss": 0.8553087711334229, "eval_runtime": 1.0521, "eval_samples_per_second": 15.208, "eval_steps_per_second": 3.802, "step": 51 }, { "epoch": 0.7297297297297297, "grad_norm": 1.8282960653305054, "learning_rate": 9.835362370925868e-05, "loss": 0.908, "step": 54 }, { "epoch": 0.7297297297297297, "eval_loss": 0.8271682858467102, "eval_runtime": 1.052, "eval_samples_per_second": 15.21, "eval_steps_per_second": 3.802, "step": 54 }, { "epoch": 0.7702702702702703, "grad_norm": 2.466750383377075, "learning_rate": 9.814271870740054e-05, "loss": 0.999, "step": 57 }, { "epoch": 0.7702702702702703, "eval_loss": 0.8151593208312988, "eval_runtime": 1.0549, "eval_samples_per_second": 15.167, "eval_steps_per_second": 3.792, "step": 57 }, { "epoch": 0.8108108108108109, "grad_norm": 1.8908120393753052, "learning_rate": 9.791936086941064e-05, "loss": 0.897, "step": 60 }, { "epoch": 0.8108108108108109, "eval_loss": 0.8052847981452942, "eval_runtime": 1.0512, "eval_samples_per_second": 15.22, "eval_steps_per_second": 3.805, "step": 60 }, { "epoch": 0.8513513513513513, "grad_norm": 1.9563689231872559, "learning_rate": 9.768360797014324e-05, "loss": 0.8747, "step": 63 }, { "epoch": 0.8513513513513513, "eval_loss": 0.7914941906929016, "eval_runtime": 1.0519, "eval_samples_per_second": 15.21, "eval_steps_per_second": 3.803, "step": 63 }, { "epoch": 0.8918918918918919, "grad_norm": 1.9292480945587158, "learning_rate": 9.7435520990621e-05, "loss": 1.0646, "step": 66 }, { "epoch": 0.8918918918918919, "eval_loss": 0.7872657179832458, "eval_runtime": 1.0526, "eval_samples_per_second": 15.201, "eval_steps_per_second": 3.8, "step": 66 }, { "epoch": 0.9324324324324325, "grad_norm": 1.7248555421829224, "learning_rate": 9.717516410226145e-05, "loss": 0.6771, "step": 69 }, { "epoch": 0.9324324324324325, "eval_loss": 0.7814666628837585, "eval_runtime": 1.0522, "eval_samples_per_second": 15.207, "eval_steps_per_second": 3.802, "step": 69 }, { "epoch": 0.972972972972973, "grad_norm": 2.171896457672119, "learning_rate": 9.690260465027801e-05, "loss": 0.9386, "step": 72 }, { "epoch": 0.972972972972973, "eval_loss": 0.7634860873222351, "eval_runtime": 1.0498, "eval_samples_per_second": 15.241, "eval_steps_per_second": 3.81, "step": 72 }, { "epoch": 1.0135135135135136, "grad_norm": 1.625179409980774, "learning_rate": 9.661791313626018e-05, "loss": 0.6348, "step": 75 }, { "epoch": 1.0135135135135136, "eval_loss": 0.75515216588974, "eval_runtime": 1.0536, "eval_samples_per_second": 15.186, "eval_steps_per_second": 3.796, "step": 75 }, { "epoch": 1.054054054054054, "grad_norm": 1.4293404817581177, "learning_rate": 9.632116319993725e-05, "loss": 0.5763, "step": 78 }, { "epoch": 1.054054054054054, "eval_loss": 0.7473800182342529, "eval_runtime": 1.0524, "eval_samples_per_second": 15.203, "eval_steps_per_second": 3.801, "step": 78 }, { "epoch": 1.0945945945945945, "grad_norm": 1.9279707670211792, "learning_rate": 9.601243160013023e-05, "loss": 0.7059, "step": 81 }, { "epoch": 1.0945945945945945, "eval_loss": 0.7430617213249207, "eval_runtime": 1.0539, "eval_samples_per_second": 15.181, "eval_steps_per_second": 3.795, "step": 81 }, { "epoch": 1.135135135135135, "grad_norm": 1.7644144296646118, "learning_rate": 9.56917981948971e-05, "loss": 0.6111, "step": 84 }, { "epoch": 1.135135135135135, "eval_loss": 0.7393875122070312, "eval_runtime": 1.0525, "eval_samples_per_second": 15.202, "eval_steps_per_second": 3.8, "step": 84 }, { "epoch": 1.1756756756756757, "grad_norm": 1.4910467863082886, "learning_rate": 9.535934592087627e-05, "loss": 0.6937, "step": 87 }, { "epoch": 1.1756756756756757, "eval_loss": 0.7415614724159241, "eval_runtime": 1.0533, "eval_samples_per_second": 15.191, "eval_steps_per_second": 3.798, "step": 87 }, { "epoch": 1.2162162162162162, "grad_norm": 1.989018440246582, "learning_rate": 9.50151607718338e-05, "loss": 0.6408, "step": 90 }, { "epoch": 1.2162162162162162, "eval_loss": 0.7331891059875488, "eval_runtime": 1.0504, "eval_samples_per_second": 15.232, "eval_steps_per_second": 3.808, "step": 90 }, { "epoch": 1.2567567567567568, "grad_norm": 1.5546590089797974, "learning_rate": 9.465933177641982e-05, "loss": 0.5931, "step": 93 }, { "epoch": 1.2567567567567568, "eval_loss": 0.7319458723068237, "eval_runtime": 1.0532, "eval_samples_per_second": 15.191, "eval_steps_per_second": 3.798, "step": 93 }, { "epoch": 1.2972972972972974, "grad_norm": 2.128746271133423, "learning_rate": 9.429195097513993e-05, "loss": 0.5792, "step": 96 }, { "epoch": 1.2972972972972974, "eval_loss": 0.7179479598999023, "eval_runtime": 1.0504, "eval_samples_per_second": 15.232, "eval_steps_per_second": 3.808, "step": 96 }, { "epoch": 1.3378378378378377, "grad_norm": 2.069204092025757, "learning_rate": 9.391311339654753e-05, "loss": 0.5502, "step": 99 }, { "epoch": 1.3378378378378377, "eval_loss": 0.7083268165588379, "eval_runtime": 1.0531, "eval_samples_per_second": 15.193, "eval_steps_per_second": 3.798, "step": 99 }, { "epoch": 1.3783783783783785, "grad_norm": 2.069469928741455, "learning_rate": 9.352291703266331e-05, "loss": 0.7356, "step": 102 }, { "epoch": 1.3783783783783785, "eval_loss": 0.7048563957214355, "eval_runtime": 1.0519, "eval_samples_per_second": 15.21, "eval_steps_per_second": 3.803, "step": 102 }, { "epoch": 1.4189189189189189, "grad_norm": 1.507051706314087, "learning_rate": 9.31214628136281e-05, "loss": 0.5204, "step": 105 }, { "epoch": 1.4189189189189189, "eval_loss": 0.6983195543289185, "eval_runtime": 1.0543, "eval_samples_per_second": 15.176, "eval_steps_per_second": 3.794, "step": 105 }, { "epoch": 1.4594594594594594, "grad_norm": 1.918865442276001, "learning_rate": 9.270885458159575e-05, "loss": 0.6132, "step": 108 }, { "epoch": 1.4594594594594594, "eval_loss": 0.6857842803001404, "eval_runtime": 1.0525, "eval_samples_per_second": 15.202, "eval_steps_per_second": 3.8, "step": 108 }, { "epoch": 1.5, "grad_norm": 2.062997341156006, "learning_rate": 9.228519906387288e-05, "loss": 0.7527, "step": 111 }, { "epoch": 1.5, "eval_loss": 0.6743776798248291, "eval_runtime": 1.0512, "eval_samples_per_second": 15.221, "eval_steps_per_second": 3.805, "step": 111 }, { "epoch": 1.5405405405405406, "grad_norm": 1.8099018335342407, "learning_rate": 9.185060584531217e-05, "loss": 0.6798, "step": 114 }, { "epoch": 1.5405405405405406, "eval_loss": 0.6715844869613647, "eval_runtime": 1.0529, "eval_samples_per_second": 15.196, "eval_steps_per_second": 3.799, "step": 114 }, { "epoch": 1.5810810810810811, "grad_norm": 2.0540611743927, "learning_rate": 9.140518733996672e-05, "loss": 0.7266, "step": 117 }, { "epoch": 1.5810810810810811, "eval_loss": 0.6656138896942139, "eval_runtime": 1.0523, "eval_samples_per_second": 15.204, "eval_steps_per_second": 3.801, "step": 117 }, { "epoch": 1.6216216216216215, "grad_norm": 2.3945634365081787, "learning_rate": 9.094905876201229e-05, "loss": 0.5347, "step": 120 }, { "epoch": 1.6216216216216215, "eval_loss": 0.6710730791091919, "eval_runtime": 1.053, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 120 }, { "epoch": 1.6621621621621623, "grad_norm": 2.006612777709961, "learning_rate": 9.048233809594561e-05, "loss": 0.6522, "step": 123 }, { "epoch": 1.6621621621621623, "eval_loss": 0.6679877042770386, "eval_runtime": 1.0519, "eval_samples_per_second": 15.211, "eval_steps_per_second": 3.803, "step": 123 }, { "epoch": 1.7027027027027026, "grad_norm": 1.751696228981018, "learning_rate": 9.000514606606581e-05, "loss": 0.8567, "step": 126 }, { "epoch": 1.7027027027027026, "eval_loss": 0.6558159589767456, "eval_runtime": 1.0531, "eval_samples_per_second": 15.193, "eval_steps_per_second": 3.798, "step": 126 }, { "epoch": 1.7432432432432432, "grad_norm": 1.5286139249801636, "learning_rate": 8.951760610524724e-05, "loss": 0.5204, "step": 129 }, { "epoch": 1.7432432432432432, "eval_loss": 0.6488269567489624, "eval_runtime": 1.0516, "eval_samples_per_second": 15.215, "eval_steps_per_second": 3.804, "step": 129 }, { "epoch": 1.7837837837837838, "grad_norm": 2.1092898845672607, "learning_rate": 8.901984432301185e-05, "loss": 0.6443, "step": 132 }, { "epoch": 1.7837837837837838, "eval_loss": 0.6392868161201477, "eval_runtime": 1.053, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 132 }, { "epoch": 1.8243243243243243, "grad_norm": 1.7279053926467896, "learning_rate": 8.851198947290894e-05, "loss": 0.5436, "step": 135 }, { "epoch": 1.8243243243243243, "eval_loss": 0.6321672201156616, "eval_runtime": 1.0499, "eval_samples_per_second": 15.239, "eval_steps_per_second": 3.81, "step": 135 }, { "epoch": 1.864864864864865, "grad_norm": 2.6842877864837646, "learning_rate": 8.799417291921117e-05, "loss": 0.6054, "step": 138 }, { "epoch": 1.864864864864865, "eval_loss": 0.6346270442008972, "eval_runtime": 1.0528, "eval_samples_per_second": 15.198, "eval_steps_per_second": 3.799, "step": 138 }, { "epoch": 1.9054054054054053, "grad_norm": 1.9958398342132568, "learning_rate": 8.746652860293523e-05, "loss": 0.4488, "step": 141 }, { "epoch": 1.9054054054054053, "eval_loss": 0.6389164924621582, "eval_runtime": 1.0505, "eval_samples_per_second": 15.231, "eval_steps_per_second": 3.808, "step": 141 }, { "epoch": 1.945945945945946, "grad_norm": 2.0705783367156982, "learning_rate": 8.692919300719595e-05, "loss": 0.7171, "step": 144 }, { "epoch": 1.945945945945946, "eval_loss": 0.632194995880127, "eval_runtime": 1.0537, "eval_samples_per_second": 15.184, "eval_steps_per_second": 3.796, "step": 144 }, { "epoch": 1.9864864864864864, "grad_norm": 2.0737218856811523, "learning_rate": 8.638230512190298e-05, "loss": 0.5383, "step": 147 }, { "epoch": 1.9864864864864864, "eval_loss": 0.6272808313369751, "eval_runtime": 1.0507, "eval_samples_per_second": 15.228, "eval_steps_per_second": 3.807, "step": 147 }, { "epoch": 2.027027027027027, "grad_norm": 1.6119190454483032, "learning_rate": 8.58260064078088e-05, "loss": 0.4812, "step": 150 }, { "epoch": 2.027027027027027, "eval_loss": 0.6234598755836487, "eval_runtime": 1.0541, "eval_samples_per_second": 15.179, "eval_steps_per_second": 3.795, "step": 150 }, { "epoch": 2.0675675675675675, "grad_norm": 2.104738712310791, "learning_rate": 8.526044075991802e-05, "loss": 0.7911, "step": 153 }, { "epoch": 2.0675675675675675, "eval_loss": 0.6295649409294128, "eval_runtime": 1.0504, "eval_samples_per_second": 15.232, "eval_steps_per_second": 3.808, "step": 153 }, { "epoch": 2.108108108108108, "grad_norm": 2.041696786880493, "learning_rate": 8.468575447026651e-05, "loss": 0.514, "step": 156 }, { "epoch": 2.108108108108108, "eval_loss": 0.6444165706634521, "eval_runtime": 1.0539, "eval_samples_per_second": 15.182, "eval_steps_per_second": 3.795, "step": 156 }, { "epoch": 2.1486486486486487, "grad_norm": 1.7887616157531738, "learning_rate": 8.410209619008101e-05, "loss": 0.4481, "step": 159 }, { "epoch": 2.1486486486486487, "eval_loss": 0.6452795267105103, "eval_runtime": 1.0508, "eval_samples_per_second": 15.227, "eval_steps_per_second": 3.807, "step": 159 }, { "epoch": 2.189189189189189, "grad_norm": 2.2852938175201416, "learning_rate": 8.350961689132808e-05, "loss": 0.3983, "step": 162 }, { "epoch": 2.189189189189189, "eval_loss": 0.6356573104858398, "eval_runtime": 1.0538, "eval_samples_per_second": 15.183, "eval_steps_per_second": 3.796, "step": 162 }, { "epoch": 2.22972972972973, "grad_norm": 1.3814259767532349, "learning_rate": 8.290846982766305e-05, "loss": 0.2386, "step": 165 }, { "epoch": 2.22972972972973, "eval_loss": 0.632733166217804, "eval_runtime": 1.053, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 165 }, { "epoch": 2.27027027027027, "grad_norm": 2.624509572982788, "learning_rate": 8.22988104947886e-05, "loss": 0.4447, "step": 168 }, { "epoch": 2.27027027027027, "eval_loss": 0.6358802318572998, "eval_runtime": 1.0518, "eval_samples_per_second": 15.212, "eval_steps_per_second": 3.803, "step": 168 }, { "epoch": 2.310810810810811, "grad_norm": 2.1006217002868652, "learning_rate": 8.168079659023349e-05, "loss": 0.4302, "step": 171 }, { "epoch": 2.310810810810811, "eval_loss": 0.6386667490005493, "eval_runtime": 1.0534, "eval_samples_per_second": 15.188, "eval_steps_per_second": 3.797, "step": 171 }, { "epoch": 2.3513513513513513, "grad_norm": 2.631301164627075, "learning_rate": 8.105458797256178e-05, "loss": 0.4514, "step": 174 }, { "epoch": 2.3513513513513513, "eval_loss": 0.6402238607406616, "eval_runtime": 1.0545, "eval_samples_per_second": 15.174, "eval_steps_per_second": 3.793, "step": 174 }, { "epoch": 2.391891891891892, "grad_norm": 1.4005826711654663, "learning_rate": 8.04203466200229e-05, "loss": 0.2813, "step": 177 }, { "epoch": 2.391891891891892, "eval_loss": 0.6313220262527466, "eval_runtime": 1.0541, "eval_samples_per_second": 15.178, "eval_steps_per_second": 3.795, "step": 177 }, { "epoch": 2.4324324324324325, "grad_norm": 2.4380390644073486, "learning_rate": 7.977823658865364e-05, "loss": 0.4747, "step": 180 }, { "epoch": 2.4324324324324325, "eval_loss": 0.6258513927459717, "eval_runtime": 1.0533, "eval_samples_per_second": 15.191, "eval_steps_per_second": 3.798, "step": 180 }, { "epoch": 2.472972972972973, "grad_norm": 2.3655426502227783, "learning_rate": 7.912842396984254e-05, "loss": 0.547, "step": 183 }, { "epoch": 2.472972972972973, "eval_loss": 0.6256988048553467, "eval_runtime": 1.053, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 183 }, { "epoch": 2.5135135135135136, "grad_norm": 1.9949471950531006, "learning_rate": 7.847107684736792e-05, "loss": 0.3154, "step": 186 }, { "epoch": 2.5135135135135136, "eval_loss": 0.6247289776802063, "eval_runtime": 1.0523, "eval_samples_per_second": 15.205, "eval_steps_per_second": 3.801, "step": 186 }, { "epoch": 2.554054054054054, "grad_norm": 3.2453622817993164, "learning_rate": 7.780636525392046e-05, "loss": 0.5583, "step": 189 }, { "epoch": 2.554054054054054, "eval_loss": 0.6129618883132935, "eval_runtime": 1.0519, "eval_samples_per_second": 15.21, "eval_steps_per_second": 3.803, "step": 189 }, { "epoch": 2.5945945945945947, "grad_norm": 2.022986888885498, "learning_rate": 7.713446112712169e-05, "loss": 0.5726, "step": 192 }, { "epoch": 2.5945945945945947, "eval_loss": 0.6086827516555786, "eval_runtime": 1.0543, "eval_samples_per_second": 15.175, "eval_steps_per_second": 3.794, "step": 192 }, { "epoch": 2.635135135135135, "grad_norm": 2.429865598678589, "learning_rate": 7.645553826504969e-05, "loss": 0.4701, "step": 195 }, { "epoch": 2.635135135135135, "eval_loss": 0.6085944175720215, "eval_runtime": 1.0521, "eval_samples_per_second": 15.208, "eval_steps_per_second": 3.802, "step": 195 }, { "epoch": 2.6756756756756754, "grad_norm": 1.991803526878357, "learning_rate": 7.576977228128376e-05, "loss": 0.4866, "step": 198 }, { "epoch": 2.6756756756756754, "eval_loss": 0.6133272647857666, "eval_runtime": 1.0535, "eval_samples_per_second": 15.187, "eval_steps_per_second": 3.797, "step": 198 }, { "epoch": 2.7162162162162162, "grad_norm": 2.537832021713257, "learning_rate": 7.50773405594792e-05, "loss": 0.4015, "step": 201 }, { "epoch": 2.7162162162162162, "eval_loss": 0.6213403940200806, "eval_runtime": 1.0524, "eval_samples_per_second": 15.203, "eval_steps_per_second": 3.801, "step": 201 }, { "epoch": 2.756756756756757, "grad_norm": 1.758016586303711, "learning_rate": 7.437842220748441e-05, "loss": 0.4277, "step": 204 }, { "epoch": 2.756756756756757, "eval_loss": 0.623763382434845, "eval_runtime": 1.0527, "eval_samples_per_second": 15.198, "eval_steps_per_second": 3.8, "step": 204 }, { "epoch": 2.7972972972972974, "grad_norm": 1.8930737972259521, "learning_rate": 7.367319801101196e-05, "loss": 0.3157, "step": 207 }, { "epoch": 2.7972972972972974, "eval_loss": 0.6248853206634521, "eval_runtime": 1.0562, "eval_samples_per_second": 15.149, "eval_steps_per_second": 3.787, "step": 207 }, { "epoch": 2.8378378378378377, "grad_norm": 2.071988105773926, "learning_rate": 7.296185038687566e-05, "loss": 0.3883, "step": 210 }, { "epoch": 2.8378378378378377, "eval_loss": 0.6209710240364075, "eval_runtime": 1.0518, "eval_samples_per_second": 15.212, "eval_steps_per_second": 3.803, "step": 210 }, { "epoch": 2.8783783783783785, "grad_norm": 1.579237937927246, "learning_rate": 7.224456333580573e-05, "loss": 0.5436, "step": 213 }, { "epoch": 2.8783783783783785, "eval_loss": 0.6127223968505859, "eval_runtime": 1.0524, "eval_samples_per_second": 15.204, "eval_steps_per_second": 3.801, "step": 213 }, { "epoch": 2.918918918918919, "grad_norm": 2.4129927158355713, "learning_rate": 7.152152239485419e-05, "loss": 0.526, "step": 216 }, { "epoch": 2.918918918918919, "eval_loss": 0.6055560111999512, "eval_runtime": 1.0502, "eval_samples_per_second": 15.236, "eval_steps_per_second": 3.809, "step": 216 }, { "epoch": 2.9594594594594597, "grad_norm": 2.252251148223877, "learning_rate": 7.079291458940301e-05, "loss": 0.4465, "step": 219 }, { "epoch": 2.9594594594594597, "eval_loss": 0.5982283353805542, "eval_runtime": 1.0529, "eval_samples_per_second": 15.197, "eval_steps_per_second": 3.799, "step": 219 }, { "epoch": 3.0, "grad_norm": 1.9773114919662476, "learning_rate": 7.005892838478711e-05, "loss": 0.3692, "step": 222 }, { "epoch": 3.0, "eval_loss": 0.5916565656661987, "eval_runtime": 1.0501, "eval_samples_per_second": 15.237, "eval_steps_per_second": 3.809, "step": 222 }, { "epoch": 3.0405405405405403, "grad_norm": 1.1434626579284668, "learning_rate": 6.931975363754502e-05, "loss": 0.3022, "step": 225 }, { "epoch": 3.0405405405405403, "eval_loss": 0.5955583453178406, "eval_runtime": 1.0535, "eval_samples_per_second": 15.187, "eval_steps_per_second": 3.797, "step": 225 }, { "epoch": 3.081081081081081, "grad_norm": 1.9162238836288452, "learning_rate": 6.85755815463096e-05, "loss": 0.2875, "step": 228 }, { "epoch": 3.081081081081081, "eval_loss": 0.6152929067611694, "eval_runtime": 1.0516, "eval_samples_per_second": 15.215, "eval_steps_per_second": 3.804, "step": 228 }, { "epoch": 3.1216216216216215, "grad_norm": 2.688631057739258, "learning_rate": 6.782660460235174e-05, "loss": 0.5544, "step": 231 }, { "epoch": 3.1216216216216215, "eval_loss": 0.6343094110488892, "eval_runtime": 1.052, "eval_samples_per_second": 15.21, "eval_steps_per_second": 3.802, "step": 231 }, { "epoch": 3.1621621621621623, "grad_norm": 2.58313250541687, "learning_rate": 6.707301653978945e-05, "loss": 0.4159, "step": 234 }, { "epoch": 3.1621621621621623, "eval_loss": 0.6369538307189941, "eval_runtime": 1.0524, "eval_samples_per_second": 15.203, "eval_steps_per_second": 3.801, "step": 234 }, { "epoch": 3.2027027027027026, "grad_norm": 2.2415409088134766, "learning_rate": 6.63150122854758e-05, "loss": 0.4963, "step": 237 }, { "epoch": 3.2027027027027026, "eval_loss": 0.6289186477661133, "eval_runtime": 1.0528, "eval_samples_per_second": 15.198, "eval_steps_per_second": 3.799, "step": 237 }, { "epoch": 3.2432432432432434, "grad_norm": 2.974931240081787, "learning_rate": 6.5552787908578e-05, "loss": 0.3248, "step": 240 }, { "epoch": 3.2432432432432434, "eval_loss": 0.6189987659454346, "eval_runtime": 1.0515, "eval_samples_per_second": 15.217, "eval_steps_per_second": 3.804, "step": 240 }, { "epoch": 3.2837837837837838, "grad_norm": 2.0078535079956055, "learning_rate": 6.478654056986131e-05, "loss": 0.349, "step": 243 }, { "epoch": 3.2837837837837838, "eval_loss": 0.6110680103302002, "eval_runtime": 1.0532, "eval_samples_per_second": 15.192, "eval_steps_per_second": 3.798, "step": 243 }, { "epoch": 3.3243243243243246, "grad_norm": 2.6236143112182617, "learning_rate": 6.401646847069039e-05, "loss": 0.3107, "step": 246 }, { "epoch": 3.3243243243243246, "eval_loss": 0.6120755672454834, "eval_runtime": 1.0508, "eval_samples_per_second": 15.227, "eval_steps_per_second": 3.807, "step": 246 }, { "epoch": 3.364864864864865, "grad_norm": 1.75555419921875, "learning_rate": 6.32427708017615e-05, "loss": 0.2219, "step": 249 }, { "epoch": 3.364864864864865, "eval_loss": 0.6196171641349792, "eval_runtime": 1.0523, "eval_samples_per_second": 15.204, "eval_steps_per_second": 3.801, "step": 249 }, { "epoch": 3.4054054054054053, "grad_norm": 3.003138303756714, "learning_rate": 6.246564769157894e-05, "loss": 0.251, "step": 252 }, { "epoch": 3.4054054054054053, "eval_loss": 0.6273298263549805, "eval_runtime": 1.0546, "eval_samples_per_second": 15.171, "eval_steps_per_second": 3.793, "step": 252 }, { "epoch": 3.445945945945946, "grad_norm": 2.2066917419433594, "learning_rate": 6.168530015468872e-05, "loss": 0.3366, "step": 255 }, { "epoch": 3.445945945945946, "eval_loss": 0.6258885860443115, "eval_runtime": 1.0514, "eval_samples_per_second": 15.217, "eval_steps_per_second": 3.804, "step": 255 }, { "epoch": 3.4864864864864864, "grad_norm": 1.7121000289916992, "learning_rate": 6.0901930039683184e-05, "loss": 0.3182, "step": 258 }, { "epoch": 3.4864864864864864, "eval_loss": 0.6243223547935486, "eval_runtime": 1.0739, "eval_samples_per_second": 14.898, "eval_steps_per_second": 3.725, "step": 258 }, { "epoch": 3.527027027027027, "grad_norm": 2.7600913047790527, "learning_rate": 6.011573997698985e-05, "loss": 0.4133, "step": 261 }, { "epoch": 3.527027027027027, "eval_loss": 0.6259996294975281, "eval_runtime": 1.0561, "eval_samples_per_second": 15.151, "eval_steps_per_second": 3.788, "step": 261 }, { "epoch": 3.5675675675675675, "grad_norm": 2.611302614212036, "learning_rate": 5.9326933326457956e-05, "loss": 0.3297, "step": 264 }, { "epoch": 3.5675675675675675, "eval_loss": 0.6303350925445557, "eval_runtime": 1.0534, "eval_samples_per_second": 15.189, "eval_steps_per_second": 3.797, "step": 264 }, { "epoch": 3.608108108108108, "grad_norm": 1.6527258157730103, "learning_rate": 5.8535714124756434e-05, "loss": 0.2276, "step": 267 }, { "epoch": 3.608108108108108, "eval_loss": 0.6364917159080505, "eval_runtime": 1.052, "eval_samples_per_second": 15.209, "eval_steps_per_second": 3.802, "step": 267 }, { "epoch": 3.6486486486486487, "grad_norm": 1.1108059883117676, "learning_rate": 5.774228703259678e-05, "loss": 0.1842, "step": 270 }, { "epoch": 3.6486486486486487, "eval_loss": 0.6382502317428589, "eval_runtime": 1.0549, "eval_samples_per_second": 15.168, "eval_steps_per_second": 3.792, "step": 270 }, { "epoch": 3.689189189189189, "grad_norm": 2.822380781173706, "learning_rate": 5.694685728179442e-05, "loss": 0.4961, "step": 273 }, { "epoch": 3.689189189189189, "eval_loss": 0.6313918828964233, "eval_runtime": 1.0523, "eval_samples_per_second": 15.205, "eval_steps_per_second": 3.801, "step": 273 }, { "epoch": 3.72972972972973, "grad_norm": 2.4894397258758545, "learning_rate": 5.6149630622182526e-05, "loss": 0.3785, "step": 276 }, { "epoch": 3.72972972972973, "eval_loss": 0.6239753365516663, "eval_runtime": 1.053, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 276 }, { "epoch": 3.77027027027027, "grad_norm": 2.1039986610412598, "learning_rate": 5.535081326839165e-05, "loss": 0.2834, "step": 279 }, { "epoch": 3.77027027027027, "eval_loss": 0.6189073920249939, "eval_runtime": 1.0515, "eval_samples_per_second": 15.217, "eval_steps_per_second": 3.804, "step": 279 }, { "epoch": 3.810810810810811, "grad_norm": 2.7096340656280518, "learning_rate": 5.455061184650921e-05, "loss": 0.3397, "step": 282 }, { "epoch": 3.810810810810811, "eval_loss": 0.6138538122177124, "eval_runtime": 1.0521, "eval_samples_per_second": 15.208, "eval_steps_per_second": 3.802, "step": 282 }, { "epoch": 3.8513513513513513, "grad_norm": 2.030907154083252, "learning_rate": 5.3749233340632674e-05, "loss": 0.2795, "step": 285 }, { "epoch": 3.8513513513513513, "eval_loss": 0.6104437708854675, "eval_runtime": 1.0581, "eval_samples_per_second": 15.122, "eval_steps_per_second": 3.78, "step": 285 }, { "epoch": 3.891891891891892, "grad_norm": 2.061206340789795, "learning_rate": 5.2946885039329866e-05, "loss": 0.3114, "step": 288 }, { "epoch": 3.891891891891892, "eval_loss": 0.6077687740325928, "eval_runtime": 1.0527, "eval_samples_per_second": 15.199, "eval_steps_per_second": 3.8, "step": 288 }, { "epoch": 3.9324324324324325, "grad_norm": 2.062087059020996, "learning_rate": 5.2143774482020744e-05, "loss": 0.2395, "step": 291 }, { "epoch": 3.9324324324324325, "eval_loss": 0.6111433506011963, "eval_runtime": 1.0517, "eval_samples_per_second": 15.214, "eval_steps_per_second": 3.804, "step": 291 }, { "epoch": 3.972972972972973, "grad_norm": 1.6344010829925537, "learning_rate": 5.134010940529429e-05, "loss": 0.1948, "step": 294 }, { "epoch": 3.972972972972973, "eval_loss": 0.6142452955245972, "eval_runtime": 1.0529, "eval_samples_per_second": 15.196, "eval_steps_per_second": 3.799, "step": 294 }, { "epoch": 4.013513513513513, "grad_norm": 1.9017384052276611, "learning_rate": 5.053609768917413e-05, "loss": 0.2284, "step": 297 }, { "epoch": 4.013513513513513, "eval_loss": 0.6194114685058594, "eval_runtime": 1.0515, "eval_samples_per_second": 15.217, "eval_steps_per_second": 3.804, "step": 297 }, { "epoch": 4.054054054054054, "grad_norm": 2.1609394550323486, "learning_rate": 4.973194730334748e-05, "loss": 0.2638, "step": 300 }, { "epoch": 4.054054054054054, "eval_loss": 0.6303145885467529, "eval_runtime": 1.053, "eval_samples_per_second": 15.194, "eval_steps_per_second": 3.798, "step": 300 }, { "epoch": 4.094594594594595, "grad_norm": 1.5275555849075317, "learning_rate": 4.892786625337047e-05, "loss": 0.252, "step": 303 }, { "epoch": 4.094594594594595, "eval_loss": 0.6517325639724731, "eval_runtime": 1.051, "eval_samples_per_second": 15.224, "eval_steps_per_second": 3.806, "step": 303 }, { "epoch": 4.135135135135135, "grad_norm": 2.807483434677124, "learning_rate": 4.8124062526864534e-05, "loss": 0.183, "step": 306 }, { "epoch": 4.135135135135135, "eval_loss": 0.6644703149795532, "eval_runtime": 1.0531, "eval_samples_per_second": 15.193, "eval_steps_per_second": 3.798, "step": 306 }, { "epoch": 4.175675675675675, "grad_norm": 2.6279256343841553, "learning_rate": 4.7320744039717154e-05, "loss": 0.2415, "step": 309 }, { "epoch": 4.175675675675675, "eval_loss": 0.6603893041610718, "eval_runtime": 1.0531, "eval_samples_per_second": 15.193, "eval_steps_per_second": 3.798, "step": 309 }, { "epoch": 4.216216216216216, "grad_norm": 0.42106354236602783, "learning_rate": 4.651811858230149e-05, "loss": 0.1791, "step": 312 }, { "epoch": 4.216216216216216, "eval_loss": 0.652984082698822, "eval_runtime": 1.053, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 312 }, { "epoch": 4.256756756756757, "grad_norm": 2.064615249633789, "learning_rate": 4.571639376572806e-05, "loss": 0.2013, "step": 315 }, { "epoch": 4.256756756756757, "eval_loss": 0.6488903760910034, "eval_runtime": 1.0505, "eval_samples_per_second": 15.23, "eval_steps_per_second": 3.808, "step": 315 }, { "epoch": 4.297297297297297, "grad_norm": 2.4248170852661133, "learning_rate": 4.491577696814318e-05, "loss": 0.1827, "step": 318 }, { "epoch": 4.297297297297297, "eval_loss": 0.653176486492157, "eval_runtime": 1.0536, "eval_samples_per_second": 15.186, "eval_steps_per_second": 3.797, "step": 318 }, { "epoch": 4.337837837837838, "grad_norm": 2.055769443511963, "learning_rate": 4.411647528108743e-05, "loss": 0.1792, "step": 321 }, { "epoch": 4.337837837837838, "eval_loss": 0.6584765315055847, "eval_runtime": 1.052, "eval_samples_per_second": 15.209, "eval_steps_per_second": 3.802, "step": 321 }, { "epoch": 4.378378378378378, "grad_norm": 3.4611449241638184, "learning_rate": 4.331869545592834e-05, "loss": 0.2568, "step": 324 }, { "epoch": 4.378378378378378, "eval_loss": 0.6628451347351074, "eval_runtime": 1.055, "eval_samples_per_second": 15.166, "eval_steps_per_second": 3.791, "step": 324 }, { "epoch": 4.418918918918919, "grad_norm": 1.6108025312423706, "learning_rate": 4.252264385038098e-05, "loss": 0.1682, "step": 327 }, { "epoch": 4.418918918918919, "eval_loss": 0.66502845287323, "eval_runtime": 1.0508, "eval_samples_per_second": 15.227, "eval_steps_per_second": 3.807, "step": 327 }, { "epoch": 4.45945945945946, "grad_norm": 1.828131914138794, "learning_rate": 4.1728526375130614e-05, "loss": 0.25, "step": 330 }, { "epoch": 4.45945945945946, "eval_loss": 0.6729562282562256, "eval_runtime": 1.0534, "eval_samples_per_second": 15.189, "eval_steps_per_second": 3.797, "step": 330 }, { "epoch": 4.5, "grad_norm": 2.5057499408721924, "learning_rate": 4.093654844057059e-05, "loss": 0.2664, "step": 333 }, { "epoch": 4.5, "eval_loss": 0.6741403937339783, "eval_runtime": 1.052, "eval_samples_per_second": 15.209, "eval_steps_per_second": 3.802, "step": 333 }, { "epoch": 4.54054054054054, "grad_norm": 1.6008535623550415, "learning_rate": 4.014691490367e-05, "loss": 0.2316, "step": 336 }, { "epoch": 4.54054054054054, "eval_loss": 0.6773088574409485, "eval_runtime": 1.053, "eval_samples_per_second": 15.194, "eval_steps_per_second": 3.799, "step": 336 }, { "epoch": 4.581081081081081, "grad_norm": 2.551591157913208, "learning_rate": 3.935983001498439e-05, "loss": 0.3467, "step": 339 }, { "epoch": 4.581081081081081, "eval_loss": 0.6705477237701416, "eval_runtime": 1.0509, "eval_samples_per_second": 15.226, "eval_steps_per_second": 3.806, "step": 339 }, { "epoch": 4.621621621621622, "grad_norm": 2.130202054977417, "learning_rate": 3.857549736582316e-05, "loss": 0.2426, "step": 342 }, { "epoch": 4.621621621621622, "eval_loss": 0.6681296825408936, "eval_runtime": 1.0529, "eval_samples_per_second": 15.196, "eval_steps_per_second": 3.799, "step": 342 }, { "epoch": 4.662162162162162, "grad_norm": 2.043670415878296, "learning_rate": 3.7794119835587685e-05, "loss": 0.2421, "step": 345 }, { "epoch": 4.662162162162162, "eval_loss": 0.6622060537338257, "eval_runtime": 1.0519, "eval_samples_per_second": 15.21, "eval_steps_per_second": 3.803, "step": 345 }, { "epoch": 4.702702702702703, "grad_norm": 1.9365885257720947, "learning_rate": 3.701589953929354e-05, "loss": 0.4063, "step": 348 }, { "epoch": 4.702702702702703, "eval_loss": 0.6608781814575195, "eval_runtime": 1.0528, "eval_samples_per_second": 15.197, "eval_steps_per_second": 3.799, "step": 348 }, { "epoch": 4.743243243243243, "grad_norm": 2.596634864807129, "learning_rate": 3.62410377752904e-05, "loss": 0.2255, "step": 351 }, { "epoch": 4.743243243243243, "eval_loss": 0.6569182276725769, "eval_runtime": 1.0522, "eval_samples_per_second": 15.206, "eval_steps_per_second": 3.802, "step": 351 }, { "epoch": 4.783783783783784, "grad_norm": 2.039332628250122, "learning_rate": 3.546973497319319e-05, "loss": 0.1933, "step": 354 }, { "epoch": 4.783783783783784, "eval_loss": 0.6534222364425659, "eval_runtime": 1.0498, "eval_samples_per_second": 15.241, "eval_steps_per_second": 3.81, "step": 354 }, { "epoch": 4.824324324324325, "grad_norm": 1.994629144668579, "learning_rate": 3.4702190642037944e-05, "loss": 0.1975, "step": 357 }, { "epoch": 4.824324324324325, "eval_loss": 0.649687647819519, "eval_runtime": 1.0523, "eval_samples_per_second": 15.204, "eval_steps_per_second": 3.801, "step": 357 }, { "epoch": 4.864864864864865, "grad_norm": 2.154684543609619, "learning_rate": 3.393860331867589e-05, "loss": 0.3065, "step": 360 }, { "epoch": 4.864864864864865, "eval_loss": 0.6491411924362183, "eval_runtime": 1.0519, "eval_samples_per_second": 15.21, "eval_steps_per_second": 3.803, "step": 360 }, { "epoch": 4.905405405405405, "grad_norm": 1.61858069896698, "learning_rate": 3.317917051641877e-05, "loss": 0.1641, "step": 363 }, { "epoch": 4.905405405405405, "eval_loss": 0.651297926902771, "eval_runtime": 1.0521, "eval_samples_per_second": 15.208, "eval_steps_per_second": 3.802, "step": 363 }, { "epoch": 4.945945945945946, "grad_norm": 2.7362637519836426, "learning_rate": 3.242408867394919e-05, "loss": 0.2032, "step": 366 }, { "epoch": 4.945945945945946, "eval_loss": 0.6552869081497192, "eval_runtime": 1.0506, "eval_samples_per_second": 15.229, "eval_steps_per_second": 3.807, "step": 366 }, { "epoch": 4.986486486486487, "grad_norm": 2.0567097663879395, "learning_rate": 3.167355310450877e-05, "loss": 0.1886, "step": 369 }, { "epoch": 4.986486486486487, "eval_loss": 0.6590157747268677, "eval_runtime": 1.0528, "eval_samples_per_second": 15.197, "eval_steps_per_second": 3.799, "step": 369 }, { "epoch": 5.027027027027027, "grad_norm": 1.5418853759765625, "learning_rate": 3.092775794537741e-05, "loss": 0.2539, "step": 372 }, { "epoch": 5.027027027027027, "eval_loss": 0.6676727533340454, "eval_runtime": 1.0516, "eval_samples_per_second": 15.215, "eval_steps_per_second": 3.804, "step": 372 }, { "epoch": 5.0675675675675675, "grad_norm": 1.229972004890442, "learning_rate": 3.0186896107656803e-05, "loss": 0.1464, "step": 375 }, { "epoch": 5.0675675675675675, "eval_loss": 0.687861979007721, "eval_runtime": 1.0539, "eval_samples_per_second": 15.182, "eval_steps_per_second": 3.796, "step": 375 }, { "epoch": 5.108108108108108, "grad_norm": 2.421496868133545, "learning_rate": 2.9451159226371095e-05, "loss": 0.2295, "step": 378 }, { "epoch": 5.108108108108108, "eval_loss": 0.7066453695297241, "eval_runtime": 1.0503, "eval_samples_per_second": 15.233, "eval_steps_per_second": 3.808, "step": 378 }, { "epoch": 5.148648648648648, "grad_norm": 2.3475804328918457, "learning_rate": 2.8720737610897575e-05, "loss": 0.1438, "step": 381 }, { "epoch": 5.148648648648648, "eval_loss": 0.7166962623596191, "eval_runtime": 1.0534, "eval_samples_per_second": 15.189, "eval_steps_per_second": 3.797, "step": 381 }, { "epoch": 5.1891891891891895, "grad_norm": 2.2746946811676025, "learning_rate": 2.799582019574033e-05, "loss": 0.1603, "step": 384 }, { "epoch": 5.1891891891891895, "eval_loss": 0.7134541273117065, "eval_runtime": 1.0519, "eval_samples_per_second": 15.211, "eval_steps_per_second": 3.803, "step": 384 }, { "epoch": 5.22972972972973, "grad_norm": 1.2550048828125, "learning_rate": 2.7276594491659525e-05, "loss": 0.1379, "step": 387 }, { "epoch": 5.22972972972973, "eval_loss": 0.7095359563827515, "eval_runtime": 1.0543, "eval_samples_per_second": 15.176, "eval_steps_per_second": 3.794, "step": 387 }, { "epoch": 5.27027027027027, "grad_norm": 1.7738205194473267, "learning_rate": 2.656324653716884e-05, "loss": 0.2783, "step": 390 }, { "epoch": 5.27027027027027, "eval_loss": 0.7103461623191833, "eval_runtime": 1.0515, "eval_samples_per_second": 15.216, "eval_steps_per_second": 3.804, "step": 390 }, { "epoch": 5.3108108108108105, "grad_norm": 2.2887580394744873, "learning_rate": 2.5855960850413935e-05, "loss": 0.1575, "step": 393 }, { "epoch": 5.3108108108108105, "eval_loss": 0.7042403817176819, "eval_runtime": 1.0523, "eval_samples_per_second": 15.204, "eval_steps_per_second": 3.801, "step": 393 }, { "epoch": 5.351351351351352, "grad_norm": 2.6281135082244873, "learning_rate": 2.5154920381444025e-05, "loss": 0.1743, "step": 396 }, { "epoch": 5.351351351351352, "eval_loss": 0.7114053964614868, "eval_runtime": 1.0527, "eval_samples_per_second": 15.199, "eval_steps_per_second": 3.8, "step": 396 }, { "epoch": 5.391891891891892, "grad_norm": 1.8125991821289062, "learning_rate": 2.4460306464889022e-05, "loss": 0.1168, "step": 399 }, { "epoch": 5.391891891891892, "eval_loss": 0.7083012461662292, "eval_runtime": 1.0506, "eval_samples_per_second": 15.23, "eval_steps_per_second": 3.807, "step": 399 }, { "epoch": 5.4324324324324325, "grad_norm": 2.5157058238983154, "learning_rate": 2.3772298773054757e-05, "loss": 0.284, "step": 402 }, { "epoch": 5.4324324324324325, "eval_loss": 0.7072416543960571, "eval_runtime": 1.0524, "eval_samples_per_second": 15.204, "eval_steps_per_second": 3.801, "step": 402 }, { "epoch": 5.472972972972973, "grad_norm": 0.8739199042320251, "learning_rate": 2.309107526944792e-05, "loss": 0.1013, "step": 405 }, { "epoch": 5.472972972972973, "eval_loss": 0.7062889933586121, "eval_runtime": 1.051, "eval_samples_per_second": 15.223, "eval_steps_per_second": 3.806, "step": 405 }, { "epoch": 5.513513513513513, "grad_norm": 2.2809295654296875, "learning_rate": 2.2416812162743223e-05, "loss": 0.2612, "step": 408 }, { "epoch": 5.513513513513513, "eval_loss": 0.70506751537323, "eval_runtime": 1.053, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 408 }, { "epoch": 5.554054054054054, "grad_norm": 2.2030365467071533, "learning_rate": 2.17496838612043e-05, "loss": 0.1343, "step": 411 }, { "epoch": 5.554054054054054, "eval_loss": 0.7102519273757935, "eval_runtime": 1.0534, "eval_samples_per_second": 15.188, "eval_steps_per_second": 3.797, "step": 411 }, { "epoch": 5.594594594594595, "grad_norm": 1.4592159986495972, "learning_rate": 2.1089862927570475e-05, "loss": 0.1009, "step": 414 }, { "epoch": 5.594594594594595, "eval_loss": 0.7105306386947632, "eval_runtime": 1.0533, "eval_samples_per_second": 15.19, "eval_steps_per_second": 3.797, "step": 414 }, { "epoch": 5.635135135135135, "grad_norm": 2.2018954753875732, "learning_rate": 2.0437520034420776e-05, "loss": 0.3127, "step": 417 }, { "epoch": 5.635135135135135, "eval_loss": 0.7089606523513794, "eval_runtime": 1.0533, "eval_samples_per_second": 15.191, "eval_steps_per_second": 3.798, "step": 417 }, { "epoch": 5.675675675675675, "grad_norm": 1.8359624147415161, "learning_rate": 1.979282392002691e-05, "loss": 0.1355, "step": 420 }, { "epoch": 5.675675675675675, "eval_loss": 0.7059516906738281, "eval_runtime": 1.0526, "eval_samples_per_second": 15.201, "eval_steps_per_second": 3.8, "step": 420 }, { "epoch": 5.716216216216216, "grad_norm": 2.3145079612731934, "learning_rate": 1.9155941344706546e-05, "loss": 0.1345, "step": 423 }, { "epoch": 5.716216216216216, "eval_loss": 0.705683171749115, "eval_runtime": 1.0519, "eval_samples_per_second": 15.21, "eval_steps_per_second": 3.802, "step": 423 }, { "epoch": 5.756756756756757, "grad_norm": 1.7434961795806885, "learning_rate": 1.852703704768842e-05, "loss": 0.1865, "step": 426 }, { "epoch": 5.756756756756757, "eval_loss": 0.7038547396659851, "eval_runtime": 1.0535, "eval_samples_per_second": 15.188, "eval_steps_per_second": 3.797, "step": 426 }, { "epoch": 5.797297297297297, "grad_norm": 1.5850327014923096, "learning_rate": 1.7906273704499845e-05, "loss": 0.119, "step": 429 }, { "epoch": 5.797297297297297, "eval_loss": 0.7066537737846375, "eval_runtime": 1.0521, "eval_samples_per_second": 15.208, "eval_steps_per_second": 3.802, "step": 429 }, { "epoch": 5.837837837837838, "grad_norm": 1.599552035331726, "learning_rate": 1.7293811884888344e-05, "loss": 0.149, "step": 432 }, { "epoch": 5.837837837837838, "eval_loss": 0.7120293974876404, "eval_runtime": 1.0536, "eval_samples_per_second": 15.185, "eval_steps_per_second": 3.796, "step": 432 }, { "epoch": 5.878378378378378, "grad_norm": 1.8353303670883179, "learning_rate": 1.6689810011287932e-05, "loss": 0.1748, "step": 435 }, { "epoch": 5.878378378378378, "eval_loss": 0.7123138308525085, "eval_runtime": 1.0524, "eval_samples_per_second": 15.203, "eval_steps_per_second": 3.801, "step": 435 }, { "epoch": 5.918918918918919, "grad_norm": 1.4937026500701904, "learning_rate": 1.6094424317840723e-05, "loss": 0.1781, "step": 438 }, { "epoch": 5.918918918918919, "eval_loss": 0.7113088965415955, "eval_runtime": 1.0528, "eval_samples_per_second": 15.198, "eval_steps_per_second": 3.799, "step": 438 }, { "epoch": 5.95945945945946, "grad_norm": 2.0092716217041016, "learning_rate": 1.550780880998456e-05, "loss": 0.2075, "step": 441 }, { "epoch": 5.95945945945946, "eval_loss": 0.7117879390716553, "eval_runtime": 1.0532, "eval_samples_per_second": 15.192, "eval_steps_per_second": 3.798, "step": 441 }, { "epoch": 6.0, "grad_norm": 2.762338161468506, "learning_rate": 1.4930115224617353e-05, "loss": 0.1591, "step": 444 }, { "epoch": 6.0, "eval_loss": 0.7111848592758179, "eval_runtime": 1.0522, "eval_samples_per_second": 15.206, "eval_steps_per_second": 3.801, "step": 444 }, { "epoch": 6.04054054054054, "grad_norm": 1.825244665145874, "learning_rate": 1.436149299084789e-05, "loss": 0.1224, "step": 447 }, { "epoch": 6.04054054054054, "eval_loss": 0.7117843627929688, "eval_runtime": 1.0529, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 447 }, { "epoch": 6.081081081081081, "grad_norm": 0.9274085760116577, "learning_rate": 1.380208919134392e-05, "loss": 0.2234, "step": 450 }, { "epoch": 6.081081081081081, "eval_loss": 0.7170644402503967, "eval_runtime": 1.0513, "eval_samples_per_second": 15.219, "eval_steps_per_second": 3.805, "step": 450 }, { "epoch": 6.121621621621622, "grad_norm": 1.5220532417297363, "learning_rate": 1.3252048524286842e-05, "loss": 0.1165, "step": 453 }, { "epoch": 6.121621621621622, "eval_loss": 0.7227377891540527, "eval_runtime": 1.0532, "eval_samples_per_second": 15.191, "eval_steps_per_second": 3.798, "step": 453 }, { "epoch": 6.162162162162162, "grad_norm": 1.669662594795227, "learning_rate": 1.271151326594352e-05, "loss": 0.2518, "step": 456 }, { "epoch": 6.162162162162162, "eval_loss": 0.7325636148452759, "eval_runtime": 1.0523, "eval_samples_per_second": 15.205, "eval_steps_per_second": 3.801, "step": 456 }, { "epoch": 6.202702702702703, "grad_norm": 1.6538748741149902, "learning_rate": 1.2180623233864253e-05, "loss": 0.1288, "step": 459 }, { "epoch": 6.202702702702703, "eval_loss": 0.7430564165115356, "eval_runtime": 1.0597, "eval_samples_per_second": 15.099, "eval_steps_per_second": 3.775, "step": 459 }, { "epoch": 6.243243243243243, "grad_norm": 1.5836577415466309, "learning_rate": 1.1659515750716955e-05, "loss": 0.1176, "step": 462 }, { "epoch": 6.243243243243243, "eval_loss": 0.7481391429901123, "eval_runtime": 1.0512, "eval_samples_per_second": 15.221, "eval_steps_per_second": 3.805, "step": 462 }, { "epoch": 6.283783783783784, "grad_norm": 1.0982418060302734, "learning_rate": 1.1148325608766585e-05, "loss": 0.1231, "step": 465 }, { "epoch": 6.283783783783784, "eval_loss": 0.7511347532272339, "eval_runtime": 1.0552, "eval_samples_per_second": 15.163, "eval_steps_per_second": 3.791, "step": 465 }, { "epoch": 6.324324324324325, "grad_norm": 1.9232176542282104, "learning_rate": 1.0647185035009038e-05, "loss": 0.146, "step": 468 }, { "epoch": 6.324324324324325, "eval_loss": 0.7529792785644531, "eval_runtime": 1.0535, "eval_samples_per_second": 15.188, "eval_steps_per_second": 3.797, "step": 468 }, { "epoch": 6.364864864864865, "grad_norm": 2.5786333084106445, "learning_rate": 1.0156223656968694e-05, "loss": 0.1169, "step": 471 }, { "epoch": 6.364864864864865, "eval_loss": 0.7518468499183655, "eval_runtime": 1.0523, "eval_samples_per_second": 15.205, "eval_steps_per_second": 3.801, "step": 471 }, { "epoch": 6.405405405405405, "grad_norm": 1.4718759059906006, "learning_rate": 9.675568469168388e-06, "loss": 0.1048, "step": 474 }, { "epoch": 6.405405405405405, "eval_loss": 0.7540909051895142, "eval_runtime": 1.049, "eval_samples_per_second": 15.253, "eval_steps_per_second": 3.813, "step": 474 }, { "epoch": 6.445945945945946, "grad_norm": 1.3492368459701538, "learning_rate": 9.205343800280219e-06, "loss": 0.1092, "step": 477 }, { "epoch": 6.445945945945946, "eval_loss": 0.750686764717102, "eval_runtime": 1.0533, "eval_samples_per_second": 15.19, "eval_steps_per_second": 3.798, "step": 477 }, { "epoch": 6.486486486486487, "grad_norm": 2.10587739944458, "learning_rate": 8.745671280966177e-06, "loss": 0.1458, "step": 480 }, { "epoch": 6.486486486486487, "eval_loss": 0.7518497705459595, "eval_runtime": 1.0499, "eval_samples_per_second": 15.239, "eval_steps_per_second": 3.81, "step": 480 }, { "epoch": 6.527027027027027, "grad_norm": 0.8871177434921265, "learning_rate": 8.296669812416547e-06, "loss": 0.2177, "step": 483 }, { "epoch": 6.527027027027027, "eval_loss": 0.7509324550628662, "eval_runtime": 1.0528, "eval_samples_per_second": 15.198, "eval_steps_per_second": 3.8, "step": 483 }, { "epoch": 6.5675675675675675, "grad_norm": 1.299116611480713, "learning_rate": 7.858455535594306e-06, "loss": 0.1585, "step": 486 }, { "epoch": 6.5675675675675675, "eval_loss": 0.7509753108024597, "eval_runtime": 1.0507, "eval_samples_per_second": 15.228, "eval_steps_per_second": 3.807, "step": 486 }, { "epoch": 6.608108108108108, "grad_norm": 1.8996071815490723, "learning_rate": 7.431141801193508e-06, "loss": 0.1337, "step": 489 }, { "epoch": 6.608108108108108, "eval_loss": 0.7546273469924927, "eval_runtime": 1.0538, "eval_samples_per_second": 15.183, "eval_steps_per_second": 3.796, "step": 489 }, { "epoch": 6.648648648648649, "grad_norm": 2.193199634552002, "learning_rate": 7.014839140319485e-06, "loss": 0.122, "step": 492 }, { "epoch": 6.648648648648649, "eval_loss": 0.7523775100708008, "eval_runtime": 1.0517, "eval_samples_per_second": 15.213, "eval_steps_per_second": 3.803, "step": 492 }, { "epoch": 6.6891891891891895, "grad_norm": 1.310517430305481, "learning_rate": 6.609655235898227e-06, "loss": 0.0793, "step": 495 }, { "epoch": 6.6891891891891895, "eval_loss": 0.7553800344467163, "eval_runtime": 1.0524, "eval_samples_per_second": 15.203, "eval_steps_per_second": 3.801, "step": 495 }, { "epoch": 6.72972972972973, "grad_norm": 1.7615861892700195, "learning_rate": 6.215694894822699e-06, "loss": 0.1544, "step": 498 }, { "epoch": 6.72972972972973, "eval_loss": 0.7521288394927979, "eval_runtime": 1.0505, "eval_samples_per_second": 15.231, "eval_steps_per_second": 3.808, "step": 498 }, { "epoch": 6.77027027027027, "grad_norm": 1.4952490329742432, "learning_rate": 5.83306002084284e-06, "loss": 0.1387, "step": 501 }, { "epoch": 6.77027027027027, "eval_loss": 0.7528640627861023, "eval_runtime": 1.052, "eval_samples_per_second": 15.209, "eval_steps_per_second": 3.802, "step": 501 }, { "epoch": 6.8108108108108105, "grad_norm": 1.7409045696258545, "learning_rate": 5.461849588206724e-06, "loss": 0.1253, "step": 504 }, { "epoch": 6.8108108108108105, "eval_loss": 0.7528926134109497, "eval_runtime": 1.059, "eval_samples_per_second": 15.108, "eval_steps_per_second": 3.777, "step": 504 }, { "epoch": 6.851351351351351, "grad_norm": 0.7362686395645142, "learning_rate": 5.102159616059365e-06, "loss": 0.1296, "step": 507 }, { "epoch": 6.851351351351351, "eval_loss": 0.7542049884796143, "eval_runtime": 1.0521, "eval_samples_per_second": 15.207, "eval_steps_per_second": 3.802, "step": 507 }, { "epoch": 6.891891891891892, "grad_norm": 0.806505560874939, "learning_rate": 4.754083143605869e-06, "loss": 0.1094, "step": 510 }, { "epoch": 6.891891891891892, "eval_loss": 0.7515612840652466, "eval_runtime": 1.0559, "eval_samples_per_second": 15.152, "eval_steps_per_second": 3.788, "step": 510 }, { "epoch": 6.9324324324324325, "grad_norm": 1.5709373950958252, "learning_rate": 4.417710206045533e-06, "loss": 0.1009, "step": 513 }, { "epoch": 6.9324324324324325, "eval_loss": 0.751240611076355, "eval_runtime": 1.0523, "eval_samples_per_second": 15.205, "eval_steps_per_second": 3.801, "step": 513 }, { "epoch": 6.972972972972973, "grad_norm": 1.2641761302947998, "learning_rate": 4.093127811282821e-06, "loss": 0.1871, "step": 516 }, { "epoch": 6.972972972972973, "eval_loss": 0.7525576949119568, "eval_runtime": 1.0539, "eval_samples_per_second": 15.181, "eval_steps_per_second": 3.795, "step": 516 }, { "epoch": 7.013513513513513, "grad_norm": 0.9734938144683838, "learning_rate": 3.7804199174215183e-06, "loss": 0.1017, "step": 519 }, { "epoch": 7.013513513513513, "eval_loss": 0.7537960410118103, "eval_runtime": 1.0511, "eval_samples_per_second": 15.222, "eval_steps_per_second": 3.805, "step": 519 }, { "epoch": 7.054054054054054, "grad_norm": 1.4745818376541138, "learning_rate": 3.479667411047677e-06, "loss": 0.1536, "step": 522 }, { "epoch": 7.054054054054054, "eval_loss": 0.7529079914093018, "eval_runtime": 1.0543, "eval_samples_per_second": 15.176, "eval_steps_per_second": 3.794, "step": 522 }, { "epoch": 7.094594594594595, "grad_norm": 1.0725492238998413, "learning_rate": 3.1909480863070884e-06, "loss": 0.0886, "step": 525 }, { "epoch": 7.094594594594595, "eval_loss": 0.7565038204193115, "eval_runtime": 1.0511, "eval_samples_per_second": 15.222, "eval_steps_per_second": 3.806, "step": 525 }, { "epoch": 7.135135135135135, "grad_norm": 1.1345540285110474, "learning_rate": 2.9143366247826598e-06, "loss": 0.0983, "step": 528 }, { "epoch": 7.135135135135135, "eval_loss": 0.7576066255569458, "eval_runtime": 1.0528, "eval_samples_per_second": 15.198, "eval_steps_per_second": 3.799, "step": 528 }, { "epoch": 7.175675675675675, "grad_norm": 1.122189998626709, "learning_rate": 2.6499045761769315e-06, "loss": 0.084, "step": 531 }, { "epoch": 7.175675675675675, "eval_loss": 0.758578896522522, "eval_runtime": 1.0508, "eval_samples_per_second": 15.227, "eval_steps_per_second": 3.807, "step": 531 }, { "epoch": 7.216216216216216, "grad_norm": 1.6193064451217651, "learning_rate": 2.397720339804649e-06, "loss": 0.099, "step": 534 }, { "epoch": 7.216216216216216, "eval_loss": 0.7563527822494507, "eval_runtime": 1.0563, "eval_samples_per_second": 15.147, "eval_steps_per_second": 3.787, "step": 534 }, { "epoch": 7.256756756756757, "grad_norm": 1.373356580734253, "learning_rate": 2.1578491469002373e-06, "loss": 0.1089, "step": 537 }, { "epoch": 7.256756756756757, "eval_loss": 0.7592064142227173, "eval_runtime": 1.0528, "eval_samples_per_second": 15.197, "eval_steps_per_second": 3.799, "step": 537 }, { "epoch": 7.297297297297297, "grad_norm": 1.1875869035720825, "learning_rate": 1.9303530437448035e-06, "loss": 0.1145, "step": 540 }, { "epoch": 7.297297297297297, "eval_loss": 0.7611518502235413, "eval_runtime": 1.0529, "eval_samples_per_second": 15.196, "eval_steps_per_second": 3.799, "step": 540 }, { "epoch": 7.337837837837838, "grad_norm": 1.8787821531295776, "learning_rate": 1.7152908756169262e-06, "loss": 0.1823, "step": 543 }, { "epoch": 7.337837837837838, "eval_loss": 0.7614726424217224, "eval_runtime": 1.0548, "eval_samples_per_second": 15.168, "eval_steps_per_second": 3.792, "step": 543 }, { "epoch": 7.378378378378378, "grad_norm": 1.9469506740570068, "learning_rate": 1.5127182715714006e-06, "loss": 0.2784, "step": 546 }, { "epoch": 7.378378378378378, "eval_loss": 0.7602246999740601, "eval_runtime": 1.053, "eval_samples_per_second": 15.194, "eval_steps_per_second": 3.799, "step": 546 }, { "epoch": 7.418918918918919, "grad_norm": 1.6328327655792236, "learning_rate": 1.3226876300500123e-06, "loss": 0.0887, "step": 549 }, { "epoch": 7.418918918918919, "eval_loss": 0.7616763114929199, "eval_runtime": 1.0504, "eval_samples_per_second": 15.232, "eval_steps_per_second": 3.808, "step": 549 }, { "epoch": 7.45945945945946, "grad_norm": 1.5713064670562744, "learning_rate": 1.1452481053278396e-06, "loss": 0.1133, "step": 552 }, { "epoch": 7.45945945945946, "eval_loss": 0.7640103101730347, "eval_runtime": 1.053, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 552 }, { "epoch": 7.5, "grad_norm": 1.5901539325714111, "learning_rate": 9.804455947988067e-07, "loss": 0.1207, "step": 555 }, { "epoch": 7.5, "eval_loss": 0.7629836797714233, "eval_runtime": 1.0516, "eval_samples_per_second": 15.216, "eval_steps_per_second": 3.804, "step": 555 }, { "epoch": 7.54054054054054, "grad_norm": 1.5648808479309082, "learning_rate": 8.283227271035976e-07, "loss": 0.0954, "step": 558 }, { "epoch": 7.54054054054054, "eval_loss": 0.7643275260925293, "eval_runtime": 1.0548, "eval_samples_per_second": 15.169, "eval_steps_per_second": 3.792, "step": 558 }, { "epoch": 7.581081081081081, "grad_norm": 1.6403340101242065, "learning_rate": 6.889188511031542e-07, "loss": 0.1135, "step": 561 }, { "epoch": 7.581081081081081, "eval_loss": 0.7628697156906128, "eval_runtime": 1.0531, "eval_samples_per_second": 15.194, "eval_steps_per_second": 3.798, "step": 561 }, { "epoch": 7.621621621621622, "grad_norm": 1.393983244895935, "learning_rate": 5.622700257004676e-07, "loss": 0.096, "step": 564 }, { "epoch": 7.621621621621622, "eval_loss": 0.7637063264846802, "eval_runtime": 1.0544, "eval_samples_per_second": 15.174, "eval_steps_per_second": 3.793, "step": 564 }, { "epoch": 7.662162162162162, "grad_norm": 1.2016361951828003, "learning_rate": 4.484090105134231e-07, "loss": 0.1088, "step": 567 }, { "epoch": 7.662162162162162, "eval_loss": 0.7655338048934937, "eval_runtime": 1.0534, "eval_samples_per_second": 15.189, "eval_steps_per_second": 3.797, "step": 567 }, { "epoch": 7.702702702702703, "grad_norm": 1.1388864517211914, "learning_rate": 3.4736525740104444e-07, "loss": 0.1628, "step": 570 }, { "epoch": 7.702702702702703, "eval_loss": 0.7655097842216492, "eval_runtime": 1.053, "eval_samples_per_second": 15.195, "eval_steps_per_second": 3.799, "step": 570 }, { "epoch": 7.743243243243243, "grad_norm": 1.9650497436523438, "learning_rate": 2.591649028453047e-07, "loss": 0.1431, "step": 573 }, { "epoch": 7.743243243243243, "eval_loss": 0.7649960517883301, "eval_runtime": 1.0519, "eval_samples_per_second": 15.211, "eval_steps_per_second": 3.803, "step": 573 }, { "epoch": 7.783783783783784, "grad_norm": 1.7549225091934204, "learning_rate": 1.8383076119053432e-07, "loss": 0.1034, "step": 576 }, { "epoch": 7.783783783783784, "eval_loss": 0.763870358467102, "eval_runtime": 1.0529, "eval_samples_per_second": 15.196, "eval_steps_per_second": 3.799, "step": 576 }, { "epoch": 7.824324324324325, "grad_norm": 1.7549595832824707, "learning_rate": 1.2138231874217475e-07, "loss": 0.181, "step": 579 }, { "epoch": 7.824324324324325, "eval_loss": 0.7637079358100891, "eval_runtime": 1.0546, "eval_samples_per_second": 15.172, "eval_steps_per_second": 3.793, "step": 579 }, { "epoch": 7.864864864864865, "grad_norm": 1.3891515731811523, "learning_rate": 7.183572872632715e-08, "loss": 0.062, "step": 582 }, { "epoch": 7.864864864864865, "eval_loss": 0.7649126052856445, "eval_runtime": 1.0509, "eval_samples_per_second": 15.225, "eval_steps_per_second": 3.806, "step": 582 }, { "epoch": 7.905405405405405, "grad_norm": 1.0669249296188354, "learning_rate": 3.5203807111489074e-08, "loss": 0.0769, "step": 585 }, { "epoch": 7.905405405405405, "eval_loss": 0.7653980255126953, "eval_runtime": 1.0536, "eval_samples_per_second": 15.185, "eval_steps_per_second": 3.796, "step": 585 }, { "epoch": 7.945945945945946, "grad_norm": 2.3302104473114014, "learning_rate": 1.1496029293511789e-08, "loss": 0.1951, "step": 588 }, { "epoch": 7.945945945945946, "eval_loss": 0.7646524906158447, "eval_runtime": 1.0566, "eval_samples_per_second": 15.143, "eval_steps_per_second": 3.786, "step": 588 }, { "epoch": 7.986486486486487, "grad_norm": 1.9744952917099, "learning_rate": 7.185276446441958e-10, "loss": 0.1175, "step": 591 }, { "epoch": 7.986486486486487, "eval_loss": 0.765015721321106, "eval_runtime": 1.0522, "eval_samples_per_second": 15.206, "eval_steps_per_second": 3.801, "step": 591 }, { "epoch": 8.0, "step": 592, "total_flos": 1188976147968000.0, "train_loss": 0.4161318518926163, "train_runtime": 741.424, "train_samples_per_second": 3.194, "train_steps_per_second": 0.798 } ], "logging_steps": 3, "max_steps": 592, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1188976147968000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }