| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 8.0, | |
| "eval_steps": 3, | |
| "global_step": 592, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.04054054054054054, | |
| "grad_norm": 5.807250022888184, | |
| "learning_rate": 5e-05, | |
| "loss": 3.1119, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.04054054054054054, | |
| "eval_loss": 3.1016640663146973, | |
| "eval_runtime": 1.0551, | |
| "eval_samples_per_second": 15.164, | |
| "eval_steps_per_second": 3.791, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.08108108108108109, | |
| "grad_norm": 4.004100322723389, | |
| "learning_rate": 0.0001, | |
| "loss": 2.8734, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.08108108108108109, | |
| "eval_loss": 2.6094236373901367, | |
| "eval_runtime": 1.0592, | |
| "eval_samples_per_second": 15.106, | |
| "eval_steps_per_second": 3.777, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.12162162162162163, | |
| "grad_norm": 3.935053586959839, | |
| "learning_rate": 9.999353337510526e-05, | |
| "loss": 2.4188, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.12162162162162163, | |
| "eval_loss": 2.1545872688293457, | |
| "eval_runtime": 1.0511, | |
| "eval_samples_per_second": 15.222, | |
| "eval_steps_per_second": 3.805, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.16216216216216217, | |
| "grad_norm": 5.741048812866211, | |
| "learning_rate": 9.997413517311055e-05, | |
| "loss": 1.9335, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.16216216216216217, | |
| "eval_loss": 1.786160945892334, | |
| "eval_runtime": 1.0532, | |
| "eval_samples_per_second": 15.192, | |
| "eval_steps_per_second": 3.798, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.20270270270270271, | |
| "grad_norm": 4.155601978302002, | |
| "learning_rate": 9.99418104116517e-05, | |
| "loss": 1.5361, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.20270270270270271, | |
| "eval_loss": 1.4731855392456055, | |
| "eval_runtime": 1.0511, | |
| "eval_samples_per_second": 15.222, | |
| "eval_steps_per_second": 3.805, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.24324324324324326, | |
| "grad_norm": 2.4831109046936035, | |
| "learning_rate": 9.989656745201298e-05, | |
| "loss": 1.314, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.24324324324324326, | |
| "eval_loss": 1.2790606021881104, | |
| "eval_runtime": 1.0553, | |
| "eval_samples_per_second": 15.161, | |
| "eval_steps_per_second": 3.79, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.28378378378378377, | |
| "grad_norm": 1.9509971141815186, | |
| "learning_rate": 9.983841799696438e-05, | |
| "loss": 1.1747, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.28378378378378377, | |
| "eval_loss": 1.1653475761413574, | |
| "eval_runtime": 1.0512, | |
| "eval_samples_per_second": 15.221, | |
| "eval_steps_per_second": 3.805, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.32432432432432434, | |
| "grad_norm": 2.245741367340088, | |
| "learning_rate": 9.976737708773445e-05, | |
| "loss": 1.1407, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.32432432432432434, | |
| "eval_loss": 1.110356092453003, | |
| "eval_runtime": 1.0534, | |
| "eval_samples_per_second": 15.188, | |
| "eval_steps_per_second": 3.797, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.36486486486486486, | |
| "grad_norm": 2.0690531730651855, | |
| "learning_rate": 9.968346310011964e-05, | |
| "loss": 1.1734, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.36486486486486486, | |
| "eval_loss": 1.088733434677124, | |
| "eval_runtime": 1.0508, | |
| "eval_samples_per_second": 15.226, | |
| "eval_steps_per_second": 3.806, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.40540540540540543, | |
| "grad_norm": 1.8963656425476074, | |
| "learning_rate": 9.958669773973123e-05, | |
| "loss": 1.0495, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.40540540540540543, | |
| "eval_loss": 1.0401344299316406, | |
| "eval_runtime": 1.0528, | |
| "eval_samples_per_second": 15.197, | |
| "eval_steps_per_second": 3.799, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.44594594594594594, | |
| "grad_norm": 1.753909945487976, | |
| "learning_rate": 9.947710603638078e-05, | |
| "loss": 1.0401, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.44594594594594594, | |
| "eval_loss": 0.990611732006073, | |
| "eval_runtime": 1.0507, | |
| "eval_samples_per_second": 15.227, | |
| "eval_steps_per_second": 3.807, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.4864864864864865, | |
| "grad_norm": 2.1073760986328125, | |
| "learning_rate": 9.935471633760573e-05, | |
| "loss": 1.0623, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.4864864864864865, | |
| "eval_loss": 0.9593618512153625, | |
| "eval_runtime": 1.0535, | |
| "eval_samples_per_second": 15.188, | |
| "eval_steps_per_second": 3.797, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.527027027027027, | |
| "grad_norm": 1.5675249099731445, | |
| "learning_rate": 9.921956030133701e-05, | |
| "loss": 0.8152, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.527027027027027, | |
| "eval_loss": 0.9366932511329651, | |
| "eval_runtime": 1.0514, | |
| "eval_samples_per_second": 15.218, | |
| "eval_steps_per_second": 3.805, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.5675675675675675, | |
| "grad_norm": 2.219888210296631, | |
| "learning_rate": 9.907167288771019e-05, | |
| "loss": 0.9261, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.5675675675675675, | |
| "eval_loss": 0.9247606992721558, | |
| "eval_runtime": 1.0532, | |
| "eval_samples_per_second": 15.192, | |
| "eval_steps_per_second": 3.798, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.6081081081081081, | |
| "grad_norm": 1.6866446733474731, | |
| "learning_rate": 9.891109235002249e-05, | |
| "loss": 0.9469, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.6081081081081081, | |
| "eval_loss": 0.9134540557861328, | |
| "eval_runtime": 1.0562, | |
| "eval_samples_per_second": 15.149, | |
| "eval_steps_per_second": 3.787, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.6486486486486487, | |
| "grad_norm": 1.7272800207138062, | |
| "learning_rate": 9.8737860224838e-05, | |
| "loss": 0.8381, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.6486486486486487, | |
| "eval_loss": 0.8871217370033264, | |
| "eval_runtime": 1.0527, | |
| "eval_samples_per_second": 15.199, | |
| "eval_steps_per_second": 3.8, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.6891891891891891, | |
| "grad_norm": 2.6152303218841553, | |
| "learning_rate": 9.855202132124365e-05, | |
| "loss": 0.8456, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.6891891891891891, | |
| "eval_loss": 0.8553087711334229, | |
| "eval_runtime": 1.0521, | |
| "eval_samples_per_second": 15.208, | |
| "eval_steps_per_second": 3.802, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.7297297297297297, | |
| "grad_norm": 1.8282960653305054, | |
| "learning_rate": 9.835362370925868e-05, | |
| "loss": 0.908, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.7297297297297297, | |
| "eval_loss": 0.8271682858467102, | |
| "eval_runtime": 1.052, | |
| "eval_samples_per_second": 15.21, | |
| "eval_steps_per_second": 3.802, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.7702702702702703, | |
| "grad_norm": 2.466750383377075, | |
| "learning_rate": 9.814271870740054e-05, | |
| "loss": 0.999, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.7702702702702703, | |
| "eval_loss": 0.8151593208312988, | |
| "eval_runtime": 1.0549, | |
| "eval_samples_per_second": 15.167, | |
| "eval_steps_per_second": 3.792, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.8108108108108109, | |
| "grad_norm": 1.8908120393753052, | |
| "learning_rate": 9.791936086941064e-05, | |
| "loss": 0.897, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.8108108108108109, | |
| "eval_loss": 0.8052847981452942, | |
| "eval_runtime": 1.0512, | |
| "eval_samples_per_second": 15.22, | |
| "eval_steps_per_second": 3.805, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.8513513513513513, | |
| "grad_norm": 1.9563689231872559, | |
| "learning_rate": 9.768360797014324e-05, | |
| "loss": 0.8747, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.8513513513513513, | |
| "eval_loss": 0.7914941906929016, | |
| "eval_runtime": 1.0519, | |
| "eval_samples_per_second": 15.21, | |
| "eval_steps_per_second": 3.803, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.8918918918918919, | |
| "grad_norm": 1.9292480945587158, | |
| "learning_rate": 9.7435520990621e-05, | |
| "loss": 1.0646, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.8918918918918919, | |
| "eval_loss": 0.7872657179832458, | |
| "eval_runtime": 1.0526, | |
| "eval_samples_per_second": 15.201, | |
| "eval_steps_per_second": 3.8, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.9324324324324325, | |
| "grad_norm": 1.7248555421829224, | |
| "learning_rate": 9.717516410226145e-05, | |
| "loss": 0.6771, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.9324324324324325, | |
| "eval_loss": 0.7814666628837585, | |
| "eval_runtime": 1.0522, | |
| "eval_samples_per_second": 15.207, | |
| "eval_steps_per_second": 3.802, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.972972972972973, | |
| "grad_norm": 2.171896457672119, | |
| "learning_rate": 9.690260465027801e-05, | |
| "loss": 0.9386, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.972972972972973, | |
| "eval_loss": 0.7634860873222351, | |
| "eval_runtime": 1.0498, | |
| "eval_samples_per_second": 15.241, | |
| "eval_steps_per_second": 3.81, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 1.0135135135135136, | |
| "grad_norm": 1.625179409980774, | |
| "learning_rate": 9.661791313626018e-05, | |
| "loss": 0.6348, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 1.0135135135135136, | |
| "eval_loss": 0.75515216588974, | |
| "eval_runtime": 1.0536, | |
| "eval_samples_per_second": 15.186, | |
| "eval_steps_per_second": 3.796, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 1.054054054054054, | |
| "grad_norm": 1.4293404817581177, | |
| "learning_rate": 9.632116319993725e-05, | |
| "loss": 0.5763, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 1.054054054054054, | |
| "eval_loss": 0.7473800182342529, | |
| "eval_runtime": 1.0524, | |
| "eval_samples_per_second": 15.203, | |
| "eval_steps_per_second": 3.801, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 1.0945945945945945, | |
| "grad_norm": 1.9279707670211792, | |
| "learning_rate": 9.601243160013023e-05, | |
| "loss": 0.7059, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.0945945945945945, | |
| "eval_loss": 0.7430617213249207, | |
| "eval_runtime": 1.0539, | |
| "eval_samples_per_second": 15.181, | |
| "eval_steps_per_second": 3.795, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 1.135135135135135, | |
| "grad_norm": 1.7644144296646118, | |
| "learning_rate": 9.56917981948971e-05, | |
| "loss": 0.6111, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.135135135135135, | |
| "eval_loss": 0.7393875122070312, | |
| "eval_runtime": 1.0525, | |
| "eval_samples_per_second": 15.202, | |
| "eval_steps_per_second": 3.8, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.1756756756756757, | |
| "grad_norm": 1.4910467863082886, | |
| "learning_rate": 9.535934592087627e-05, | |
| "loss": 0.6937, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.1756756756756757, | |
| "eval_loss": 0.7415614724159241, | |
| "eval_runtime": 1.0533, | |
| "eval_samples_per_second": 15.191, | |
| "eval_steps_per_second": 3.798, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 1.2162162162162162, | |
| "grad_norm": 1.989018440246582, | |
| "learning_rate": 9.50151607718338e-05, | |
| "loss": 0.6408, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.2162162162162162, | |
| "eval_loss": 0.7331891059875488, | |
| "eval_runtime": 1.0504, | |
| "eval_samples_per_second": 15.232, | |
| "eval_steps_per_second": 3.808, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.2567567567567568, | |
| "grad_norm": 1.5546590089797974, | |
| "learning_rate": 9.465933177641982e-05, | |
| "loss": 0.5931, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.2567567567567568, | |
| "eval_loss": 0.7319458723068237, | |
| "eval_runtime": 1.0532, | |
| "eval_samples_per_second": 15.191, | |
| "eval_steps_per_second": 3.798, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.2972972972972974, | |
| "grad_norm": 2.128746271133423, | |
| "learning_rate": 9.429195097513993e-05, | |
| "loss": 0.5792, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.2972972972972974, | |
| "eval_loss": 0.7179479598999023, | |
| "eval_runtime": 1.0504, | |
| "eval_samples_per_second": 15.232, | |
| "eval_steps_per_second": 3.808, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.3378378378378377, | |
| "grad_norm": 2.069204092025757, | |
| "learning_rate": 9.391311339654753e-05, | |
| "loss": 0.5502, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.3378378378378377, | |
| "eval_loss": 0.7083268165588379, | |
| "eval_runtime": 1.0531, | |
| "eval_samples_per_second": 15.193, | |
| "eval_steps_per_second": 3.798, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.3783783783783785, | |
| "grad_norm": 2.069469928741455, | |
| "learning_rate": 9.352291703266331e-05, | |
| "loss": 0.7356, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.3783783783783785, | |
| "eval_loss": 0.7048563957214355, | |
| "eval_runtime": 1.0519, | |
| "eval_samples_per_second": 15.21, | |
| "eval_steps_per_second": 3.803, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.4189189189189189, | |
| "grad_norm": 1.507051706314087, | |
| "learning_rate": 9.31214628136281e-05, | |
| "loss": 0.5204, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.4189189189189189, | |
| "eval_loss": 0.6983195543289185, | |
| "eval_runtime": 1.0543, | |
| "eval_samples_per_second": 15.176, | |
| "eval_steps_per_second": 3.794, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.4594594594594594, | |
| "grad_norm": 1.918865442276001, | |
| "learning_rate": 9.270885458159575e-05, | |
| "loss": 0.6132, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.4594594594594594, | |
| "eval_loss": 0.6857842803001404, | |
| "eval_runtime": 1.0525, | |
| "eval_samples_per_second": 15.202, | |
| "eval_steps_per_second": 3.8, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 2.062997341156006, | |
| "learning_rate": 9.228519906387288e-05, | |
| "loss": 0.7527, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "eval_loss": 0.6743776798248291, | |
| "eval_runtime": 1.0512, | |
| "eval_samples_per_second": 15.221, | |
| "eval_steps_per_second": 3.805, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.5405405405405406, | |
| "grad_norm": 1.8099018335342407, | |
| "learning_rate": 9.185060584531217e-05, | |
| "loss": 0.6798, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.5405405405405406, | |
| "eval_loss": 0.6715844869613647, | |
| "eval_runtime": 1.0529, | |
| "eval_samples_per_second": 15.196, | |
| "eval_steps_per_second": 3.799, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.5810810810810811, | |
| "grad_norm": 2.0540611743927, | |
| "learning_rate": 9.140518733996672e-05, | |
| "loss": 0.7266, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.5810810810810811, | |
| "eval_loss": 0.6656138896942139, | |
| "eval_runtime": 1.0523, | |
| "eval_samples_per_second": 15.204, | |
| "eval_steps_per_second": 3.801, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.6216216216216215, | |
| "grad_norm": 2.3945634365081787, | |
| "learning_rate": 9.094905876201229e-05, | |
| "loss": 0.5347, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.6216216216216215, | |
| "eval_loss": 0.6710730791091919, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.6621621621621623, | |
| "grad_norm": 2.006612777709961, | |
| "learning_rate": 9.048233809594561e-05, | |
| "loss": 0.6522, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.6621621621621623, | |
| "eval_loss": 0.6679877042770386, | |
| "eval_runtime": 1.0519, | |
| "eval_samples_per_second": 15.211, | |
| "eval_steps_per_second": 3.803, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.7027027027027026, | |
| "grad_norm": 1.751696228981018, | |
| "learning_rate": 9.000514606606581e-05, | |
| "loss": 0.8567, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.7027027027027026, | |
| "eval_loss": 0.6558159589767456, | |
| "eval_runtime": 1.0531, | |
| "eval_samples_per_second": 15.193, | |
| "eval_steps_per_second": 3.798, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.7432432432432432, | |
| "grad_norm": 1.5286139249801636, | |
| "learning_rate": 8.951760610524724e-05, | |
| "loss": 0.5204, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.7432432432432432, | |
| "eval_loss": 0.6488269567489624, | |
| "eval_runtime": 1.0516, | |
| "eval_samples_per_second": 15.215, | |
| "eval_steps_per_second": 3.804, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.7837837837837838, | |
| "grad_norm": 2.1092898845672607, | |
| "learning_rate": 8.901984432301185e-05, | |
| "loss": 0.6443, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.7837837837837838, | |
| "eval_loss": 0.6392868161201477, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.8243243243243243, | |
| "grad_norm": 1.7279053926467896, | |
| "learning_rate": 8.851198947290894e-05, | |
| "loss": 0.5436, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.8243243243243243, | |
| "eval_loss": 0.6321672201156616, | |
| "eval_runtime": 1.0499, | |
| "eval_samples_per_second": 15.239, | |
| "eval_steps_per_second": 3.81, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.864864864864865, | |
| "grad_norm": 2.6842877864837646, | |
| "learning_rate": 8.799417291921117e-05, | |
| "loss": 0.6054, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.864864864864865, | |
| "eval_loss": 0.6346270442008972, | |
| "eval_runtime": 1.0528, | |
| "eval_samples_per_second": 15.198, | |
| "eval_steps_per_second": 3.799, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.9054054054054053, | |
| "grad_norm": 1.9958398342132568, | |
| "learning_rate": 8.746652860293523e-05, | |
| "loss": 0.4488, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.9054054054054053, | |
| "eval_loss": 0.6389164924621582, | |
| "eval_runtime": 1.0505, | |
| "eval_samples_per_second": 15.231, | |
| "eval_steps_per_second": 3.808, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.945945945945946, | |
| "grad_norm": 2.0705783367156982, | |
| "learning_rate": 8.692919300719595e-05, | |
| "loss": 0.7171, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.945945945945946, | |
| "eval_loss": 0.632194995880127, | |
| "eval_runtime": 1.0537, | |
| "eval_samples_per_second": 15.184, | |
| "eval_steps_per_second": 3.796, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.9864864864864864, | |
| "grad_norm": 2.0737218856811523, | |
| "learning_rate": 8.638230512190298e-05, | |
| "loss": 0.5383, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.9864864864864864, | |
| "eval_loss": 0.6272808313369751, | |
| "eval_runtime": 1.0507, | |
| "eval_samples_per_second": 15.228, | |
| "eval_steps_per_second": 3.807, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 2.027027027027027, | |
| "grad_norm": 1.6119190454483032, | |
| "learning_rate": 8.58260064078088e-05, | |
| "loss": 0.4812, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.027027027027027, | |
| "eval_loss": 0.6234598755836487, | |
| "eval_runtime": 1.0541, | |
| "eval_samples_per_second": 15.179, | |
| "eval_steps_per_second": 3.795, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.0675675675675675, | |
| "grad_norm": 2.104738712310791, | |
| "learning_rate": 8.526044075991802e-05, | |
| "loss": 0.7911, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 2.0675675675675675, | |
| "eval_loss": 0.6295649409294128, | |
| "eval_runtime": 1.0504, | |
| "eval_samples_per_second": 15.232, | |
| "eval_steps_per_second": 3.808, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 2.108108108108108, | |
| "grad_norm": 2.041696786880493, | |
| "learning_rate": 8.468575447026651e-05, | |
| "loss": 0.514, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 2.108108108108108, | |
| "eval_loss": 0.6444165706634521, | |
| "eval_runtime": 1.0539, | |
| "eval_samples_per_second": 15.182, | |
| "eval_steps_per_second": 3.795, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 2.1486486486486487, | |
| "grad_norm": 1.7887616157531738, | |
| "learning_rate": 8.410209619008101e-05, | |
| "loss": 0.4481, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 2.1486486486486487, | |
| "eval_loss": 0.6452795267105103, | |
| "eval_runtime": 1.0508, | |
| "eval_samples_per_second": 15.227, | |
| "eval_steps_per_second": 3.807, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 2.189189189189189, | |
| "grad_norm": 2.2852938175201416, | |
| "learning_rate": 8.350961689132808e-05, | |
| "loss": 0.3983, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 2.189189189189189, | |
| "eval_loss": 0.6356573104858398, | |
| "eval_runtime": 1.0538, | |
| "eval_samples_per_second": 15.183, | |
| "eval_steps_per_second": 3.796, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 2.22972972972973, | |
| "grad_norm": 1.3814259767532349, | |
| "learning_rate": 8.290846982766305e-05, | |
| "loss": 0.2386, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 2.22972972972973, | |
| "eval_loss": 0.632733166217804, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 2.27027027027027, | |
| "grad_norm": 2.624509572982788, | |
| "learning_rate": 8.22988104947886e-05, | |
| "loss": 0.4447, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 2.27027027027027, | |
| "eval_loss": 0.6358802318572998, | |
| "eval_runtime": 1.0518, | |
| "eval_samples_per_second": 15.212, | |
| "eval_steps_per_second": 3.803, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 2.310810810810811, | |
| "grad_norm": 2.1006217002868652, | |
| "learning_rate": 8.168079659023349e-05, | |
| "loss": 0.4302, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 2.310810810810811, | |
| "eval_loss": 0.6386667490005493, | |
| "eval_runtime": 1.0534, | |
| "eval_samples_per_second": 15.188, | |
| "eval_steps_per_second": 3.797, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 2.3513513513513513, | |
| "grad_norm": 2.631301164627075, | |
| "learning_rate": 8.105458797256178e-05, | |
| "loss": 0.4514, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 2.3513513513513513, | |
| "eval_loss": 0.6402238607406616, | |
| "eval_runtime": 1.0545, | |
| "eval_samples_per_second": 15.174, | |
| "eval_steps_per_second": 3.793, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 2.391891891891892, | |
| "grad_norm": 1.4005826711654663, | |
| "learning_rate": 8.04203466200229e-05, | |
| "loss": 0.2813, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 2.391891891891892, | |
| "eval_loss": 0.6313220262527466, | |
| "eval_runtime": 1.0541, | |
| "eval_samples_per_second": 15.178, | |
| "eval_steps_per_second": 3.795, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 2.4324324324324325, | |
| "grad_norm": 2.4380390644073486, | |
| "learning_rate": 7.977823658865364e-05, | |
| "loss": 0.4747, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.4324324324324325, | |
| "eval_loss": 0.6258513927459717, | |
| "eval_runtime": 1.0533, | |
| "eval_samples_per_second": 15.191, | |
| "eval_steps_per_second": 3.798, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.472972972972973, | |
| "grad_norm": 2.3655426502227783, | |
| "learning_rate": 7.912842396984254e-05, | |
| "loss": 0.547, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 2.472972972972973, | |
| "eval_loss": 0.6256988048553467, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 2.5135135135135136, | |
| "grad_norm": 1.9949471950531006, | |
| "learning_rate": 7.847107684736792e-05, | |
| "loss": 0.3154, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 2.5135135135135136, | |
| "eval_loss": 0.6247289776802063, | |
| "eval_runtime": 1.0523, | |
| "eval_samples_per_second": 15.205, | |
| "eval_steps_per_second": 3.801, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 2.554054054054054, | |
| "grad_norm": 3.2453622817993164, | |
| "learning_rate": 7.780636525392046e-05, | |
| "loss": 0.5583, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 2.554054054054054, | |
| "eval_loss": 0.6129618883132935, | |
| "eval_runtime": 1.0519, | |
| "eval_samples_per_second": 15.21, | |
| "eval_steps_per_second": 3.803, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 2.5945945945945947, | |
| "grad_norm": 2.022986888885498, | |
| "learning_rate": 7.713446112712169e-05, | |
| "loss": 0.5726, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 2.5945945945945947, | |
| "eval_loss": 0.6086827516555786, | |
| "eval_runtime": 1.0543, | |
| "eval_samples_per_second": 15.175, | |
| "eval_steps_per_second": 3.794, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 2.635135135135135, | |
| "grad_norm": 2.429865598678589, | |
| "learning_rate": 7.645553826504969e-05, | |
| "loss": 0.4701, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 2.635135135135135, | |
| "eval_loss": 0.6085944175720215, | |
| "eval_runtime": 1.0521, | |
| "eval_samples_per_second": 15.208, | |
| "eval_steps_per_second": 3.802, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 2.6756756756756754, | |
| "grad_norm": 1.991803526878357, | |
| "learning_rate": 7.576977228128376e-05, | |
| "loss": 0.4866, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 2.6756756756756754, | |
| "eval_loss": 0.6133272647857666, | |
| "eval_runtime": 1.0535, | |
| "eval_samples_per_second": 15.187, | |
| "eval_steps_per_second": 3.797, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 2.7162162162162162, | |
| "grad_norm": 2.537832021713257, | |
| "learning_rate": 7.50773405594792e-05, | |
| "loss": 0.4015, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 2.7162162162162162, | |
| "eval_loss": 0.6213403940200806, | |
| "eval_runtime": 1.0524, | |
| "eval_samples_per_second": 15.203, | |
| "eval_steps_per_second": 3.801, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 2.756756756756757, | |
| "grad_norm": 1.758016586303711, | |
| "learning_rate": 7.437842220748441e-05, | |
| "loss": 0.4277, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.756756756756757, | |
| "eval_loss": 0.623763382434845, | |
| "eval_runtime": 1.0527, | |
| "eval_samples_per_second": 15.198, | |
| "eval_steps_per_second": 3.8, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.7972972972972974, | |
| "grad_norm": 1.8930737972259521, | |
| "learning_rate": 7.367319801101196e-05, | |
| "loss": 0.3157, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.7972972972972974, | |
| "eval_loss": 0.6248853206634521, | |
| "eval_runtime": 1.0562, | |
| "eval_samples_per_second": 15.149, | |
| "eval_steps_per_second": 3.787, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.8378378378378377, | |
| "grad_norm": 2.071988105773926, | |
| "learning_rate": 7.296185038687566e-05, | |
| "loss": 0.3883, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.8378378378378377, | |
| "eval_loss": 0.6209710240364075, | |
| "eval_runtime": 1.0518, | |
| "eval_samples_per_second": 15.212, | |
| "eval_steps_per_second": 3.803, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.8783783783783785, | |
| "grad_norm": 1.579237937927246, | |
| "learning_rate": 7.224456333580573e-05, | |
| "loss": 0.5436, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.8783783783783785, | |
| "eval_loss": 0.6127223968505859, | |
| "eval_runtime": 1.0524, | |
| "eval_samples_per_second": 15.204, | |
| "eval_steps_per_second": 3.801, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.918918918918919, | |
| "grad_norm": 2.4129927158355713, | |
| "learning_rate": 7.152152239485419e-05, | |
| "loss": 0.526, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.918918918918919, | |
| "eval_loss": 0.6055560111999512, | |
| "eval_runtime": 1.0502, | |
| "eval_samples_per_second": 15.236, | |
| "eval_steps_per_second": 3.809, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.9594594594594597, | |
| "grad_norm": 2.252251148223877, | |
| "learning_rate": 7.079291458940301e-05, | |
| "loss": 0.4465, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.9594594594594597, | |
| "eval_loss": 0.5982283353805542, | |
| "eval_runtime": 1.0529, | |
| "eval_samples_per_second": 15.197, | |
| "eval_steps_per_second": 3.799, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.9773114919662476, | |
| "learning_rate": 7.005892838478711e-05, | |
| "loss": 0.3692, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.5916565656661987, | |
| "eval_runtime": 1.0501, | |
| "eval_samples_per_second": 15.237, | |
| "eval_steps_per_second": 3.809, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 3.0405405405405403, | |
| "grad_norm": 1.1434626579284668, | |
| "learning_rate": 6.931975363754502e-05, | |
| "loss": 0.3022, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 3.0405405405405403, | |
| "eval_loss": 0.5955583453178406, | |
| "eval_runtime": 1.0535, | |
| "eval_samples_per_second": 15.187, | |
| "eval_steps_per_second": 3.797, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 3.081081081081081, | |
| "grad_norm": 1.9162238836288452, | |
| "learning_rate": 6.85755815463096e-05, | |
| "loss": 0.2875, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 3.081081081081081, | |
| "eval_loss": 0.6152929067611694, | |
| "eval_runtime": 1.0516, | |
| "eval_samples_per_second": 15.215, | |
| "eval_steps_per_second": 3.804, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 3.1216216216216215, | |
| "grad_norm": 2.688631057739258, | |
| "learning_rate": 6.782660460235174e-05, | |
| "loss": 0.5544, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 3.1216216216216215, | |
| "eval_loss": 0.6343094110488892, | |
| "eval_runtime": 1.052, | |
| "eval_samples_per_second": 15.21, | |
| "eval_steps_per_second": 3.802, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 3.1621621621621623, | |
| "grad_norm": 2.58313250541687, | |
| "learning_rate": 6.707301653978945e-05, | |
| "loss": 0.4159, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 3.1621621621621623, | |
| "eval_loss": 0.6369538307189941, | |
| "eval_runtime": 1.0524, | |
| "eval_samples_per_second": 15.203, | |
| "eval_steps_per_second": 3.801, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 3.2027027027027026, | |
| "grad_norm": 2.2415409088134766, | |
| "learning_rate": 6.63150122854758e-05, | |
| "loss": 0.4963, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 3.2027027027027026, | |
| "eval_loss": 0.6289186477661133, | |
| "eval_runtime": 1.0528, | |
| "eval_samples_per_second": 15.198, | |
| "eval_steps_per_second": 3.799, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 3.2432432432432434, | |
| "grad_norm": 2.974931240081787, | |
| "learning_rate": 6.5552787908578e-05, | |
| "loss": 0.3248, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 3.2432432432432434, | |
| "eval_loss": 0.6189987659454346, | |
| "eval_runtime": 1.0515, | |
| "eval_samples_per_second": 15.217, | |
| "eval_steps_per_second": 3.804, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 3.2837837837837838, | |
| "grad_norm": 2.0078535079956055, | |
| "learning_rate": 6.478654056986131e-05, | |
| "loss": 0.349, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 3.2837837837837838, | |
| "eval_loss": 0.6110680103302002, | |
| "eval_runtime": 1.0532, | |
| "eval_samples_per_second": 15.192, | |
| "eval_steps_per_second": 3.798, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 3.3243243243243246, | |
| "grad_norm": 2.6236143112182617, | |
| "learning_rate": 6.401646847069039e-05, | |
| "loss": 0.3107, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 3.3243243243243246, | |
| "eval_loss": 0.6120755672454834, | |
| "eval_runtime": 1.0508, | |
| "eval_samples_per_second": 15.227, | |
| "eval_steps_per_second": 3.807, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 3.364864864864865, | |
| "grad_norm": 1.75555419921875, | |
| "learning_rate": 6.32427708017615e-05, | |
| "loss": 0.2219, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 3.364864864864865, | |
| "eval_loss": 0.6196171641349792, | |
| "eval_runtime": 1.0523, | |
| "eval_samples_per_second": 15.204, | |
| "eval_steps_per_second": 3.801, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 3.4054054054054053, | |
| "grad_norm": 3.003138303756714, | |
| "learning_rate": 6.246564769157894e-05, | |
| "loss": 0.251, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 3.4054054054054053, | |
| "eval_loss": 0.6273298263549805, | |
| "eval_runtime": 1.0546, | |
| "eval_samples_per_second": 15.171, | |
| "eval_steps_per_second": 3.793, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 3.445945945945946, | |
| "grad_norm": 2.2066917419433594, | |
| "learning_rate": 6.168530015468872e-05, | |
| "loss": 0.3366, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 3.445945945945946, | |
| "eval_loss": 0.6258885860443115, | |
| "eval_runtime": 1.0514, | |
| "eval_samples_per_second": 15.217, | |
| "eval_steps_per_second": 3.804, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 3.4864864864864864, | |
| "grad_norm": 1.7121000289916992, | |
| "learning_rate": 6.0901930039683184e-05, | |
| "loss": 0.3182, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 3.4864864864864864, | |
| "eval_loss": 0.6243223547935486, | |
| "eval_runtime": 1.0739, | |
| "eval_samples_per_second": 14.898, | |
| "eval_steps_per_second": 3.725, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 3.527027027027027, | |
| "grad_norm": 2.7600913047790527, | |
| "learning_rate": 6.011573997698985e-05, | |
| "loss": 0.4133, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 3.527027027027027, | |
| "eval_loss": 0.6259996294975281, | |
| "eval_runtime": 1.0561, | |
| "eval_samples_per_second": 15.151, | |
| "eval_steps_per_second": 3.788, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 3.5675675675675675, | |
| "grad_norm": 2.611302614212036, | |
| "learning_rate": 5.9326933326457956e-05, | |
| "loss": 0.3297, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 3.5675675675675675, | |
| "eval_loss": 0.6303350925445557, | |
| "eval_runtime": 1.0534, | |
| "eval_samples_per_second": 15.189, | |
| "eval_steps_per_second": 3.797, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 3.608108108108108, | |
| "grad_norm": 1.6527258157730103, | |
| "learning_rate": 5.8535714124756434e-05, | |
| "loss": 0.2276, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 3.608108108108108, | |
| "eval_loss": 0.6364917159080505, | |
| "eval_runtime": 1.052, | |
| "eval_samples_per_second": 15.209, | |
| "eval_steps_per_second": 3.802, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 3.6486486486486487, | |
| "grad_norm": 1.1108059883117676, | |
| "learning_rate": 5.774228703259678e-05, | |
| "loss": 0.1842, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.6486486486486487, | |
| "eval_loss": 0.6382502317428589, | |
| "eval_runtime": 1.0549, | |
| "eval_samples_per_second": 15.168, | |
| "eval_steps_per_second": 3.792, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.689189189189189, | |
| "grad_norm": 2.822380781173706, | |
| "learning_rate": 5.694685728179442e-05, | |
| "loss": 0.4961, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 3.689189189189189, | |
| "eval_loss": 0.6313918828964233, | |
| "eval_runtime": 1.0523, | |
| "eval_samples_per_second": 15.205, | |
| "eval_steps_per_second": 3.801, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 3.72972972972973, | |
| "grad_norm": 2.4894397258758545, | |
| "learning_rate": 5.6149630622182526e-05, | |
| "loss": 0.3785, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 3.72972972972973, | |
| "eval_loss": 0.6239753365516663, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 3.77027027027027, | |
| "grad_norm": 2.1039986610412598, | |
| "learning_rate": 5.535081326839165e-05, | |
| "loss": 0.2834, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 3.77027027027027, | |
| "eval_loss": 0.6189073920249939, | |
| "eval_runtime": 1.0515, | |
| "eval_samples_per_second": 15.217, | |
| "eval_steps_per_second": 3.804, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 3.810810810810811, | |
| "grad_norm": 2.7096340656280518, | |
| "learning_rate": 5.455061184650921e-05, | |
| "loss": 0.3397, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 3.810810810810811, | |
| "eval_loss": 0.6138538122177124, | |
| "eval_runtime": 1.0521, | |
| "eval_samples_per_second": 15.208, | |
| "eval_steps_per_second": 3.802, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 3.8513513513513513, | |
| "grad_norm": 2.030907154083252, | |
| "learning_rate": 5.3749233340632674e-05, | |
| "loss": 0.2795, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 3.8513513513513513, | |
| "eval_loss": 0.6104437708854675, | |
| "eval_runtime": 1.0581, | |
| "eval_samples_per_second": 15.122, | |
| "eval_steps_per_second": 3.78, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 3.891891891891892, | |
| "grad_norm": 2.061206340789795, | |
| "learning_rate": 5.2946885039329866e-05, | |
| "loss": 0.3114, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 3.891891891891892, | |
| "eval_loss": 0.6077687740325928, | |
| "eval_runtime": 1.0527, | |
| "eval_samples_per_second": 15.199, | |
| "eval_steps_per_second": 3.8, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 3.9324324324324325, | |
| "grad_norm": 2.062087059020996, | |
| "learning_rate": 5.2143774482020744e-05, | |
| "loss": 0.2395, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 3.9324324324324325, | |
| "eval_loss": 0.6111433506011963, | |
| "eval_runtime": 1.0517, | |
| "eval_samples_per_second": 15.214, | |
| "eval_steps_per_second": 3.804, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 3.972972972972973, | |
| "grad_norm": 1.6344010829925537, | |
| "learning_rate": 5.134010940529429e-05, | |
| "loss": 0.1948, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 3.972972972972973, | |
| "eval_loss": 0.6142452955245972, | |
| "eval_runtime": 1.0529, | |
| "eval_samples_per_second": 15.196, | |
| "eval_steps_per_second": 3.799, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 4.013513513513513, | |
| "grad_norm": 1.9017384052276611, | |
| "learning_rate": 5.053609768917413e-05, | |
| "loss": 0.2284, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 4.013513513513513, | |
| "eval_loss": 0.6194114685058594, | |
| "eval_runtime": 1.0515, | |
| "eval_samples_per_second": 15.217, | |
| "eval_steps_per_second": 3.804, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 4.054054054054054, | |
| "grad_norm": 2.1609394550323486, | |
| "learning_rate": 4.973194730334748e-05, | |
| "loss": 0.2638, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.054054054054054, | |
| "eval_loss": 0.6303145885467529, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.194, | |
| "eval_steps_per_second": 3.798, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.094594594594595, | |
| "grad_norm": 1.5275555849075317, | |
| "learning_rate": 4.892786625337047e-05, | |
| "loss": 0.252, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 4.094594594594595, | |
| "eval_loss": 0.6517325639724731, | |
| "eval_runtime": 1.051, | |
| "eval_samples_per_second": 15.224, | |
| "eval_steps_per_second": 3.806, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 4.135135135135135, | |
| "grad_norm": 2.807483434677124, | |
| "learning_rate": 4.8124062526864534e-05, | |
| "loss": 0.183, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 4.135135135135135, | |
| "eval_loss": 0.6644703149795532, | |
| "eval_runtime": 1.0531, | |
| "eval_samples_per_second": 15.193, | |
| "eval_steps_per_second": 3.798, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 4.175675675675675, | |
| "grad_norm": 2.6279256343841553, | |
| "learning_rate": 4.7320744039717154e-05, | |
| "loss": 0.2415, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 4.175675675675675, | |
| "eval_loss": 0.6603893041610718, | |
| "eval_runtime": 1.0531, | |
| "eval_samples_per_second": 15.193, | |
| "eval_steps_per_second": 3.798, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 4.216216216216216, | |
| "grad_norm": 0.42106354236602783, | |
| "learning_rate": 4.651811858230149e-05, | |
| "loss": 0.1791, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 4.216216216216216, | |
| "eval_loss": 0.652984082698822, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 4.256756756756757, | |
| "grad_norm": 2.064615249633789, | |
| "learning_rate": 4.571639376572806e-05, | |
| "loss": 0.2013, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 4.256756756756757, | |
| "eval_loss": 0.6488903760910034, | |
| "eval_runtime": 1.0505, | |
| "eval_samples_per_second": 15.23, | |
| "eval_steps_per_second": 3.808, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 4.297297297297297, | |
| "grad_norm": 2.4248170852661133, | |
| "learning_rate": 4.491577696814318e-05, | |
| "loss": 0.1827, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 4.297297297297297, | |
| "eval_loss": 0.653176486492157, | |
| "eval_runtime": 1.0536, | |
| "eval_samples_per_second": 15.186, | |
| "eval_steps_per_second": 3.797, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 4.337837837837838, | |
| "grad_norm": 2.055769443511963, | |
| "learning_rate": 4.411647528108743e-05, | |
| "loss": 0.1792, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 4.337837837837838, | |
| "eval_loss": 0.6584765315055847, | |
| "eval_runtime": 1.052, | |
| "eval_samples_per_second": 15.209, | |
| "eval_steps_per_second": 3.802, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 4.378378378378378, | |
| "grad_norm": 3.4611449241638184, | |
| "learning_rate": 4.331869545592834e-05, | |
| "loss": 0.2568, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 4.378378378378378, | |
| "eval_loss": 0.6628451347351074, | |
| "eval_runtime": 1.055, | |
| "eval_samples_per_second": 15.166, | |
| "eval_steps_per_second": 3.791, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 4.418918918918919, | |
| "grad_norm": 1.6108025312423706, | |
| "learning_rate": 4.252264385038098e-05, | |
| "loss": 0.1682, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 4.418918918918919, | |
| "eval_loss": 0.66502845287323, | |
| "eval_runtime": 1.0508, | |
| "eval_samples_per_second": 15.227, | |
| "eval_steps_per_second": 3.807, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 4.45945945945946, | |
| "grad_norm": 1.828131914138794, | |
| "learning_rate": 4.1728526375130614e-05, | |
| "loss": 0.25, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 4.45945945945946, | |
| "eval_loss": 0.6729562282562256, | |
| "eval_runtime": 1.0534, | |
| "eval_samples_per_second": 15.189, | |
| "eval_steps_per_second": 3.797, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 2.5057499408721924, | |
| "learning_rate": 4.093654844057059e-05, | |
| "loss": 0.2664, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "eval_loss": 0.6741403937339783, | |
| "eval_runtime": 1.052, | |
| "eval_samples_per_second": 15.209, | |
| "eval_steps_per_second": 3.802, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 4.54054054054054, | |
| "grad_norm": 1.6008535623550415, | |
| "learning_rate": 4.014691490367e-05, | |
| "loss": 0.2316, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 4.54054054054054, | |
| "eval_loss": 0.6773088574409485, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.194, | |
| "eval_steps_per_second": 3.799, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 4.581081081081081, | |
| "grad_norm": 2.551591157913208, | |
| "learning_rate": 3.935983001498439e-05, | |
| "loss": 0.3467, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 4.581081081081081, | |
| "eval_loss": 0.6705477237701416, | |
| "eval_runtime": 1.0509, | |
| "eval_samples_per_second": 15.226, | |
| "eval_steps_per_second": 3.806, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 4.621621621621622, | |
| "grad_norm": 2.130202054977417, | |
| "learning_rate": 3.857549736582316e-05, | |
| "loss": 0.2426, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 4.621621621621622, | |
| "eval_loss": 0.6681296825408936, | |
| "eval_runtime": 1.0529, | |
| "eval_samples_per_second": 15.196, | |
| "eval_steps_per_second": 3.799, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 4.662162162162162, | |
| "grad_norm": 2.043670415878296, | |
| "learning_rate": 3.7794119835587685e-05, | |
| "loss": 0.2421, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 4.662162162162162, | |
| "eval_loss": 0.6622060537338257, | |
| "eval_runtime": 1.0519, | |
| "eval_samples_per_second": 15.21, | |
| "eval_steps_per_second": 3.803, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 4.702702702702703, | |
| "grad_norm": 1.9365885257720947, | |
| "learning_rate": 3.701589953929354e-05, | |
| "loss": 0.4063, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 4.702702702702703, | |
| "eval_loss": 0.6608781814575195, | |
| "eval_runtime": 1.0528, | |
| "eval_samples_per_second": 15.197, | |
| "eval_steps_per_second": 3.799, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 4.743243243243243, | |
| "grad_norm": 2.596634864807129, | |
| "learning_rate": 3.62410377752904e-05, | |
| "loss": 0.2255, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 4.743243243243243, | |
| "eval_loss": 0.6569182276725769, | |
| "eval_runtime": 1.0522, | |
| "eval_samples_per_second": 15.206, | |
| "eval_steps_per_second": 3.802, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 4.783783783783784, | |
| "grad_norm": 2.039332628250122, | |
| "learning_rate": 3.546973497319319e-05, | |
| "loss": 0.1933, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 4.783783783783784, | |
| "eval_loss": 0.6534222364425659, | |
| "eval_runtime": 1.0498, | |
| "eval_samples_per_second": 15.241, | |
| "eval_steps_per_second": 3.81, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 4.824324324324325, | |
| "grad_norm": 1.994629144668579, | |
| "learning_rate": 3.4702190642037944e-05, | |
| "loss": 0.1975, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 4.824324324324325, | |
| "eval_loss": 0.649687647819519, | |
| "eval_runtime": 1.0523, | |
| "eval_samples_per_second": 15.204, | |
| "eval_steps_per_second": 3.801, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 4.864864864864865, | |
| "grad_norm": 2.154684543609619, | |
| "learning_rate": 3.393860331867589e-05, | |
| "loss": 0.3065, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.864864864864865, | |
| "eval_loss": 0.6491411924362183, | |
| "eval_runtime": 1.0519, | |
| "eval_samples_per_second": 15.21, | |
| "eval_steps_per_second": 3.803, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.905405405405405, | |
| "grad_norm": 1.61858069896698, | |
| "learning_rate": 3.317917051641877e-05, | |
| "loss": 0.1641, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 4.905405405405405, | |
| "eval_loss": 0.651297926902771, | |
| "eval_runtime": 1.0521, | |
| "eval_samples_per_second": 15.208, | |
| "eval_steps_per_second": 3.802, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 4.945945945945946, | |
| "grad_norm": 2.7362637519836426, | |
| "learning_rate": 3.242408867394919e-05, | |
| "loss": 0.2032, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 4.945945945945946, | |
| "eval_loss": 0.6552869081497192, | |
| "eval_runtime": 1.0506, | |
| "eval_samples_per_second": 15.229, | |
| "eval_steps_per_second": 3.807, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 4.986486486486487, | |
| "grad_norm": 2.0567097663879395, | |
| "learning_rate": 3.167355310450877e-05, | |
| "loss": 0.1886, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 4.986486486486487, | |
| "eval_loss": 0.6590157747268677, | |
| "eval_runtime": 1.0528, | |
| "eval_samples_per_second": 15.197, | |
| "eval_steps_per_second": 3.799, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 5.027027027027027, | |
| "grad_norm": 1.5418853759765625, | |
| "learning_rate": 3.092775794537741e-05, | |
| "loss": 0.2539, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 5.027027027027027, | |
| "eval_loss": 0.6676727533340454, | |
| "eval_runtime": 1.0516, | |
| "eval_samples_per_second": 15.215, | |
| "eval_steps_per_second": 3.804, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 5.0675675675675675, | |
| "grad_norm": 1.229972004890442, | |
| "learning_rate": 3.0186896107656803e-05, | |
| "loss": 0.1464, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 5.0675675675675675, | |
| "eval_loss": 0.687861979007721, | |
| "eval_runtime": 1.0539, | |
| "eval_samples_per_second": 15.182, | |
| "eval_steps_per_second": 3.796, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 5.108108108108108, | |
| "grad_norm": 2.421496868133545, | |
| "learning_rate": 2.9451159226371095e-05, | |
| "loss": 0.2295, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 5.108108108108108, | |
| "eval_loss": 0.7066453695297241, | |
| "eval_runtime": 1.0503, | |
| "eval_samples_per_second": 15.233, | |
| "eval_steps_per_second": 3.808, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 5.148648648648648, | |
| "grad_norm": 2.3475804328918457, | |
| "learning_rate": 2.8720737610897575e-05, | |
| "loss": 0.1438, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 5.148648648648648, | |
| "eval_loss": 0.7166962623596191, | |
| "eval_runtime": 1.0534, | |
| "eval_samples_per_second": 15.189, | |
| "eval_steps_per_second": 3.797, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 5.1891891891891895, | |
| "grad_norm": 2.2746946811676025, | |
| "learning_rate": 2.799582019574033e-05, | |
| "loss": 0.1603, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 5.1891891891891895, | |
| "eval_loss": 0.7134541273117065, | |
| "eval_runtime": 1.0519, | |
| "eval_samples_per_second": 15.211, | |
| "eval_steps_per_second": 3.803, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 5.22972972972973, | |
| "grad_norm": 1.2550048828125, | |
| "learning_rate": 2.7276594491659525e-05, | |
| "loss": 0.1379, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 5.22972972972973, | |
| "eval_loss": 0.7095359563827515, | |
| "eval_runtime": 1.0543, | |
| "eval_samples_per_second": 15.176, | |
| "eval_steps_per_second": 3.794, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 5.27027027027027, | |
| "grad_norm": 1.7738205194473267, | |
| "learning_rate": 2.656324653716884e-05, | |
| "loss": 0.2783, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 5.27027027027027, | |
| "eval_loss": 0.7103461623191833, | |
| "eval_runtime": 1.0515, | |
| "eval_samples_per_second": 15.216, | |
| "eval_steps_per_second": 3.804, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 5.3108108108108105, | |
| "grad_norm": 2.2887580394744873, | |
| "learning_rate": 2.5855960850413935e-05, | |
| "loss": 0.1575, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 5.3108108108108105, | |
| "eval_loss": 0.7042403817176819, | |
| "eval_runtime": 1.0523, | |
| "eval_samples_per_second": 15.204, | |
| "eval_steps_per_second": 3.801, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 5.351351351351352, | |
| "grad_norm": 2.6281135082244873, | |
| "learning_rate": 2.5154920381444025e-05, | |
| "loss": 0.1743, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 5.351351351351352, | |
| "eval_loss": 0.7114053964614868, | |
| "eval_runtime": 1.0527, | |
| "eval_samples_per_second": 15.199, | |
| "eval_steps_per_second": 3.8, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 5.391891891891892, | |
| "grad_norm": 1.8125991821289062, | |
| "learning_rate": 2.4460306464889022e-05, | |
| "loss": 0.1168, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 5.391891891891892, | |
| "eval_loss": 0.7083012461662292, | |
| "eval_runtime": 1.0506, | |
| "eval_samples_per_second": 15.23, | |
| "eval_steps_per_second": 3.807, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 5.4324324324324325, | |
| "grad_norm": 2.5157058238983154, | |
| "learning_rate": 2.3772298773054757e-05, | |
| "loss": 0.284, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 5.4324324324324325, | |
| "eval_loss": 0.7072416543960571, | |
| "eval_runtime": 1.0524, | |
| "eval_samples_per_second": 15.204, | |
| "eval_steps_per_second": 3.801, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 5.472972972972973, | |
| "grad_norm": 0.8739199042320251, | |
| "learning_rate": 2.309107526944792e-05, | |
| "loss": 0.1013, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 5.472972972972973, | |
| "eval_loss": 0.7062889933586121, | |
| "eval_runtime": 1.051, | |
| "eval_samples_per_second": 15.223, | |
| "eval_steps_per_second": 3.806, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 5.513513513513513, | |
| "grad_norm": 2.2809295654296875, | |
| "learning_rate": 2.2416812162743223e-05, | |
| "loss": 0.2612, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 5.513513513513513, | |
| "eval_loss": 0.70506751537323, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 5.554054054054054, | |
| "grad_norm": 2.2030365467071533, | |
| "learning_rate": 2.17496838612043e-05, | |
| "loss": 0.1343, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 5.554054054054054, | |
| "eval_loss": 0.7102519273757935, | |
| "eval_runtime": 1.0534, | |
| "eval_samples_per_second": 15.188, | |
| "eval_steps_per_second": 3.797, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 5.594594594594595, | |
| "grad_norm": 1.4592159986495972, | |
| "learning_rate": 2.1089862927570475e-05, | |
| "loss": 0.1009, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 5.594594594594595, | |
| "eval_loss": 0.7105306386947632, | |
| "eval_runtime": 1.0533, | |
| "eval_samples_per_second": 15.19, | |
| "eval_steps_per_second": 3.797, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 5.635135135135135, | |
| "grad_norm": 2.2018954753875732, | |
| "learning_rate": 2.0437520034420776e-05, | |
| "loss": 0.3127, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 5.635135135135135, | |
| "eval_loss": 0.7089606523513794, | |
| "eval_runtime": 1.0533, | |
| "eval_samples_per_second": 15.191, | |
| "eval_steps_per_second": 3.798, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 5.675675675675675, | |
| "grad_norm": 1.8359624147415161, | |
| "learning_rate": 1.979282392002691e-05, | |
| "loss": 0.1355, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 5.675675675675675, | |
| "eval_loss": 0.7059516906738281, | |
| "eval_runtime": 1.0526, | |
| "eval_samples_per_second": 15.201, | |
| "eval_steps_per_second": 3.8, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 5.716216216216216, | |
| "grad_norm": 2.3145079612731934, | |
| "learning_rate": 1.9155941344706546e-05, | |
| "loss": 0.1345, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 5.716216216216216, | |
| "eval_loss": 0.705683171749115, | |
| "eval_runtime": 1.0519, | |
| "eval_samples_per_second": 15.21, | |
| "eval_steps_per_second": 3.802, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 5.756756756756757, | |
| "grad_norm": 1.7434961795806885, | |
| "learning_rate": 1.852703704768842e-05, | |
| "loss": 0.1865, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 5.756756756756757, | |
| "eval_loss": 0.7038547396659851, | |
| "eval_runtime": 1.0535, | |
| "eval_samples_per_second": 15.188, | |
| "eval_steps_per_second": 3.797, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 5.797297297297297, | |
| "grad_norm": 1.5850327014923096, | |
| "learning_rate": 1.7906273704499845e-05, | |
| "loss": 0.119, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 5.797297297297297, | |
| "eval_loss": 0.7066537737846375, | |
| "eval_runtime": 1.0521, | |
| "eval_samples_per_second": 15.208, | |
| "eval_steps_per_second": 3.802, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 5.837837837837838, | |
| "grad_norm": 1.599552035331726, | |
| "learning_rate": 1.7293811884888344e-05, | |
| "loss": 0.149, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 5.837837837837838, | |
| "eval_loss": 0.7120293974876404, | |
| "eval_runtime": 1.0536, | |
| "eval_samples_per_second": 15.185, | |
| "eval_steps_per_second": 3.796, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 5.878378378378378, | |
| "grad_norm": 1.8353303670883179, | |
| "learning_rate": 1.6689810011287932e-05, | |
| "loss": 0.1748, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 5.878378378378378, | |
| "eval_loss": 0.7123138308525085, | |
| "eval_runtime": 1.0524, | |
| "eval_samples_per_second": 15.203, | |
| "eval_steps_per_second": 3.801, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 5.918918918918919, | |
| "grad_norm": 1.4937026500701904, | |
| "learning_rate": 1.6094424317840723e-05, | |
| "loss": 0.1781, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 5.918918918918919, | |
| "eval_loss": 0.7113088965415955, | |
| "eval_runtime": 1.0528, | |
| "eval_samples_per_second": 15.198, | |
| "eval_steps_per_second": 3.799, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 5.95945945945946, | |
| "grad_norm": 2.0092716217041016, | |
| "learning_rate": 1.550780880998456e-05, | |
| "loss": 0.2075, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 5.95945945945946, | |
| "eval_loss": 0.7117879390716553, | |
| "eval_runtime": 1.0532, | |
| "eval_samples_per_second": 15.192, | |
| "eval_steps_per_second": 3.798, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 2.762338161468506, | |
| "learning_rate": 1.4930115224617353e-05, | |
| "loss": 0.1591, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.7111848592758179, | |
| "eval_runtime": 1.0522, | |
| "eval_samples_per_second": 15.206, | |
| "eval_steps_per_second": 3.801, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 6.04054054054054, | |
| "grad_norm": 1.825244665145874, | |
| "learning_rate": 1.436149299084789e-05, | |
| "loss": 0.1224, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 6.04054054054054, | |
| "eval_loss": 0.7117843627929688, | |
| "eval_runtime": 1.0529, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 6.081081081081081, | |
| "grad_norm": 0.9274085760116577, | |
| "learning_rate": 1.380208919134392e-05, | |
| "loss": 0.2234, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 6.081081081081081, | |
| "eval_loss": 0.7170644402503967, | |
| "eval_runtime": 1.0513, | |
| "eval_samples_per_second": 15.219, | |
| "eval_steps_per_second": 3.805, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 6.121621621621622, | |
| "grad_norm": 1.5220532417297363, | |
| "learning_rate": 1.3252048524286842e-05, | |
| "loss": 0.1165, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 6.121621621621622, | |
| "eval_loss": 0.7227377891540527, | |
| "eval_runtime": 1.0532, | |
| "eval_samples_per_second": 15.191, | |
| "eval_steps_per_second": 3.798, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 6.162162162162162, | |
| "grad_norm": 1.669662594795227, | |
| "learning_rate": 1.271151326594352e-05, | |
| "loss": 0.2518, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 6.162162162162162, | |
| "eval_loss": 0.7325636148452759, | |
| "eval_runtime": 1.0523, | |
| "eval_samples_per_second": 15.205, | |
| "eval_steps_per_second": 3.801, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 6.202702702702703, | |
| "grad_norm": 1.6538748741149902, | |
| "learning_rate": 1.2180623233864253e-05, | |
| "loss": 0.1288, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 6.202702702702703, | |
| "eval_loss": 0.7430564165115356, | |
| "eval_runtime": 1.0597, | |
| "eval_samples_per_second": 15.099, | |
| "eval_steps_per_second": 3.775, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 6.243243243243243, | |
| "grad_norm": 1.5836577415466309, | |
| "learning_rate": 1.1659515750716955e-05, | |
| "loss": 0.1176, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 6.243243243243243, | |
| "eval_loss": 0.7481391429901123, | |
| "eval_runtime": 1.0512, | |
| "eval_samples_per_second": 15.221, | |
| "eval_steps_per_second": 3.805, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 6.283783783783784, | |
| "grad_norm": 1.0982418060302734, | |
| "learning_rate": 1.1148325608766585e-05, | |
| "loss": 0.1231, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 6.283783783783784, | |
| "eval_loss": 0.7511347532272339, | |
| "eval_runtime": 1.0552, | |
| "eval_samples_per_second": 15.163, | |
| "eval_steps_per_second": 3.791, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 6.324324324324325, | |
| "grad_norm": 1.9232176542282104, | |
| "learning_rate": 1.0647185035009038e-05, | |
| "loss": 0.146, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 6.324324324324325, | |
| "eval_loss": 0.7529792785644531, | |
| "eval_runtime": 1.0535, | |
| "eval_samples_per_second": 15.188, | |
| "eval_steps_per_second": 3.797, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 6.364864864864865, | |
| "grad_norm": 2.5786333084106445, | |
| "learning_rate": 1.0156223656968694e-05, | |
| "loss": 0.1169, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 6.364864864864865, | |
| "eval_loss": 0.7518468499183655, | |
| "eval_runtime": 1.0523, | |
| "eval_samples_per_second": 15.205, | |
| "eval_steps_per_second": 3.801, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 6.405405405405405, | |
| "grad_norm": 1.4718759059906006, | |
| "learning_rate": 9.675568469168388e-06, | |
| "loss": 0.1048, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 6.405405405405405, | |
| "eval_loss": 0.7540909051895142, | |
| "eval_runtime": 1.049, | |
| "eval_samples_per_second": 15.253, | |
| "eval_steps_per_second": 3.813, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 6.445945945945946, | |
| "grad_norm": 1.3492368459701538, | |
| "learning_rate": 9.205343800280219e-06, | |
| "loss": 0.1092, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 6.445945945945946, | |
| "eval_loss": 0.750686764717102, | |
| "eval_runtime": 1.0533, | |
| "eval_samples_per_second": 15.19, | |
| "eval_steps_per_second": 3.798, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 6.486486486486487, | |
| "grad_norm": 2.10587739944458, | |
| "learning_rate": 8.745671280966177e-06, | |
| "loss": 0.1458, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 6.486486486486487, | |
| "eval_loss": 0.7518497705459595, | |
| "eval_runtime": 1.0499, | |
| "eval_samples_per_second": 15.239, | |
| "eval_steps_per_second": 3.81, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 6.527027027027027, | |
| "grad_norm": 0.8871177434921265, | |
| "learning_rate": 8.296669812416547e-06, | |
| "loss": 0.2177, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 6.527027027027027, | |
| "eval_loss": 0.7509324550628662, | |
| "eval_runtime": 1.0528, | |
| "eval_samples_per_second": 15.198, | |
| "eval_steps_per_second": 3.8, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 6.5675675675675675, | |
| "grad_norm": 1.299116611480713, | |
| "learning_rate": 7.858455535594306e-06, | |
| "loss": 0.1585, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 6.5675675675675675, | |
| "eval_loss": 0.7509753108024597, | |
| "eval_runtime": 1.0507, | |
| "eval_samples_per_second": 15.228, | |
| "eval_steps_per_second": 3.807, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 6.608108108108108, | |
| "grad_norm": 1.8996071815490723, | |
| "learning_rate": 7.431141801193508e-06, | |
| "loss": 0.1337, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 6.608108108108108, | |
| "eval_loss": 0.7546273469924927, | |
| "eval_runtime": 1.0538, | |
| "eval_samples_per_second": 15.183, | |
| "eval_steps_per_second": 3.796, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 6.648648648648649, | |
| "grad_norm": 2.193199634552002, | |
| "learning_rate": 7.014839140319485e-06, | |
| "loss": 0.122, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 6.648648648648649, | |
| "eval_loss": 0.7523775100708008, | |
| "eval_runtime": 1.0517, | |
| "eval_samples_per_second": 15.213, | |
| "eval_steps_per_second": 3.803, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 6.6891891891891895, | |
| "grad_norm": 1.310517430305481, | |
| "learning_rate": 6.609655235898227e-06, | |
| "loss": 0.0793, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 6.6891891891891895, | |
| "eval_loss": 0.7553800344467163, | |
| "eval_runtime": 1.0524, | |
| "eval_samples_per_second": 15.203, | |
| "eval_steps_per_second": 3.801, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 6.72972972972973, | |
| "grad_norm": 1.7615861892700195, | |
| "learning_rate": 6.215694894822699e-06, | |
| "loss": 0.1544, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 6.72972972972973, | |
| "eval_loss": 0.7521288394927979, | |
| "eval_runtime": 1.0505, | |
| "eval_samples_per_second": 15.231, | |
| "eval_steps_per_second": 3.808, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 6.77027027027027, | |
| "grad_norm": 1.4952490329742432, | |
| "learning_rate": 5.83306002084284e-06, | |
| "loss": 0.1387, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 6.77027027027027, | |
| "eval_loss": 0.7528640627861023, | |
| "eval_runtime": 1.052, | |
| "eval_samples_per_second": 15.209, | |
| "eval_steps_per_second": 3.802, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 6.8108108108108105, | |
| "grad_norm": 1.7409045696258545, | |
| "learning_rate": 5.461849588206724e-06, | |
| "loss": 0.1253, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 6.8108108108108105, | |
| "eval_loss": 0.7528926134109497, | |
| "eval_runtime": 1.059, | |
| "eval_samples_per_second": 15.108, | |
| "eval_steps_per_second": 3.777, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 6.851351351351351, | |
| "grad_norm": 0.7362686395645142, | |
| "learning_rate": 5.102159616059365e-06, | |
| "loss": 0.1296, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 6.851351351351351, | |
| "eval_loss": 0.7542049884796143, | |
| "eval_runtime": 1.0521, | |
| "eval_samples_per_second": 15.207, | |
| "eval_steps_per_second": 3.802, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 6.891891891891892, | |
| "grad_norm": 0.806505560874939, | |
| "learning_rate": 4.754083143605869e-06, | |
| "loss": 0.1094, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 6.891891891891892, | |
| "eval_loss": 0.7515612840652466, | |
| "eval_runtime": 1.0559, | |
| "eval_samples_per_second": 15.152, | |
| "eval_steps_per_second": 3.788, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 6.9324324324324325, | |
| "grad_norm": 1.5709373950958252, | |
| "learning_rate": 4.417710206045533e-06, | |
| "loss": 0.1009, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 6.9324324324324325, | |
| "eval_loss": 0.751240611076355, | |
| "eval_runtime": 1.0523, | |
| "eval_samples_per_second": 15.205, | |
| "eval_steps_per_second": 3.801, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 6.972972972972973, | |
| "grad_norm": 1.2641761302947998, | |
| "learning_rate": 4.093127811282821e-06, | |
| "loss": 0.1871, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 6.972972972972973, | |
| "eval_loss": 0.7525576949119568, | |
| "eval_runtime": 1.0539, | |
| "eval_samples_per_second": 15.181, | |
| "eval_steps_per_second": 3.795, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 7.013513513513513, | |
| "grad_norm": 0.9734938144683838, | |
| "learning_rate": 3.7804199174215183e-06, | |
| "loss": 0.1017, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 7.013513513513513, | |
| "eval_loss": 0.7537960410118103, | |
| "eval_runtime": 1.0511, | |
| "eval_samples_per_second": 15.222, | |
| "eval_steps_per_second": 3.805, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 7.054054054054054, | |
| "grad_norm": 1.4745818376541138, | |
| "learning_rate": 3.479667411047677e-06, | |
| "loss": 0.1536, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 7.054054054054054, | |
| "eval_loss": 0.7529079914093018, | |
| "eval_runtime": 1.0543, | |
| "eval_samples_per_second": 15.176, | |
| "eval_steps_per_second": 3.794, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 7.094594594594595, | |
| "grad_norm": 1.0725492238998413, | |
| "learning_rate": 3.1909480863070884e-06, | |
| "loss": 0.0886, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 7.094594594594595, | |
| "eval_loss": 0.7565038204193115, | |
| "eval_runtime": 1.0511, | |
| "eval_samples_per_second": 15.222, | |
| "eval_steps_per_second": 3.806, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 7.135135135135135, | |
| "grad_norm": 1.1345540285110474, | |
| "learning_rate": 2.9143366247826598e-06, | |
| "loss": 0.0983, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 7.135135135135135, | |
| "eval_loss": 0.7576066255569458, | |
| "eval_runtime": 1.0528, | |
| "eval_samples_per_second": 15.198, | |
| "eval_steps_per_second": 3.799, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 7.175675675675675, | |
| "grad_norm": 1.122189998626709, | |
| "learning_rate": 2.6499045761769315e-06, | |
| "loss": 0.084, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 7.175675675675675, | |
| "eval_loss": 0.758578896522522, | |
| "eval_runtime": 1.0508, | |
| "eval_samples_per_second": 15.227, | |
| "eval_steps_per_second": 3.807, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 7.216216216216216, | |
| "grad_norm": 1.6193064451217651, | |
| "learning_rate": 2.397720339804649e-06, | |
| "loss": 0.099, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 7.216216216216216, | |
| "eval_loss": 0.7563527822494507, | |
| "eval_runtime": 1.0563, | |
| "eval_samples_per_second": 15.147, | |
| "eval_steps_per_second": 3.787, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 7.256756756756757, | |
| "grad_norm": 1.373356580734253, | |
| "learning_rate": 2.1578491469002373e-06, | |
| "loss": 0.1089, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 7.256756756756757, | |
| "eval_loss": 0.7592064142227173, | |
| "eval_runtime": 1.0528, | |
| "eval_samples_per_second": 15.197, | |
| "eval_steps_per_second": 3.799, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 7.297297297297297, | |
| "grad_norm": 1.1875869035720825, | |
| "learning_rate": 1.9303530437448035e-06, | |
| "loss": 0.1145, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 7.297297297297297, | |
| "eval_loss": 0.7611518502235413, | |
| "eval_runtime": 1.0529, | |
| "eval_samples_per_second": 15.196, | |
| "eval_steps_per_second": 3.799, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 7.337837837837838, | |
| "grad_norm": 1.8787821531295776, | |
| "learning_rate": 1.7152908756169262e-06, | |
| "loss": 0.1823, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 7.337837837837838, | |
| "eval_loss": 0.7614726424217224, | |
| "eval_runtime": 1.0548, | |
| "eval_samples_per_second": 15.168, | |
| "eval_steps_per_second": 3.792, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 7.378378378378378, | |
| "grad_norm": 1.9469506740570068, | |
| "learning_rate": 1.5127182715714006e-06, | |
| "loss": 0.2784, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 7.378378378378378, | |
| "eval_loss": 0.7602246999740601, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.194, | |
| "eval_steps_per_second": 3.799, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 7.418918918918919, | |
| "grad_norm": 1.6328327655792236, | |
| "learning_rate": 1.3226876300500123e-06, | |
| "loss": 0.0887, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 7.418918918918919, | |
| "eval_loss": 0.7616763114929199, | |
| "eval_runtime": 1.0504, | |
| "eval_samples_per_second": 15.232, | |
| "eval_steps_per_second": 3.808, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 7.45945945945946, | |
| "grad_norm": 1.5713064670562744, | |
| "learning_rate": 1.1452481053278396e-06, | |
| "loss": 0.1133, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 7.45945945945946, | |
| "eval_loss": 0.7640103101730347, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 1.5901539325714111, | |
| "learning_rate": 9.804455947988067e-07, | |
| "loss": 0.1207, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "eval_loss": 0.7629836797714233, | |
| "eval_runtime": 1.0516, | |
| "eval_samples_per_second": 15.216, | |
| "eval_steps_per_second": 3.804, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 7.54054054054054, | |
| "grad_norm": 1.5648808479309082, | |
| "learning_rate": 8.283227271035976e-07, | |
| "loss": 0.0954, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 7.54054054054054, | |
| "eval_loss": 0.7643275260925293, | |
| "eval_runtime": 1.0548, | |
| "eval_samples_per_second": 15.169, | |
| "eval_steps_per_second": 3.792, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 7.581081081081081, | |
| "grad_norm": 1.6403340101242065, | |
| "learning_rate": 6.889188511031542e-07, | |
| "loss": 0.1135, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 7.581081081081081, | |
| "eval_loss": 0.7628697156906128, | |
| "eval_runtime": 1.0531, | |
| "eval_samples_per_second": 15.194, | |
| "eval_steps_per_second": 3.798, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 7.621621621621622, | |
| "grad_norm": 1.393983244895935, | |
| "learning_rate": 5.622700257004676e-07, | |
| "loss": 0.096, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 7.621621621621622, | |
| "eval_loss": 0.7637063264846802, | |
| "eval_runtime": 1.0544, | |
| "eval_samples_per_second": 15.174, | |
| "eval_steps_per_second": 3.793, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 7.662162162162162, | |
| "grad_norm": 1.2016361951828003, | |
| "learning_rate": 4.484090105134231e-07, | |
| "loss": 0.1088, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 7.662162162162162, | |
| "eval_loss": 0.7655338048934937, | |
| "eval_runtime": 1.0534, | |
| "eval_samples_per_second": 15.189, | |
| "eval_steps_per_second": 3.797, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 7.702702702702703, | |
| "grad_norm": 1.1388864517211914, | |
| "learning_rate": 3.4736525740104444e-07, | |
| "loss": 0.1628, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 7.702702702702703, | |
| "eval_loss": 0.7655097842216492, | |
| "eval_runtime": 1.053, | |
| "eval_samples_per_second": 15.195, | |
| "eval_steps_per_second": 3.799, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 7.743243243243243, | |
| "grad_norm": 1.9650497436523438, | |
| "learning_rate": 2.591649028453047e-07, | |
| "loss": 0.1431, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 7.743243243243243, | |
| "eval_loss": 0.7649960517883301, | |
| "eval_runtime": 1.0519, | |
| "eval_samples_per_second": 15.211, | |
| "eval_steps_per_second": 3.803, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 7.783783783783784, | |
| "grad_norm": 1.7549225091934204, | |
| "learning_rate": 1.8383076119053432e-07, | |
| "loss": 0.1034, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 7.783783783783784, | |
| "eval_loss": 0.763870358467102, | |
| "eval_runtime": 1.0529, | |
| "eval_samples_per_second": 15.196, | |
| "eval_steps_per_second": 3.799, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 7.824324324324325, | |
| "grad_norm": 1.7549595832824707, | |
| "learning_rate": 1.2138231874217475e-07, | |
| "loss": 0.181, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 7.824324324324325, | |
| "eval_loss": 0.7637079358100891, | |
| "eval_runtime": 1.0546, | |
| "eval_samples_per_second": 15.172, | |
| "eval_steps_per_second": 3.793, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 7.864864864864865, | |
| "grad_norm": 1.3891515731811523, | |
| "learning_rate": 7.183572872632715e-08, | |
| "loss": 0.062, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 7.864864864864865, | |
| "eval_loss": 0.7649126052856445, | |
| "eval_runtime": 1.0509, | |
| "eval_samples_per_second": 15.225, | |
| "eval_steps_per_second": 3.806, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 7.905405405405405, | |
| "grad_norm": 1.0669249296188354, | |
| "learning_rate": 3.5203807111489074e-08, | |
| "loss": 0.0769, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 7.905405405405405, | |
| "eval_loss": 0.7653980255126953, | |
| "eval_runtime": 1.0536, | |
| "eval_samples_per_second": 15.185, | |
| "eval_steps_per_second": 3.796, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 7.945945945945946, | |
| "grad_norm": 2.3302104473114014, | |
| "learning_rate": 1.1496029293511789e-08, | |
| "loss": 0.1951, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 7.945945945945946, | |
| "eval_loss": 0.7646524906158447, | |
| "eval_runtime": 1.0566, | |
| "eval_samples_per_second": 15.143, | |
| "eval_steps_per_second": 3.786, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 7.986486486486487, | |
| "grad_norm": 1.9744952917099, | |
| "learning_rate": 7.185276446441958e-10, | |
| "loss": 0.1175, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 7.986486486486487, | |
| "eval_loss": 0.765015721321106, | |
| "eval_runtime": 1.0522, | |
| "eval_samples_per_second": 15.206, | |
| "eval_steps_per_second": 3.801, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "step": 592, | |
| "total_flos": 1188976147968000.0, | |
| "train_loss": 0.4161318518926163, | |
| "train_runtime": 741.424, | |
| "train_samples_per_second": 3.194, | |
| "train_steps_per_second": 0.798 | |
| } | |
| ], | |
| "logging_steps": 3, | |
| "max_steps": 592, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1188976147968000.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |