| { | |
| "best_metric": 0.16085075220051892, | |
| "best_model_checkpoint": "checkpoints/checkpoint-6750", | |
| "epoch": 4.6360686138154845, | |
| "eval_steps": 50, | |
| "global_step": 10000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.023180343069077423, | |
| "grad_norm": 0.08667636662721634, | |
| "learning_rate": 5e-05, | |
| "loss": 0.6255, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.023180343069077423, | |
| "eval_loss": 0.2028235954199523, | |
| "eval_runtime": 58.9481, | |
| "eval_samples_per_second": 705.468, | |
| "eval_steps_per_second": 0.356, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.04636068613815485, | |
| "grad_norm": 0.0631365031003952, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1532, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04636068613815485, | |
| "eval_loss": 0.1865187252730081, | |
| "eval_runtime": 59.7995, | |
| "eval_samples_per_second": 695.424, | |
| "eval_steps_per_second": 0.351, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.06954102920723226, | |
| "grad_norm": 0.11565029621124268, | |
| "learning_rate": 9.999370638369377e-05, | |
| "loss": 0.1448, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.06954102920723226, | |
| "eval_loss": 0.1811586473052961, | |
| "eval_runtime": 59.448, | |
| "eval_samples_per_second": 699.536, | |
| "eval_steps_per_second": 0.353, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0927213722763097, | |
| "grad_norm": 0.12287624180316925, | |
| "learning_rate": 9.997482711915927e-05, | |
| "loss": 0.1431, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0927213722763097, | |
| "eval_loss": 0.17856710216099736, | |
| "eval_runtime": 59.6811, | |
| "eval_samples_per_second": 696.803, | |
| "eval_steps_per_second": 0.352, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11590171534538711, | |
| "grad_norm": 0.1239687129855156, | |
| "learning_rate": 9.99433669591504e-05, | |
| "loss": 0.1423, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.11590171534538711, | |
| "eval_loss": 0.17694150013393828, | |
| "eval_runtime": 59.3601, | |
| "eval_samples_per_second": 700.572, | |
| "eval_steps_per_second": 0.354, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.13908205841446453, | |
| "grad_norm": 0.08375083655118942, | |
| "learning_rate": 9.989933382359422e-05, | |
| "loss": 0.1413, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.13908205841446453, | |
| "eval_loss": 0.17530383900746715, | |
| "eval_runtime": 59.6335, | |
| "eval_samples_per_second": 697.359, | |
| "eval_steps_per_second": 0.352, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.16226240148354196, | |
| "grad_norm": 0.07470008730888367, | |
| "learning_rate": 9.984273879759713e-05, | |
| "loss": 0.1391, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.16226240148354196, | |
| "eval_loss": 0.17539924518994512, | |
| "eval_runtime": 59.2699, | |
| "eval_samples_per_second": 701.638, | |
| "eval_steps_per_second": 0.354, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1854427445526194, | |
| "grad_norm": 0.11057748645544052, | |
| "learning_rate": 9.977359612865423e-05, | |
| "loss": 0.1398, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1854427445526194, | |
| "eval_loss": 0.17613958094562268, | |
| "eval_runtime": 59.3258, | |
| "eval_samples_per_second": 700.977, | |
| "eval_steps_per_second": 0.354, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2086230876216968, | |
| "grad_norm": 0.10229019820690155, | |
| "learning_rate": 9.969192322306271e-05, | |
| "loss": 0.1398, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2086230876216968, | |
| "eval_loss": 0.17319489228196833, | |
| "eval_runtime": 59.4596, | |
| "eval_samples_per_second": 699.399, | |
| "eval_steps_per_second": 0.353, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.23180343069077422, | |
| "grad_norm": 0.10784970223903656, | |
| "learning_rate": 9.959774064153977e-05, | |
| "loss": 0.1384, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.23180343069077422, | |
| "eval_loss": 0.17334065558523068, | |
| "eval_runtime": 59.5777, | |
| "eval_samples_per_second": 698.012, | |
| "eval_steps_per_second": 0.352, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.25498377375985165, | |
| "grad_norm": 0.08434706926345825, | |
| "learning_rate": 9.949107209404665e-05, | |
| "loss": 0.1386, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.25498377375985165, | |
| "eval_loss": 0.17196178719739552, | |
| "eval_runtime": 59.6138, | |
| "eval_samples_per_second": 697.59, | |
| "eval_steps_per_second": 0.352, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.27816411682892905, | |
| "grad_norm": 0.08924778550863266, | |
| "learning_rate": 9.937194443381972e-05, | |
| "loss": 0.1377, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.27816411682892905, | |
| "eval_loss": 0.1740634102700707, | |
| "eval_runtime": 59.422, | |
| "eval_samples_per_second": 699.842, | |
| "eval_steps_per_second": 0.353, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3013444598980065, | |
| "grad_norm": 0.15332703292369843, | |
| "learning_rate": 9.924038765061042e-05, | |
| "loss": 0.1372, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3013444598980065, | |
| "eval_loss": 0.17391025863974857, | |
| "eval_runtime": 59.5007, | |
| "eval_samples_per_second": 698.917, | |
| "eval_steps_per_second": 0.353, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3245248029670839, | |
| "grad_norm": 0.08973913639783859, | |
| "learning_rate": 9.909643486313533e-05, | |
| "loss": 0.1374, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.3245248029670839, | |
| "eval_loss": 0.17245501519579134, | |
| "eval_runtime": 59.2504, | |
| "eval_samples_per_second": 701.868, | |
| "eval_steps_per_second": 0.354, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.3477051460361613, | |
| "grad_norm": 0.07252663373947144, | |
| "learning_rate": 9.894012231073894e-05, | |
| "loss": 0.1378, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.3477051460361613, | |
| "eval_loss": 0.1731146153162719, | |
| "eval_runtime": 59.6278, | |
| "eval_samples_per_second": 697.426, | |
| "eval_steps_per_second": 0.352, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.3708854891052388, | |
| "grad_norm": 0.09351957589387894, | |
| "learning_rate": 9.877148934427037e-05, | |
| "loss": 0.1371, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.3708854891052388, | |
| "eval_loss": 0.17056697699015605, | |
| "eval_runtime": 59.4338, | |
| "eval_samples_per_second": 699.703, | |
| "eval_steps_per_second": 0.353, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.3940658321743162, | |
| "grad_norm": 0.06937623023986816, | |
| "learning_rate": 9.859057841617709e-05, | |
| "loss": 0.1364, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.3940658321743162, | |
| "eval_loss": 0.1730773180756858, | |
| "eval_runtime": 59.2237, | |
| "eval_samples_per_second": 702.185, | |
| "eval_steps_per_second": 0.355, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4172461752433936, | |
| "grad_norm": 0.1241346001625061, | |
| "learning_rate": 9.839743506981782e-05, | |
| "loss": 0.1382, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.4172461752433936, | |
| "eval_loss": 0.17300324635270986, | |
| "eval_runtime": 59.1648, | |
| "eval_samples_per_second": 702.884, | |
| "eval_steps_per_second": 0.355, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.44042651831247104, | |
| "grad_norm": 0.0649554654955864, | |
| "learning_rate": 9.819210792799712e-05, | |
| "loss": 0.1369, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.44042651831247104, | |
| "eval_loss": 0.17298936761230632, | |
| "eval_runtime": 59.4593, | |
| "eval_samples_per_second": 699.402, | |
| "eval_steps_per_second": 0.353, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.46360686138154844, | |
| "grad_norm": 0.07767663151025772, | |
| "learning_rate": 9.797464868072488e-05, | |
| "loss": 0.1373, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.46360686138154844, | |
| "eval_loss": 0.1722117168758624, | |
| "eval_runtime": 59.4433, | |
| "eval_samples_per_second": 699.592, | |
| "eval_steps_per_second": 0.353, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.48678720445062584, | |
| "grad_norm": 0.09637939929962158, | |
| "learning_rate": 9.77451120722037e-05, | |
| "loss": 0.1357, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.48678720445062584, | |
| "eval_loss": 0.17295359261954948, | |
| "eval_runtime": 59.0076, | |
| "eval_samples_per_second": 704.757, | |
| "eval_steps_per_second": 0.356, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5099675475197033, | |
| "grad_norm": 0.0731373056769371, | |
| "learning_rate": 9.750355588704727e-05, | |
| "loss": 0.135, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.5099675475197033, | |
| "eval_loss": 0.1715334055521701, | |
| "eval_runtime": 59.0167, | |
| "eval_samples_per_second": 704.648, | |
| "eval_steps_per_second": 0.356, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.5331478905887808, | |
| "grad_norm": 0.1365990936756134, | |
| "learning_rate": 9.725004093573342e-05, | |
| "loss": 0.1357, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.5331478905887808, | |
| "eval_loss": 0.17017831764477356, | |
| "eval_runtime": 59.0779, | |
| "eval_samples_per_second": 703.918, | |
| "eval_steps_per_second": 0.355, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.5563282336578581, | |
| "grad_norm": 0.07747852057218552, | |
| "learning_rate": 9.698463103929542e-05, | |
| "loss": 0.1366, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5563282336578581, | |
| "eval_loss": 0.17079754339969364, | |
| "eval_runtime": 59.1474, | |
| "eval_samples_per_second": 703.091, | |
| "eval_steps_per_second": 0.355, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5795085767269356, | |
| "grad_norm": 0.08369060605764389, | |
| "learning_rate": 9.670739301325534e-05, | |
| "loss": 0.1352, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.5795085767269356, | |
| "eval_loss": 0.17218272966053694, | |
| "eval_runtime": 59.4772, | |
| "eval_samples_per_second": 699.192, | |
| "eval_steps_per_second": 0.353, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.602688919796013, | |
| "grad_norm": 0.15560708940029144, | |
| "learning_rate": 9.641839665080363e-05, | |
| "loss": 0.1366, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.602688919796013, | |
| "eval_loss": 0.1698094484306934, | |
| "eval_runtime": 59.4226, | |
| "eval_samples_per_second": 699.835, | |
| "eval_steps_per_second": 0.353, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.6258692628650904, | |
| "grad_norm": 0.1404338777065277, | |
| "learning_rate": 9.611771470522908e-05, | |
| "loss": 0.1353, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.6258692628650904, | |
| "eval_loss": 0.17023876656477224, | |
| "eval_runtime": 59.3422, | |
| "eval_samples_per_second": 700.783, | |
| "eval_steps_per_second": 0.354, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.6490496059341678, | |
| "grad_norm": 0.07887144386768341, | |
| "learning_rate": 9.580542287160348e-05, | |
| "loss": 0.1363, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6490496059341678, | |
| "eval_loss": 0.1706377184753332, | |
| "eval_runtime": 59.2598, | |
| "eval_samples_per_second": 701.758, | |
| "eval_steps_per_second": 0.354, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6722299490032453, | |
| "grad_norm": 0.09286168217658997, | |
| "learning_rate": 9.548159976772592e-05, | |
| "loss": 0.1362, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.6722299490032453, | |
| "eval_loss": 0.16891294843072946, | |
| "eval_runtime": 59.4024, | |
| "eval_samples_per_second": 700.073, | |
| "eval_steps_per_second": 0.354, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.6954102920723226, | |
| "grad_norm": 0.08167006820440292, | |
| "learning_rate": 9.514632691433107e-05, | |
| "loss": 0.1345, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6954102920723226, | |
| "eval_loss": 0.16790113662592512, | |
| "eval_runtime": 60.0378, | |
| "eval_samples_per_second": 692.664, | |
| "eval_steps_per_second": 0.35, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.7185906351414001, | |
| "grad_norm": 0.09860191494226456, | |
| "learning_rate": 9.479968871456679e-05, | |
| "loss": 0.1355, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.7185906351414001, | |
| "eval_loss": 0.16903206921067584, | |
| "eval_runtime": 59.5789, | |
| "eval_samples_per_second": 697.999, | |
| "eval_steps_per_second": 0.352, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.7417709782104775, | |
| "grad_norm": 0.06466613709926605, | |
| "learning_rate": 9.444177243274618e-05, | |
| "loss": 0.135, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7417709782104775, | |
| "eval_loss": 0.1680566343999807, | |
| "eval_runtime": 59.5911, | |
| "eval_samples_per_second": 697.856, | |
| "eval_steps_per_second": 0.352, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7649513212795549, | |
| "grad_norm": 0.07864313572645187, | |
| "learning_rate": 9.407266817237911e-05, | |
| "loss": 0.1348, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.7649513212795549, | |
| "eval_loss": 0.16721375296765553, | |
| "eval_runtime": 59.4289, | |
| "eval_samples_per_second": 699.76, | |
| "eval_steps_per_second": 0.353, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.7881316643486324, | |
| "grad_norm": 0.09288563579320908, | |
| "learning_rate": 9.369246885348926e-05, | |
| "loss": 0.1343, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7881316643486324, | |
| "eval_loss": 0.16728526898731283, | |
| "eval_runtime": 59.4209, | |
| "eval_samples_per_second": 699.855, | |
| "eval_steps_per_second": 0.353, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.8113120074177098, | |
| "grad_norm": 0.1111670434474945, | |
| "learning_rate": 9.330127018922194e-05, | |
| "loss": 0.1342, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.8113120074177098, | |
| "eval_loss": 0.1692570258991495, | |
| "eval_runtime": 59.3557, | |
| "eval_samples_per_second": 700.624, | |
| "eval_steps_per_second": 0.354, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.8344923504867872, | |
| "grad_norm": 0.06098225340247154, | |
| "learning_rate": 9.289917066174886e-05, | |
| "loss": 0.1334, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.8344923504867872, | |
| "eval_loss": 0.16652011733605857, | |
| "eval_runtime": 59.527, | |
| "eval_samples_per_second": 698.607, | |
| "eval_steps_per_second": 0.353, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.8576726935558646, | |
| "grad_norm": 0.11042412370443344, | |
| "learning_rate": 9.248627149747573e-05, | |
| "loss": 0.136, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.8576726935558646, | |
| "eval_loss": 0.16714222769914375, | |
| "eval_runtime": 59.3645, | |
| "eval_samples_per_second": 700.519, | |
| "eval_steps_per_second": 0.354, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.8808530366249421, | |
| "grad_norm": 0.09495564550161362, | |
| "learning_rate": 9.206267664155907e-05, | |
| "loss": 0.1349, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.8808530366249421, | |
| "eval_loss": 0.1690081783682819, | |
| "eval_runtime": 59.2609, | |
| "eval_samples_per_second": 701.744, | |
| "eval_steps_per_second": 0.354, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.9040333796940194, | |
| "grad_norm": 0.08535555005073547, | |
| "learning_rate": 9.162849273173857e-05, | |
| "loss": 0.1345, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.9040333796940194, | |
| "eval_loss": 0.16719838296653933, | |
| "eval_runtime": 59.4328, | |
| "eval_samples_per_second": 699.714, | |
| "eval_steps_per_second": 0.353, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.9272137227630969, | |
| "grad_norm": 0.08415450155735016, | |
| "learning_rate": 9.118382907149165e-05, | |
| "loss": 0.1332, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.9272137227630969, | |
| "eval_loss": 0.16692495198886095, | |
| "eval_runtime": 59.4174, | |
| "eval_samples_per_second": 699.895, | |
| "eval_steps_per_second": 0.353, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.9503940658321743, | |
| "grad_norm": 0.07792109996080399, | |
| "learning_rate": 9.072879760251679e-05, | |
| "loss": 0.1349, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.9503940658321743, | |
| "eval_loss": 0.16853327133732582, | |
| "eval_runtime": 59.3211, | |
| "eval_samples_per_second": 701.032, | |
| "eval_steps_per_second": 0.354, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.9735744089012517, | |
| "grad_norm": 0.09134557843208313, | |
| "learning_rate": 9.026351287655294e-05, | |
| "loss": 0.1355, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9735744089012517, | |
| "eval_loss": 0.16782760284485718, | |
| "eval_runtime": 59.1119, | |
| "eval_samples_per_second": 703.513, | |
| "eval_steps_per_second": 0.355, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9967547519703291, | |
| "grad_norm": 0.11134419590234756, | |
| "learning_rate": 8.978809202654162e-05, | |
| "loss": 0.134, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.9967547519703291, | |
| "eval_loss": 0.1670381695935501, | |
| "eval_runtime": 59.4602, | |
| "eval_samples_per_second": 699.393, | |
| "eval_steps_per_second": 0.353, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.0199350950394066, | |
| "grad_norm": 0.08943980187177658, | |
| "learning_rate": 8.930265473713938e-05, | |
| "loss": 0.1345, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.0199350950394066, | |
| "eval_loss": 0.16720885753257103, | |
| "eval_runtime": 59.8457, | |
| "eval_samples_per_second": 694.887, | |
| "eval_steps_per_second": 0.351, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.043115438108484, | |
| "grad_norm": 0.05172237753868103, | |
| "learning_rate": 8.880732321458784e-05, | |
| "loss": 0.1345, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.043115438108484, | |
| "eval_loss": 0.16808202774068384, | |
| "eval_runtime": 59.7591, | |
| "eval_samples_per_second": 695.894, | |
| "eval_steps_per_second": 0.351, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.0662957811775615, | |
| "grad_norm": 0.08457198739051819, | |
| "learning_rate": 8.83022221559489e-05, | |
| "loss": 0.1339, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.0662957811775615, | |
| "eval_loss": 0.16620651689588106, | |
| "eval_runtime": 59.8615, | |
| "eval_samples_per_second": 694.704, | |
| "eval_steps_per_second": 0.351, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.0894761242466389, | |
| "grad_norm": 0.08191724866628647, | |
| "learning_rate": 8.778747871771292e-05, | |
| "loss": 0.1333, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.0894761242466389, | |
| "eval_loss": 0.16742435976845876, | |
| "eval_runtime": 59.8699, | |
| "eval_samples_per_second": 694.606, | |
| "eval_steps_per_second": 0.351, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.1126564673157162, | |
| "grad_norm": 0.08220981061458588, | |
| "learning_rate": 8.726322248378775e-05, | |
| "loss": 0.1336, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.1126564673157162, | |
| "eval_loss": 0.16507172522149283, | |
| "eval_runtime": 59.8591, | |
| "eval_samples_per_second": 694.731, | |
| "eval_steps_per_second": 0.351, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.1358368103847938, | |
| "grad_norm": 0.11390708386898041, | |
| "learning_rate": 8.672958543287666e-05, | |
| "loss": 0.1335, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.1358368103847938, | |
| "eval_loss": 0.16567397155304947, | |
| "eval_runtime": 59.7629, | |
| "eval_samples_per_second": 695.85, | |
| "eval_steps_per_second": 0.351, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.1590171534538711, | |
| "grad_norm": 0.06390725821256638, | |
| "learning_rate": 8.618670190525352e-05, | |
| "loss": 0.1335, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.1590171534538711, | |
| "eval_loss": 0.1671167116531541, | |
| "eval_runtime": 59.5093, | |
| "eval_samples_per_second": 698.815, | |
| "eval_steps_per_second": 0.353, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.1821974965229485, | |
| "grad_norm": 0.06458276510238647, | |
| "learning_rate": 8.563470856894316e-05, | |
| "loss": 0.1322, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.1821974965229485, | |
| "eval_loss": 0.16552241982155386, | |
| "eval_runtime": 59.3646, | |
| "eval_samples_per_second": 700.519, | |
| "eval_steps_per_second": 0.354, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.205377839592026, | |
| "grad_norm": 0.07258091121912003, | |
| "learning_rate": 8.507374438531607e-05, | |
| "loss": 0.1333, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.205377839592026, | |
| "eval_loss": 0.16643385319936513, | |
| "eval_runtime": 59.7463, | |
| "eval_samples_per_second": 696.043, | |
| "eval_steps_per_second": 0.351, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.2285581826611034, | |
| "grad_norm": 0.08584043383598328, | |
| "learning_rate": 8.450395057410561e-05, | |
| "loss": 0.1325, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.2285581826611034, | |
| "eval_loss": 0.16595956749906993, | |
| "eval_runtime": 59.7537, | |
| "eval_samples_per_second": 695.957, | |
| "eval_steps_per_second": 0.351, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.2517385257301807, | |
| "grad_norm": 0.054344214498996735, | |
| "learning_rate": 8.392547057785661e-05, | |
| "loss": 0.1334, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.2517385257301807, | |
| "eval_loss": 0.165368604673745, | |
| "eval_runtime": 59.4339, | |
| "eval_samples_per_second": 699.701, | |
| "eval_steps_per_second": 0.353, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.2749188687992583, | |
| "grad_norm": 0.07332266122102737, | |
| "learning_rate": 8.333845002581458e-05, | |
| "loss": 0.1326, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.2749188687992583, | |
| "eval_loss": 0.16569167596925843, | |
| "eval_runtime": 59.5544, | |
| "eval_samples_per_second": 698.286, | |
| "eval_steps_per_second": 0.353, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.2980992118683357, | |
| "grad_norm": 0.07198917865753174, | |
| "learning_rate": 8.274303669726426e-05, | |
| "loss": 0.1323, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.2980992118683357, | |
| "eval_loss": 0.16580398198626048, | |
| "eval_runtime": 59.8451, | |
| "eval_samples_per_second": 694.894, | |
| "eval_steps_per_second": 0.351, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.321279554937413, | |
| "grad_norm": 0.09278077632188797, | |
| "learning_rate": 8.213938048432697e-05, | |
| "loss": 0.1324, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.321279554937413, | |
| "eval_loss": 0.16619885882978533, | |
| "eval_runtime": 59.6857, | |
| "eval_samples_per_second": 696.749, | |
| "eval_steps_per_second": 0.352, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.3444598980064906, | |
| "grad_norm": 0.04779389128088951, | |
| "learning_rate": 8.152763335422613e-05, | |
| "loss": 0.1327, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.3444598980064906, | |
| "eval_loss": 0.16639967239163891, | |
| "eval_runtime": 59.5347, | |
| "eval_samples_per_second": 698.517, | |
| "eval_steps_per_second": 0.353, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.367640241075568, | |
| "grad_norm": 0.0650218203663826, | |
| "learning_rate": 8.090794931103026e-05, | |
| "loss": 0.1324, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.367640241075568, | |
| "eval_loss": 0.16698249569806287, | |
| "eval_runtime": 59.4938, | |
| "eval_samples_per_second": 698.997, | |
| "eval_steps_per_second": 0.353, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.3908205841446453, | |
| "grad_norm": 0.07800327241420746, | |
| "learning_rate": 8.028048435688333e-05, | |
| "loss": 0.1325, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.3908205841446453, | |
| "eval_loss": 0.16588903849861533, | |
| "eval_runtime": 59.7308, | |
| "eval_samples_per_second": 696.223, | |
| "eval_steps_per_second": 0.352, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.4140009272137228, | |
| "grad_norm": 0.09477279335260391, | |
| "learning_rate": 7.964539645273204e-05, | |
| "loss": 0.1318, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.4140009272137228, | |
| "eval_loss": 0.16391722600570544, | |
| "eval_runtime": 59.2552, | |
| "eval_samples_per_second": 701.812, | |
| "eval_steps_per_second": 0.354, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.4371812702828002, | |
| "grad_norm": 0.061748892068862915, | |
| "learning_rate": 7.900284547855991e-05, | |
| "loss": 0.1328, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.4371812702828002, | |
| "eval_loss": 0.1664695654356475, | |
| "eval_runtime": 59.7882, | |
| "eval_samples_per_second": 695.556, | |
| "eval_steps_per_second": 0.351, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.4603616133518775, | |
| "grad_norm": 0.07277340441942215, | |
| "learning_rate": 7.835299319313853e-05, | |
| "loss": 0.1332, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.4603616133518775, | |
| "eval_loss": 0.16764915423728274, | |
| "eval_runtime": 59.7168, | |
| "eval_samples_per_second": 696.387, | |
| "eval_steps_per_second": 0.352, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.483541956420955, | |
| "grad_norm": 0.06525903195142746, | |
| "learning_rate": 7.769600319330552e-05, | |
| "loss": 0.1326, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.483541956420955, | |
| "eval_loss": 0.16491104357870506, | |
| "eval_runtime": 59.6126, | |
| "eval_samples_per_second": 697.604, | |
| "eval_steps_per_second": 0.352, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.5067222994900324, | |
| "grad_norm": 0.06889070570468903, | |
| "learning_rate": 7.703204087277988e-05, | |
| "loss": 0.1327, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.5067222994900324, | |
| "eval_loss": 0.16643899540149082, | |
| "eval_runtime": 60.0, | |
| "eval_samples_per_second": 693.1, | |
| "eval_steps_per_second": 0.35, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.5299026425591098, | |
| "grad_norm": 0.09515661001205444, | |
| "learning_rate": 7.636127338052512e-05, | |
| "loss": 0.1332, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.5299026425591098, | |
| "eval_loss": 0.16578109982118125, | |
| "eval_runtime": 60.2083, | |
| "eval_samples_per_second": 690.703, | |
| "eval_steps_per_second": 0.349, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.5530829856281874, | |
| "grad_norm": 0.06826016306877136, | |
| "learning_rate": 7.568386957867033e-05, | |
| "loss": 0.1321, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.5530829856281874, | |
| "eval_loss": 0.16615711001414799, | |
| "eval_runtime": 59.8961, | |
| "eval_samples_per_second": 694.303, | |
| "eval_steps_per_second": 0.351, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.5762633286972647, | |
| "grad_norm": 0.06259354203939438, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.1324, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.5762633286972647, | |
| "eval_loss": 0.16420639359901218, | |
| "eval_runtime": 59.8484, | |
| "eval_samples_per_second": 694.856, | |
| "eval_steps_per_second": 0.351, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.599443671766342, | |
| "grad_norm": 0.08373662084341049, | |
| "learning_rate": 7.430983680502344e-05, | |
| "loss": 0.1317, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.599443671766342, | |
| "eval_loss": 0.16580187809904914, | |
| "eval_runtime": 59.5295, | |
| "eval_samples_per_second": 698.578, | |
| "eval_steps_per_second": 0.353, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.6226240148354196, | |
| "grad_norm": 0.052068453282117844, | |
| "learning_rate": 7.361355373863414e-05, | |
| "loss": 0.1326, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.6226240148354196, | |
| "eval_loss": 0.16511726778477553, | |
| "eval_runtime": 59.3774, | |
| "eval_samples_per_second": 700.368, | |
| "eval_steps_per_second": 0.354, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.645804357904497, | |
| "grad_norm": 0.1084132120013237, | |
| "learning_rate": 7.291132608637052e-05, | |
| "loss": 0.1328, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.645804357904497, | |
| "eval_loss": 0.16512942482848092, | |
| "eval_runtime": 59.7073, | |
| "eval_samples_per_second": 696.497, | |
| "eval_steps_per_second": 0.352, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.6689847009735743, | |
| "grad_norm": 0.09590224921703339, | |
| "learning_rate": 7.220333063028872e-05, | |
| "loss": 0.1327, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.6689847009735743, | |
| "eval_loss": 0.1653536906511234, | |
| "eval_runtime": 59.8607, | |
| "eval_samples_per_second": 694.713, | |
| "eval_steps_per_second": 0.351, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.692165044042652, | |
| "grad_norm": 0.09215644001960754, | |
| "learning_rate": 7.148974560445859e-05, | |
| "loss": 0.1314, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.692165044042652, | |
| "eval_loss": 0.16392036224708054, | |
| "eval_runtime": 59.6823, | |
| "eval_samples_per_second": 696.79, | |
| "eval_steps_per_second": 0.352, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.7153453871117292, | |
| "grad_norm": 0.0847523957490921, | |
| "learning_rate": 7.077075065009433e-05, | |
| "loss": 0.1319, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.7153453871117292, | |
| "eval_loss": 0.1658360792512092, | |
| "eval_runtime": 59.6368, | |
| "eval_samples_per_second": 697.322, | |
| "eval_steps_per_second": 0.352, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.7385257301808066, | |
| "grad_norm": 0.06882014125585556, | |
| "learning_rate": 7.004652677033068e-05, | |
| "loss": 0.1308, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.7385257301808066, | |
| "eval_loss": 0.1656867715236748, | |
| "eval_runtime": 59.8626, | |
| "eval_samples_per_second": 694.691, | |
| "eval_steps_per_second": 0.351, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.7617060732498842, | |
| "grad_norm": 0.056948818266391754, | |
| "learning_rate": 6.931725628465643e-05, | |
| "loss": 0.1322, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.7617060732498842, | |
| "eval_loss": 0.16491998551370737, | |
| "eval_runtime": 59.5124, | |
| "eval_samples_per_second": 698.779, | |
| "eval_steps_per_second": 0.353, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.7848864163189615, | |
| "grad_norm": 0.04779543727636337, | |
| "learning_rate": 6.858312278301637e-05, | |
| "loss": 0.1315, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.7848864163189615, | |
| "eval_loss": 0.1649495124686108, | |
| "eval_runtime": 59.9775, | |
| "eval_samples_per_second": 693.36, | |
| "eval_steps_per_second": 0.35, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.8080667593880388, | |
| "grad_norm": 0.05969324707984924, | |
| "learning_rate": 6.784431107959359e-05, | |
| "loss": 0.1316, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.8080667593880388, | |
| "eval_loss": 0.16391757633340012, | |
| "eval_runtime": 60.0346, | |
| "eval_samples_per_second": 692.7, | |
| "eval_steps_per_second": 0.35, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.8312471024571164, | |
| "grad_norm": 0.061390358954668045, | |
| "learning_rate": 6.710100716628344e-05, | |
| "loss": 0.1312, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.8312471024571164, | |
| "eval_loss": 0.1658972028775054, | |
| "eval_runtime": 59.9663, | |
| "eval_samples_per_second": 693.489, | |
| "eval_steps_per_second": 0.35, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.8544274455261938, | |
| "grad_norm": 0.07332038879394531, | |
| "learning_rate": 6.635339816587109e-05, | |
| "loss": 0.1323, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.8544274455261938, | |
| "eval_loss": 0.1647820455194368, | |
| "eval_runtime": 59.5785, | |
| "eval_samples_per_second": 698.004, | |
| "eval_steps_per_second": 0.352, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.877607788595271, | |
| "grad_norm": 0.07641714811325073, | |
| "learning_rate": 6.560167228492436e-05, | |
| "loss": 0.132, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.877607788595271, | |
| "eval_loss": 0.16406535325266738, | |
| "eval_runtime": 60.0931, | |
| "eval_samples_per_second": 692.026, | |
| "eval_steps_per_second": 0.349, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.9007881316643487, | |
| "grad_norm": 0.08891258388757706, | |
| "learning_rate": 6.484601876641375e-05, | |
| "loss": 0.1308, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.9007881316643487, | |
| "eval_loss": 0.164731109091856, | |
| "eval_runtime": 59.8012, | |
| "eval_samples_per_second": 695.405, | |
| "eval_steps_per_second": 0.351, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.923968474733426, | |
| "grad_norm": 0.0818193256855011, | |
| "learning_rate": 6.408662784207149e-05, | |
| "loss": 0.1323, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.923968474733426, | |
| "eval_loss": 0.16444408652573528, | |
| "eval_runtime": 59.6523, | |
| "eval_samples_per_second": 697.14, | |
| "eval_steps_per_second": 0.352, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.9471488178025034, | |
| "grad_norm": 0.05766776204109192, | |
| "learning_rate": 6.332369068450174e-05, | |
| "loss": 0.131, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.9471488178025034, | |
| "eval_loss": 0.1630568549542592, | |
| "eval_runtime": 59.9782, | |
| "eval_samples_per_second": 693.352, | |
| "eval_steps_per_second": 0.35, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.970329160871581, | |
| "grad_norm": 0.07093872129917145, | |
| "learning_rate": 6.255739935905396e-05, | |
| "loss": 0.1313, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.970329160871581, | |
| "eval_loss": 0.16320942743206068, | |
| "eval_runtime": 59.7408, | |
| "eval_samples_per_second": 696.107, | |
| "eval_steps_per_second": 0.352, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.9935095039406583, | |
| "grad_norm": 0.051636241376399994, | |
| "learning_rate": 6.178794677547137e-05, | |
| "loss": 0.1309, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.9935095039406583, | |
| "eval_loss": 0.16439976264264172, | |
| "eval_runtime": 59.7092, | |
| "eval_samples_per_second": 696.476, | |
| "eval_steps_per_second": 0.352, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.0166898470097356, | |
| "grad_norm": 0.05819587782025337, | |
| "learning_rate": 6.1015526639327035e-05, | |
| "loss": 0.1319, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.0166898470097356, | |
| "eval_loss": 0.16432355870633325, | |
| "eval_runtime": 59.2592, | |
| "eval_samples_per_second": 701.765, | |
| "eval_steps_per_second": 0.354, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.039870190078813, | |
| "grad_norm": 0.07939411699771881, | |
| "learning_rate": 6.024033340325954e-05, | |
| "loss": 0.1316, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.039870190078813, | |
| "eval_loss": 0.1641168338494948, | |
| "eval_runtime": 59.9534, | |
| "eval_samples_per_second": 693.639, | |
| "eval_steps_per_second": 0.35, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.0630505331478908, | |
| "grad_norm": 0.07020165026187897, | |
| "learning_rate": 5.946256221802051e-05, | |
| "loss": 0.1312, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.0630505331478908, | |
| "eval_loss": 0.1633037564118911, | |
| "eval_runtime": 60.3433, | |
| "eval_samples_per_second": 689.157, | |
| "eval_steps_per_second": 0.348, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.086230876216968, | |
| "grad_norm": 0.07000721246004105, | |
| "learning_rate": 5.868240888334653e-05, | |
| "loss": 0.1313, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.086230876216968, | |
| "eval_loss": 0.1646367282392535, | |
| "eval_runtime": 60.5726, | |
| "eval_samples_per_second": 686.548, | |
| "eval_steps_per_second": 0.347, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.1094112192860455, | |
| "grad_norm": 0.06988826394081116, | |
| "learning_rate": 5.79000697986675e-05, | |
| "loss": 0.1316, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.1094112192860455, | |
| "eval_loss": 0.16286425765036744, | |
| "eval_runtime": 60.2061, | |
| "eval_samples_per_second": 690.727, | |
| "eval_steps_per_second": 0.349, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.132591562355123, | |
| "grad_norm": 0.0749220922589302, | |
| "learning_rate": 5.7115741913664264e-05, | |
| "loss": 0.1306, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.132591562355123, | |
| "eval_loss": 0.1643572569196068, | |
| "eval_runtime": 59.9586, | |
| "eval_samples_per_second": 693.579, | |
| "eval_steps_per_second": 0.35, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.1557719054242, | |
| "grad_norm": 0.06533892452716827, | |
| "learning_rate": 5.6329622678687463e-05, | |
| "loss": 0.1313, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.1557719054242, | |
| "eval_loss": 0.1635978048832001, | |
| "eval_runtime": 59.6271, | |
| "eval_samples_per_second": 697.435, | |
| "eval_steps_per_second": 0.352, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.1789522484932777, | |
| "grad_norm": 0.07881616055965424, | |
| "learning_rate": 5.5541909995050554e-05, | |
| "loss": 0.131, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.1789522484932777, | |
| "eval_loss": 0.1634715372028324, | |
| "eval_runtime": 59.564, | |
| "eval_samples_per_second": 698.173, | |
| "eval_steps_per_second": 0.353, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.2021325915623553, | |
| "grad_norm": 0.05812694877386093, | |
| "learning_rate": 5.475280216520913e-05, | |
| "loss": 0.1311, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.2021325915623553, | |
| "eval_loss": 0.1636915707335646, | |
| "eval_runtime": 59.9343, | |
| "eval_samples_per_second": 693.86, | |
| "eval_steps_per_second": 0.35, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.2253129346314324, | |
| "grad_norm": 0.09842361509799957, | |
| "learning_rate": 5.396249784283942e-05, | |
| "loss": 0.1315, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.2253129346314324, | |
| "eval_loss": 0.16410182317726912, | |
| "eval_runtime": 60.4431, | |
| "eval_samples_per_second": 688.019, | |
| "eval_steps_per_second": 0.347, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.24849327770051, | |
| "grad_norm": 0.05664157494902611, | |
| "learning_rate": 5.317119598282823e-05, | |
| "loss": 0.1314, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.24849327770051, | |
| "eval_loss": 0.16405877684845893, | |
| "eval_runtime": 60.2757, | |
| "eval_samples_per_second": 689.93, | |
| "eval_steps_per_second": 0.348, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.2716736207695876, | |
| "grad_norm": 0.08323252946138382, | |
| "learning_rate": 5.2379095791187124e-05, | |
| "loss": 0.1306, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.2716736207695876, | |
| "eval_loss": 0.16356865120524391, | |
| "eval_runtime": 60.2036, | |
| "eval_samples_per_second": 690.756, | |
| "eval_steps_per_second": 0.349, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.2948539638386647, | |
| "grad_norm": 0.07163384556770325, | |
| "learning_rate": 5.158639667490339e-05, | |
| "loss": 0.1314, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.2948539638386647, | |
| "eval_loss": 0.16350787082313517, | |
| "eval_runtime": 59.6657, | |
| "eval_samples_per_second": 696.983, | |
| "eval_steps_per_second": 0.352, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.3180343069077423, | |
| "grad_norm": 0.07729226350784302, | |
| "learning_rate": 5.0793298191740404e-05, | |
| "loss": 0.1321, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.3180343069077423, | |
| "eval_loss": 0.16284041257465698, | |
| "eval_runtime": 60.3671, | |
| "eval_samples_per_second": 688.886, | |
| "eval_steps_per_second": 0.348, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.34121464997682, | |
| "grad_norm": 0.07920071482658386, | |
| "learning_rate": 5e-05, | |
| "loss": 0.13, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.34121464997682, | |
| "eval_loss": 0.16350252303966548, | |
| "eval_runtime": 60.0663, | |
| "eval_samples_per_second": 692.335, | |
| "eval_steps_per_second": 0.35, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.364394993045897, | |
| "grad_norm": 0.05213838815689087, | |
| "learning_rate": 4.92067018082596e-05, | |
| "loss": 0.1315, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.364394993045897, | |
| "eval_loss": 0.1640868928554377, | |
| "eval_runtime": 60.1323, | |
| "eval_samples_per_second": 691.575, | |
| "eval_steps_per_second": 0.349, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.3875753361149745, | |
| "grad_norm": 0.06551820039749146, | |
| "learning_rate": 4.841360332509663e-05, | |
| "loss": 0.1311, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.3875753361149745, | |
| "eval_loss": 0.16375304166425866, | |
| "eval_runtime": 60.0889, | |
| "eval_samples_per_second": 692.074, | |
| "eval_steps_per_second": 0.349, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.410755679184052, | |
| "grad_norm": 0.06602519750595093, | |
| "learning_rate": 4.762090420881289e-05, | |
| "loss": 0.1304, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.410755679184052, | |
| "eval_loss": 0.1646718036775546, | |
| "eval_runtime": 60.1839, | |
| "eval_samples_per_second": 690.982, | |
| "eval_steps_per_second": 0.349, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.433936022253129, | |
| "grad_norm": 0.050050172954797745, | |
| "learning_rate": 4.6828804017171776e-05, | |
| "loss": 0.131, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 2.433936022253129, | |
| "eval_loss": 0.16238808458996815, | |
| "eval_runtime": 60.2346, | |
| "eval_samples_per_second": 690.401, | |
| "eval_steps_per_second": 0.349, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 2.457116365322207, | |
| "grad_norm": 0.06192226707935333, | |
| "learning_rate": 4.603750215716057e-05, | |
| "loss": 0.131, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.457116365322207, | |
| "eval_loss": 0.1633245686996306, | |
| "eval_runtime": 59.5691, | |
| "eval_samples_per_second": 698.114, | |
| "eval_steps_per_second": 0.353, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.4802967083912844, | |
| "grad_norm": 0.07729701697826385, | |
| "learning_rate": 4.5247197834790876e-05, | |
| "loss": 0.1308, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 2.4802967083912844, | |
| "eval_loss": 0.16388068444979556, | |
| "eval_runtime": 60.3853, | |
| "eval_samples_per_second": 688.677, | |
| "eval_steps_per_second": 0.348, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 2.5034770514603615, | |
| "grad_norm": 0.07346878945827484, | |
| "learning_rate": 4.445809000494946e-05, | |
| "loss": 0.1314, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.5034770514603615, | |
| "eval_loss": 0.16427215786452162, | |
| "eval_runtime": 60.0462, | |
| "eval_samples_per_second": 692.567, | |
| "eval_steps_per_second": 0.35, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.526657394529439, | |
| "grad_norm": 0.08765513449907303, | |
| "learning_rate": 4.3670377321312535e-05, | |
| "loss": 0.1307, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 2.526657394529439, | |
| "eval_loss": 0.16308954695612046, | |
| "eval_runtime": 59.7344, | |
| "eval_samples_per_second": 696.181, | |
| "eval_steps_per_second": 0.352, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 2.5498377375985166, | |
| "grad_norm": 0.04856225475668907, | |
| "learning_rate": 4.288425808633575e-05, | |
| "loss": 0.1314, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.5498377375985166, | |
| "eval_loss": 0.1634677289958684, | |
| "eval_runtime": 60.5651, | |
| "eval_samples_per_second": 686.633, | |
| "eval_steps_per_second": 0.347, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.5730180806675937, | |
| "grad_norm": 0.07033301144838333, | |
| "learning_rate": 4.20999302013325e-05, | |
| "loss": 0.1303, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 2.5730180806675937, | |
| "eval_loss": 0.16350203952668135, | |
| "eval_runtime": 59.7363, | |
| "eval_samples_per_second": 696.16, | |
| "eval_steps_per_second": 0.352, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 2.5961984237366713, | |
| "grad_norm": 0.07352133840322495, | |
| "learning_rate": 4.131759111665349e-05, | |
| "loss": 0.1304, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.5961984237366713, | |
| "eval_loss": 0.16306162076252775, | |
| "eval_runtime": 60.0517, | |
| "eval_samples_per_second": 692.503, | |
| "eval_steps_per_second": 0.35, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.619378766805749, | |
| "grad_norm": 0.05432264879345894, | |
| "learning_rate": 4.0537437781979506e-05, | |
| "loss": 0.1298, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 2.619378766805749, | |
| "eval_loss": 0.16234816348528708, | |
| "eval_runtime": 60.3645, | |
| "eval_samples_per_second": 688.915, | |
| "eval_steps_per_second": 0.348, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 2.642559109874826, | |
| "grad_norm": 0.04657018184661865, | |
| "learning_rate": 3.9759666596740476e-05, | |
| "loss": 0.1305, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.642559109874826, | |
| "eval_loss": 0.16270628350418626, | |
| "eval_runtime": 60.0809, | |
| "eval_samples_per_second": 692.167, | |
| "eval_steps_per_second": 0.35, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.6657394529439036, | |
| "grad_norm": 0.04448065161705017, | |
| "learning_rate": 3.898447336067297e-05, | |
| "loss": 0.1308, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.6657394529439036, | |
| "eval_loss": 0.162430409584318, | |
| "eval_runtime": 59.8634, | |
| "eval_samples_per_second": 694.682, | |
| "eval_steps_per_second": 0.351, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.688919796012981, | |
| "grad_norm": 0.047300901263952255, | |
| "learning_rate": 3.821205322452863e-05, | |
| "loss": 0.1306, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.688919796012981, | |
| "eval_loss": 0.163914834923588, | |
| "eval_runtime": 59.9699, | |
| "eval_samples_per_second": 693.447, | |
| "eval_steps_per_second": 0.35, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.7121001390820583, | |
| "grad_norm": 0.09371935576200485, | |
| "learning_rate": 3.744260064094604e-05, | |
| "loss": 0.1303, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.7121001390820583, | |
| "eval_loss": 0.16325797910827158, | |
| "eval_runtime": 60.1596, | |
| "eval_samples_per_second": 691.261, | |
| "eval_steps_per_second": 0.349, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.735280482151136, | |
| "grad_norm": 0.0451604500412941, | |
| "learning_rate": 3.6676309315498256e-05, | |
| "loss": 0.131, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.735280482151136, | |
| "eval_loss": 0.16252548129222377, | |
| "eval_runtime": 60.0104, | |
| "eval_samples_per_second": 692.98, | |
| "eval_steps_per_second": 0.35, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.7584608252202134, | |
| "grad_norm": 0.058029964566230774, | |
| "learning_rate": 3.591337215792852e-05, | |
| "loss": 0.1305, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.7584608252202134, | |
| "eval_loss": 0.16366348885138793, | |
| "eval_runtime": 60.372, | |
| "eval_samples_per_second": 688.83, | |
| "eval_steps_per_second": 0.348, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.7816411682892905, | |
| "grad_norm": 0.09429273754358292, | |
| "learning_rate": 3.515398123358627e-05, | |
| "loss": 0.1307, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.7816411682892905, | |
| "eval_loss": 0.1623218584160889, | |
| "eval_runtime": 59.5435, | |
| "eval_samples_per_second": 698.413, | |
| "eval_steps_per_second": 0.353, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.804821511358368, | |
| "grad_norm": 0.05752315744757652, | |
| "learning_rate": 3.439832771507565e-05, | |
| "loss": 0.1296, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.804821511358368, | |
| "eval_loss": 0.16326439732289802, | |
| "eval_runtime": 59.8306, | |
| "eval_samples_per_second": 695.063, | |
| "eval_steps_per_second": 0.351, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.8280018544274457, | |
| "grad_norm": 0.07225628942251205, | |
| "learning_rate": 3.364660183412892e-05, | |
| "loss": 0.1312, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.8280018544274457, | |
| "eval_loss": 0.16322279137718054, | |
| "eval_runtime": 59.8418, | |
| "eval_samples_per_second": 694.932, | |
| "eval_steps_per_second": 0.351, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.851182197496523, | |
| "grad_norm": 0.06712605059146881, | |
| "learning_rate": 3.289899283371657e-05, | |
| "loss": 0.1305, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.851182197496523, | |
| "eval_loss": 0.16403909400299824, | |
| "eval_runtime": 59.4766, | |
| "eval_samples_per_second": 699.199, | |
| "eval_steps_per_second": 0.353, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.8743625405656004, | |
| "grad_norm": 0.0743350014090538, | |
| "learning_rate": 3.215568892040641e-05, | |
| "loss": 0.1303, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.8743625405656004, | |
| "eval_loss": 0.16315215653435175, | |
| "eval_runtime": 60.0127, | |
| "eval_samples_per_second": 692.953, | |
| "eval_steps_per_second": 0.35, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.897542883634678, | |
| "grad_norm": 0.07467668503522873, | |
| "learning_rate": 3.141687721698363e-05, | |
| "loss": 0.1302, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.897542883634678, | |
| "eval_loss": 0.16213396084813272, | |
| "eval_runtime": 59.8922, | |
| "eval_samples_per_second": 694.348, | |
| "eval_steps_per_second": 0.351, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.920723226703755, | |
| "grad_norm": 0.050527870655059814, | |
| "learning_rate": 3.0682743715343564e-05, | |
| "loss": 0.1298, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.920723226703755, | |
| "eval_loss": 0.16243251733829123, | |
| "eval_runtime": 60.3601, | |
| "eval_samples_per_second": 688.965, | |
| "eval_steps_per_second": 0.348, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.9439035697728326, | |
| "grad_norm": 0.05331522971391678, | |
| "learning_rate": 2.9953473229669328e-05, | |
| "loss": 0.1313, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.9439035697728326, | |
| "eval_loss": 0.16321332234047015, | |
| "eval_runtime": 60.2034, | |
| "eval_samples_per_second": 690.759, | |
| "eval_steps_per_second": 0.349, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.96708391284191, | |
| "grad_norm": 0.0566866509616375, | |
| "learning_rate": 2.9229249349905684e-05, | |
| "loss": 0.1304, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.96708391284191, | |
| "eval_loss": 0.1623781732971581, | |
| "eval_runtime": 60.2575, | |
| "eval_samples_per_second": 690.138, | |
| "eval_steps_per_second": 0.349, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.9902642559109873, | |
| "grad_norm": 0.0674847662448883, | |
| "learning_rate": 2.851025439554142e-05, | |
| "loss": 0.13, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.9902642559109873, | |
| "eval_loss": 0.163704374422533, | |
| "eval_runtime": 60.1942, | |
| "eval_samples_per_second": 690.864, | |
| "eval_steps_per_second": 0.349, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 3.013444598980065, | |
| "grad_norm": 0.05663591995835304, | |
| "learning_rate": 2.7796669369711294e-05, | |
| "loss": 0.1313, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.013444598980065, | |
| "eval_loss": 0.16296213660440473, | |
| "eval_runtime": 60.9459, | |
| "eval_samples_per_second": 682.343, | |
| "eval_steps_per_second": 0.345, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.0366249420491425, | |
| "grad_norm": 0.06456530839204788, | |
| "learning_rate": 2.708867391362948e-05, | |
| "loss": 0.131, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 3.0366249420491425, | |
| "eval_loss": 0.16119627636966075, | |
| "eval_runtime": 60.6451, | |
| "eval_samples_per_second": 685.727, | |
| "eval_steps_per_second": 0.346, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 3.0598052851182196, | |
| "grad_norm": 0.05969541519880295, | |
| "learning_rate": 2.638644626136587e-05, | |
| "loss": 0.1311, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.0598052851182196, | |
| "eval_loss": 0.16205494320222197, | |
| "eval_runtime": 60.4532, | |
| "eval_samples_per_second": 687.904, | |
| "eval_steps_per_second": 0.347, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.082985628187297, | |
| "grad_norm": 0.06604834645986557, | |
| "learning_rate": 2.5690163194976575e-05, | |
| "loss": 0.1301, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 3.082985628187297, | |
| "eval_loss": 0.16191228875556468, | |
| "eval_runtime": 60.3489, | |
| "eval_samples_per_second": 689.093, | |
| "eval_steps_per_second": 0.348, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 3.1061659712563747, | |
| "grad_norm": 0.06501331180334091, | |
| "learning_rate": 2.500000000000001e-05, | |
| "loss": 0.1298, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.1061659712563747, | |
| "eval_loss": 0.16219026561577268, | |
| "eval_runtime": 60.2703, | |
| "eval_samples_per_second": 689.992, | |
| "eval_steps_per_second": 0.348, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.129346314325452, | |
| "grad_norm": 0.056004952639341354, | |
| "learning_rate": 2.4316130421329697e-05, | |
| "loss": 0.1302, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.129346314325452, | |
| "eval_loss": 0.16085075220051892, | |
| "eval_runtime": 60.336, | |
| "eval_samples_per_second": 689.24, | |
| "eval_steps_per_second": 0.348, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.1525266573945294, | |
| "grad_norm": 0.06331496685743332, | |
| "learning_rate": 2.363872661947488e-05, | |
| "loss": 0.1311, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.1525266573945294, | |
| "eval_loss": 0.16229801712553438, | |
| "eval_runtime": 59.8727, | |
| "eval_samples_per_second": 694.573, | |
| "eval_steps_per_second": 0.351, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.175707000463607, | |
| "grad_norm": 0.05851437896490097, | |
| "learning_rate": 2.296795912722014e-05, | |
| "loss": 0.1304, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 3.175707000463607, | |
| "eval_loss": 0.1624837018550472, | |
| "eval_runtime": 60.0768, | |
| "eval_samples_per_second": 692.214, | |
| "eval_steps_per_second": 0.35, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 3.198887343532684, | |
| "grad_norm": 0.06251411885023117, | |
| "learning_rate": 2.2303996806694488e-05, | |
| "loss": 0.1306, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 3.198887343532684, | |
| "eval_loss": 0.16152431005864756, | |
| "eval_runtime": 60.0137, | |
| "eval_samples_per_second": 692.942, | |
| "eval_steps_per_second": 0.35, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 3.2220676866017617, | |
| "grad_norm": 0.055478889495134354, | |
| "learning_rate": 2.164700680686147e-05, | |
| "loss": 0.1302, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 3.2220676866017617, | |
| "eval_loss": 0.16217289975188992, | |
| "eval_runtime": 59.7201, | |
| "eval_samples_per_second": 696.349, | |
| "eval_steps_per_second": 0.352, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 3.2452480296708393, | |
| "grad_norm": 0.04695391282439232, | |
| "learning_rate": 2.09971545214401e-05, | |
| "loss": 0.1307, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.2452480296708393, | |
| "eval_loss": 0.16233282789861117, | |
| "eval_runtime": 60.2382, | |
| "eval_samples_per_second": 690.359, | |
| "eval_steps_per_second": 0.349, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.2684283727399164, | |
| "grad_norm": 0.05719252675771713, | |
| "learning_rate": 2.0354603547267985e-05, | |
| "loss": 0.1302, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 3.2684283727399164, | |
| "eval_loss": 0.16257561894818798, | |
| "eval_runtime": 59.9661, | |
| "eval_samples_per_second": 693.491, | |
| "eval_steps_per_second": 0.35, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 3.291608715808994, | |
| "grad_norm": 0.05995924398303032, | |
| "learning_rate": 1.9719515643116674e-05, | |
| "loss": 0.1296, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 3.291608715808994, | |
| "eval_loss": 0.1621910867534911, | |
| "eval_runtime": 59.9872, | |
| "eval_samples_per_second": 693.248, | |
| "eval_steps_per_second": 0.35, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 3.3147890588780715, | |
| "grad_norm": 0.06421925872564316, | |
| "learning_rate": 1.9092050688969738e-05, | |
| "loss": 0.1321, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 3.3147890588780715, | |
| "eval_loss": 0.16221412998892937, | |
| "eval_runtime": 59.9186, | |
| "eval_samples_per_second": 694.042, | |
| "eval_steps_per_second": 0.35, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 3.3379694019471486, | |
| "grad_norm": 0.04900297895073891, | |
| "learning_rate": 1.847236664577389e-05, | |
| "loss": 0.1307, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 3.3379694019471486, | |
| "eval_loss": 0.16276321033314364, | |
| "eval_runtime": 59.8713, | |
| "eval_samples_per_second": 694.59, | |
| "eval_steps_per_second": 0.351, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 3.361149745016226, | |
| "grad_norm": 0.06865038722753525, | |
| "learning_rate": 1.7860619515673033e-05, | |
| "loss": 0.1301, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 3.361149745016226, | |
| "eval_loss": 0.16246198975876286, | |
| "eval_runtime": 60.1366, | |
| "eval_samples_per_second": 691.525, | |
| "eval_steps_per_second": 0.349, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 3.384330088085304, | |
| "grad_norm": 0.060604266822338104, | |
| "learning_rate": 1.725696330273575e-05, | |
| "loss": 0.1307, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 3.384330088085304, | |
| "eval_loss": 0.16321014850841353, | |
| "eval_runtime": 60.2328, | |
| "eval_samples_per_second": 690.421, | |
| "eval_steps_per_second": 0.349, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 3.407510431154381, | |
| "grad_norm": 0.061620261520147324, | |
| "learning_rate": 1.6661549974185424e-05, | |
| "loss": 0.1305, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 3.407510431154381, | |
| "eval_loss": 0.1627398211288196, | |
| "eval_runtime": 60.2249, | |
| "eval_samples_per_second": 690.512, | |
| "eval_steps_per_second": 0.349, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 3.4306907742234585, | |
| "grad_norm": 0.046630218625068665, | |
| "learning_rate": 1.60745294221434e-05, | |
| "loss": 0.1303, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 3.4306907742234585, | |
| "eval_loss": 0.16263843892878527, | |
| "eval_runtime": 59.9369, | |
| "eval_samples_per_second": 693.829, | |
| "eval_steps_per_second": 0.35, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 3.453871117292536, | |
| "grad_norm": 0.06071937829256058, | |
| "learning_rate": 1.549604942589441e-05, | |
| "loss": 0.13, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 3.453871117292536, | |
| "eval_loss": 0.1624999877883929, | |
| "eval_runtime": 59.843, | |
| "eval_samples_per_second": 694.919, | |
| "eval_steps_per_second": 0.351, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 3.477051460361613, | |
| "grad_norm": 0.0633426085114479, | |
| "learning_rate": 1.4926255614683932e-05, | |
| "loss": 0.1288, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.477051460361613, | |
| "eval_loss": 0.1632884555568049, | |
| "eval_runtime": 59.8153, | |
| "eval_samples_per_second": 695.24, | |
| "eval_steps_per_second": 0.351, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.5002318034306907, | |
| "grad_norm": 0.06753742694854736, | |
| "learning_rate": 1.4365291431056871e-05, | |
| "loss": 0.1301, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 3.5002318034306907, | |
| "eval_loss": 0.16175284084180716, | |
| "eval_runtime": 59.9226, | |
| "eval_samples_per_second": 693.995, | |
| "eval_steps_per_second": 0.35, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 3.5234121464997683, | |
| "grad_norm": 0.05140328034758568, | |
| "learning_rate": 1.3813298094746491e-05, | |
| "loss": 0.1304, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 3.5234121464997683, | |
| "eval_loss": 0.16199897513596326, | |
| "eval_runtime": 59.9917, | |
| "eval_samples_per_second": 693.196, | |
| "eval_steps_per_second": 0.35, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 3.5465924895688454, | |
| "grad_norm": 0.054956089705228806, | |
| "learning_rate": 1.327041456712334e-05, | |
| "loss": 0.1303, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 3.5465924895688454, | |
| "eval_loss": 0.16214041701821919, | |
| "eval_runtime": 59.9306, | |
| "eval_samples_per_second": 693.903, | |
| "eval_steps_per_second": 0.35, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 3.569772832637923, | |
| "grad_norm": 0.059684716165065765, | |
| "learning_rate": 1.2736777516212266e-05, | |
| "loss": 0.1308, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 3.569772832637923, | |
| "eval_loss": 0.16299972612079205, | |
| "eval_runtime": 59.9509, | |
| "eval_samples_per_second": 693.668, | |
| "eval_steps_per_second": 0.35, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 3.5929531757070006, | |
| "grad_norm": 0.059858404099941254, | |
| "learning_rate": 1.2212521282287092e-05, | |
| "loss": 0.1297, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 3.5929531757070006, | |
| "eval_loss": 0.1621026389214657, | |
| "eval_runtime": 60.4063, | |
| "eval_samples_per_second": 688.438, | |
| "eval_steps_per_second": 0.348, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 3.6161335187760777, | |
| "grad_norm": 0.07229738682508469, | |
| "learning_rate": 1.1697777844051105e-05, | |
| "loss": 0.13, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 3.6161335187760777, | |
| "eval_loss": 0.16179662531772324, | |
| "eval_runtime": 60.0141, | |
| "eval_samples_per_second": 692.937, | |
| "eval_steps_per_second": 0.35, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 3.6393138618451553, | |
| "grad_norm": 0.058062318712472916, | |
| "learning_rate": 1.1192676785412154e-05, | |
| "loss": 0.1305, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 3.6393138618451553, | |
| "eval_loss": 0.16283961568372932, | |
| "eval_runtime": 59.7657, | |
| "eval_samples_per_second": 695.817, | |
| "eval_steps_per_second": 0.351, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 3.662494204914233, | |
| "grad_norm": 0.053812187165021896, | |
| "learning_rate": 1.0697345262860636e-05, | |
| "loss": 0.1314, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 3.662494204914233, | |
| "eval_loss": 0.16244345922930356, | |
| "eval_runtime": 60.3156, | |
| "eval_samples_per_second": 689.474, | |
| "eval_steps_per_second": 0.348, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 3.68567454798331, | |
| "grad_norm": 0.05528152361512184, | |
| "learning_rate": 1.021190797345839e-05, | |
| "loss": 0.1299, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 3.68567454798331, | |
| "eval_loss": 0.1616077000723995, | |
| "eval_runtime": 60.1023, | |
| "eval_samples_per_second": 691.92, | |
| "eval_steps_per_second": 0.349, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 3.7088548910523875, | |
| "grad_norm": 0.04686369001865387, | |
| "learning_rate": 9.73648712344707e-06, | |
| "loss": 0.1294, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.7088548910523875, | |
| "eval_loss": 0.16104961568942824, | |
| "eval_runtime": 60.1871, | |
| "eval_samples_per_second": 690.946, | |
| "eval_steps_per_second": 0.349, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.732035234121465, | |
| "grad_norm": 0.04791761189699173, | |
| "learning_rate": 9.271202397483215e-06, | |
| "loss": 0.1293, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 3.732035234121465, | |
| "eval_loss": 0.16180993744676672, | |
| "eval_runtime": 60.3514, | |
| "eval_samples_per_second": 689.064, | |
| "eval_steps_per_second": 0.348, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 3.755215577190542, | |
| "grad_norm": 0.0580659918487072, | |
| "learning_rate": 8.816170928508365e-06, | |
| "loss": 0.1303, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 3.755215577190542, | |
| "eval_loss": 0.16161086084498935, | |
| "eval_runtime": 60.0527, | |
| "eval_samples_per_second": 692.491, | |
| "eval_steps_per_second": 0.35, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 3.77839592025962, | |
| "grad_norm": 0.0652560144662857, | |
| "learning_rate": 8.371507268261437e-06, | |
| "loss": 0.1318, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 3.77839592025962, | |
| "eval_loss": 0.16206722540467366, | |
| "eval_runtime": 60.2482, | |
| "eval_samples_per_second": 690.244, | |
| "eval_steps_per_second": 0.349, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 3.8015762633286974, | |
| "grad_norm": 0.07411529868841171, | |
| "learning_rate": 7.937323358440935e-06, | |
| "loss": 0.1295, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 3.8015762633286974, | |
| "eval_loss": 0.1613134364148254, | |
| "eval_runtime": 60.1488, | |
| "eval_samples_per_second": 691.385, | |
| "eval_steps_per_second": 0.349, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 3.8247566063977745, | |
| "grad_norm": 0.05504234880208969, | |
| "learning_rate": 7.513728502524286e-06, | |
| "loss": 0.1309, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 3.8247566063977745, | |
| "eval_loss": 0.16200784640385216, | |
| "eval_runtime": 60.4444, | |
| "eval_samples_per_second": 688.004, | |
| "eval_steps_per_second": 0.347, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 3.847936949466852, | |
| "grad_norm": 0.053017448633909225, | |
| "learning_rate": 7.100829338251147e-06, | |
| "loss": 0.1288, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 3.847936949466852, | |
| "eval_loss": 0.1614959925734419, | |
| "eval_runtime": 60.1621, | |
| "eval_samples_per_second": 691.232, | |
| "eval_steps_per_second": 0.349, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 3.8711172925359296, | |
| "grad_norm": 0.055434294044971466, | |
| "learning_rate": 6.698729810778065e-06, | |
| "loss": 0.1296, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 3.8711172925359296, | |
| "eval_loss": 0.16227277423563163, | |
| "eval_runtime": 60.4168, | |
| "eval_samples_per_second": 688.318, | |
| "eval_steps_per_second": 0.348, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 3.8942976356050067, | |
| "grad_norm": 0.06720498204231262, | |
| "learning_rate": 6.3075311465107535e-06, | |
| "loss": 0.1302, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 3.8942976356050067, | |
| "eval_loss": 0.16212167182684745, | |
| "eval_runtime": 60.4209, | |
| "eval_samples_per_second": 688.271, | |
| "eval_steps_per_second": 0.348, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 3.9174779786740843, | |
| "grad_norm": 0.061678655445575714, | |
| "learning_rate": 5.927331827620903e-06, | |
| "loss": 0.1303, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 3.9174779786740843, | |
| "eval_loss": 0.16245438240537732, | |
| "eval_runtime": 60.3802, | |
| "eval_samples_per_second": 688.735, | |
| "eval_steps_per_second": 0.348, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 3.940658321743162, | |
| "grad_norm": 0.05170401930809021, | |
| "learning_rate": 5.558227567253832e-06, | |
| "loss": 0.1296, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.940658321743162, | |
| "eval_loss": 0.16238415050768779, | |
| "eval_runtime": 59.8171, | |
| "eval_samples_per_second": 695.22, | |
| "eval_steps_per_second": 0.351, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.963838664812239, | |
| "grad_norm": 0.047940943390131, | |
| "learning_rate": 5.200311285433213e-06, | |
| "loss": 0.1302, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 3.963838664812239, | |
| "eval_loss": 0.1614615212760627, | |
| "eval_runtime": 60.3377, | |
| "eval_samples_per_second": 689.221, | |
| "eval_steps_per_second": 0.348, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 3.9870190078813166, | |
| "grad_norm": 0.05732366070151329, | |
| "learning_rate": 4.853673085668947e-06, | |
| "loss": 0.1311, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 3.9870190078813166, | |
| "eval_loss": 0.16182338333614685, | |
| "eval_runtime": 59.83, | |
| "eval_samples_per_second": 695.07, | |
| "eval_steps_per_second": 0.351, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 4.010199350950394, | |
| "grad_norm": 0.04801890626549721, | |
| "learning_rate": 4.5184002322740785e-06, | |
| "loss": 0.13, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 4.010199350950394, | |
| "eval_loss": 0.16167779009605182, | |
| "eval_runtime": 60.1344, | |
| "eval_samples_per_second": 691.551, | |
| "eval_steps_per_second": 0.349, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 4.033379694019471, | |
| "grad_norm": 0.04426449164748192, | |
| "learning_rate": 4.19457712839652e-06, | |
| "loss": 0.1299, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 4.033379694019471, | |
| "eval_loss": 0.16225696461065126, | |
| "eval_runtime": 60.1286, | |
| "eval_samples_per_second": 691.618, | |
| "eval_steps_per_second": 0.349, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 4.056560037088549, | |
| "grad_norm": 0.04997009411454201, | |
| "learning_rate": 3.8822852947709375e-06, | |
| "loss": 0.1302, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 4.056560037088549, | |
| "eval_loss": 0.1626912588154907, | |
| "eval_runtime": 60.4545, | |
| "eval_samples_per_second": 687.889, | |
| "eval_steps_per_second": 0.347, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 4.079740380157626, | |
| "grad_norm": 0.05177464708685875, | |
| "learning_rate": 3.581603349196372e-06, | |
| "loss": 0.1302, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 4.079740380157626, | |
| "eval_loss": 0.16124445235835394, | |
| "eval_runtime": 60.5756, | |
| "eval_samples_per_second": 686.514, | |
| "eval_steps_per_second": 0.347, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 4.1029207232267035, | |
| "grad_norm": 0.050131019204854965, | |
| "learning_rate": 3.2926069867446675e-06, | |
| "loss": 0.1308, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 4.1029207232267035, | |
| "eval_loss": 0.16266127792106785, | |
| "eval_runtime": 60.3334, | |
| "eval_samples_per_second": 689.27, | |
| "eval_steps_per_second": 0.348, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 4.1261010662957815, | |
| "grad_norm": 0.05185890197753906, | |
| "learning_rate": 3.0153689607045845e-06, | |
| "loss": 0.1298, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 4.1261010662957815, | |
| "eval_loss": 0.16332698150424974, | |
| "eval_runtime": 59.9427, | |
| "eval_samples_per_second": 693.763, | |
| "eval_steps_per_second": 0.35, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 4.149281409364859, | |
| "grad_norm": 0.040892358869314194, | |
| "learning_rate": 2.7499590642665774e-06, | |
| "loss": 0.1297, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 4.149281409364859, | |
| "eval_loss": 0.16260406271159317, | |
| "eval_runtime": 60.3006, | |
| "eval_samples_per_second": 689.645, | |
| "eval_steps_per_second": 0.348, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 4.172461752433936, | |
| "grad_norm": 0.05322985723614693, | |
| "learning_rate": 2.496444112952734e-06, | |
| "loss": 0.1298, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.172461752433936, | |
| "eval_loss": 0.16165919748334914, | |
| "eval_runtime": 59.7702, | |
| "eval_samples_per_second": 695.765, | |
| "eval_steps_per_second": 0.351, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.195642095503014, | |
| "grad_norm": 0.04688135161995888, | |
| "learning_rate": 2.2548879277963064e-06, | |
| "loss": 0.1304, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 4.195642095503014, | |
| "eval_loss": 0.16230118168852276, | |
| "eval_runtime": 59.7759, | |
| "eval_samples_per_second": 695.699, | |
| "eval_steps_per_second": 0.351, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 4.218822438572091, | |
| "grad_norm": 0.056906215846538544, | |
| "learning_rate": 2.0253513192751373e-06, | |
| "loss": 0.1302, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 4.218822438572091, | |
| "eval_loss": 0.16160732294835795, | |
| "eval_runtime": 59.9195, | |
| "eval_samples_per_second": 694.031, | |
| "eval_steps_per_second": 0.35, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 4.242002781641168, | |
| "grad_norm": 0.05124938115477562, | |
| "learning_rate": 1.807892072002898e-06, | |
| "loss": 0.1298, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 4.242002781641168, | |
| "eval_loss": 0.16257827555791163, | |
| "eval_runtime": 59.809, | |
| "eval_samples_per_second": 695.314, | |
| "eval_steps_per_second": 0.351, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 4.265183124710246, | |
| "grad_norm": 0.05366729572415352, | |
| "learning_rate": 1.6025649301821876e-06, | |
| "loss": 0.1294, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 4.265183124710246, | |
| "eval_loss": 0.1625148541687181, | |
| "eval_runtime": 60.1293, | |
| "eval_samples_per_second": 691.61, | |
| "eval_steps_per_second": 0.349, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 4.288363467779323, | |
| "grad_norm": 0.04244421049952507, | |
| "learning_rate": 1.4094215838229176e-06, | |
| "loss": 0.1308, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 4.288363467779323, | |
| "eval_loss": 0.16209612657051437, | |
| "eval_runtime": 59.9231, | |
| "eval_samples_per_second": 693.989, | |
| "eval_steps_per_second": 0.35, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 4.3115438108484, | |
| "grad_norm": 0.048628535121679306, | |
| "learning_rate": 1.2285106557296477e-06, | |
| "loss": 0.1302, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 4.3115438108484, | |
| "eval_loss": 0.16243464161006987, | |
| "eval_runtime": 59.5999, | |
| "eval_samples_per_second": 697.753, | |
| "eval_steps_per_second": 0.352, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 4.334724153917478, | |
| "grad_norm": 0.0497569814324379, | |
| "learning_rate": 1.0598776892610685e-06, | |
| "loss": 0.1311, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 4.334724153917478, | |
| "eval_loss": 0.16128818092832087, | |
| "eval_runtime": 59.8848, | |
| "eval_samples_per_second": 694.433, | |
| "eval_steps_per_second": 0.351, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 4.3579044969865555, | |
| "grad_norm": 0.07471216470003128, | |
| "learning_rate": 9.035651368646648e-07, | |
| "loss": 0.1304, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 4.3579044969865555, | |
| "eval_loss": 0.16288733155633187, | |
| "eval_runtime": 59.8227, | |
| "eval_samples_per_second": 695.154, | |
| "eval_steps_per_second": 0.351, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 4.381084840055633, | |
| "grad_norm": 0.058552809059619904, | |
| "learning_rate": 7.596123493895991e-07, | |
| "loss": 0.13, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 4.381084840055633, | |
| "eval_loss": 0.1634707775926499, | |
| "eval_runtime": 59.9789, | |
| "eval_samples_per_second": 693.344, | |
| "eval_steps_per_second": 0.35, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 4.404265183124711, | |
| "grad_norm": 0.05357597768306732, | |
| "learning_rate": 6.280555661802856e-07, | |
| "loss": 0.1295, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 4.404265183124711, | |
| "eval_loss": 0.1615680252075211, | |
| "eval_runtime": 60.1682, | |
| "eval_samples_per_second": 691.163, | |
| "eval_steps_per_second": 0.349, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 4.427445526193788, | |
| "grad_norm": 0.05787508189678192, | |
| "learning_rate": 5.089279059533658e-07, | |
| "loss": 0.1305, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 4.427445526193788, | |
| "eval_loss": 0.16174036094333355, | |
| "eval_runtime": 60.1289, | |
| "eval_samples_per_second": 691.615, | |
| "eval_steps_per_second": 0.349, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 4.450625869262865, | |
| "grad_norm": 0.049546804279088974, | |
| "learning_rate": 4.02259358460233e-07, | |
| "loss": 0.13, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 4.450625869262865, | |
| "eval_loss": 0.16296962879417173, | |
| "eval_runtime": 60.1209, | |
| "eval_samples_per_second": 691.706, | |
| "eval_steps_per_second": 0.349, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 4.473806212331943, | |
| "grad_norm": 0.05137551948428154, | |
| "learning_rate": 3.080767769372939e-07, | |
| "loss": 0.1297, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 4.473806212331943, | |
| "eval_loss": 0.16134209315513928, | |
| "eval_runtime": 60.0886, | |
| "eval_samples_per_second": 692.078, | |
| "eval_steps_per_second": 0.349, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 4.49698655540102, | |
| "grad_norm": 0.05584505572915077, | |
| "learning_rate": 2.2640387134577058e-07, | |
| "loss": 0.13, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 4.49698655540102, | |
| "eval_loss": 0.1621784231504334, | |
| "eval_runtime": 59.7716, | |
| "eval_samples_per_second": 695.749, | |
| "eval_steps_per_second": 0.351, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 4.520166898470097, | |
| "grad_norm": 0.0450916662812233, | |
| "learning_rate": 1.5726120240288634e-07, | |
| "loss": 0.1302, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 4.520166898470097, | |
| "eval_loss": 0.16172961751477263, | |
| "eval_runtime": 59.9065, | |
| "eval_samples_per_second": 694.182, | |
| "eval_steps_per_second": 0.351, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 4.543347241539175, | |
| "grad_norm": 0.0475350059568882, | |
| "learning_rate": 1.0066617640578368e-07, | |
| "loss": 0.1305, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 4.543347241539175, | |
| "eval_loss": 0.16216248300305847, | |
| "eval_runtime": 60.3498, | |
| "eval_samples_per_second": 689.083, | |
| "eval_steps_per_second": 0.348, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 4.566527584608252, | |
| "grad_norm": 0.057694341987371445, | |
| "learning_rate": 5.663304084960186e-08, | |
| "loss": 0.1299, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 4.566527584608252, | |
| "eval_loss": 0.16307967354038033, | |
| "eval_runtime": 59.9352, | |
| "eval_samples_per_second": 693.849, | |
| "eval_steps_per_second": 0.35, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 4.589707927677329, | |
| "grad_norm": 0.06310451030731201, | |
| "learning_rate": 2.5172880840745873e-08, | |
| "loss": 0.1299, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 4.589707927677329, | |
| "eval_loss": 0.16178384342894997, | |
| "eval_runtime": 60.0389, | |
| "eval_samples_per_second": 692.651, | |
| "eval_steps_per_second": 0.35, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 4.612888270746407, | |
| "grad_norm": 0.041533030569553375, | |
| "learning_rate": 6.293616306246586e-09, | |
| "loss": 0.1307, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 4.612888270746407, | |
| "eval_loss": 0.1629453700829326, | |
| "eval_runtime": 59.9874, | |
| "eval_samples_per_second": 693.246, | |
| "eval_steps_per_second": 0.35, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 4.6360686138154845, | |
| "grad_norm": 0.051685914397239685, | |
| "learning_rate": 0.0, | |
| "loss": 0.1293, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 4.6360686138154845, | |
| "eval_loss": 0.161578423628233, | |
| "eval_runtime": 60.1396, | |
| "eval_samples_per_second": 691.491, | |
| "eval_steps_per_second": 0.349, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 4.6360686138154845, | |
| "step": 10000, | |
| "total_flos": 1.2082504232914125e+17, | |
| "train_loss": 0.134784215593338, | |
| "train_runtime": 38606.1249, | |
| "train_samples_per_second": 530.486, | |
| "train_steps_per_second": 0.259 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 50, | |
| "total_flos": 1.2082504232914125e+17, | |
| "train_batch_size": 2048, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |