{ "best_metric": 0.16085075220051892, "best_model_checkpoint": "checkpoints/checkpoint-6750", "epoch": 4.6360686138154845, "eval_steps": 50, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023180343069077423, "grad_norm": 0.08667636662721634, "learning_rate": 5e-05, "loss": 0.6255, "step": 50 }, { "epoch": 0.023180343069077423, "eval_loss": 0.2028235954199523, "eval_runtime": 58.9481, "eval_samples_per_second": 705.468, "eval_steps_per_second": 0.356, "step": 50 }, { "epoch": 0.04636068613815485, "grad_norm": 0.0631365031003952, "learning_rate": 0.0001, "loss": 0.1532, "step": 100 }, { "epoch": 0.04636068613815485, "eval_loss": 0.1865187252730081, "eval_runtime": 59.7995, "eval_samples_per_second": 695.424, "eval_steps_per_second": 0.351, "step": 100 }, { "epoch": 0.06954102920723226, "grad_norm": 0.11565029621124268, "learning_rate": 9.999370638369377e-05, "loss": 0.1448, "step": 150 }, { "epoch": 0.06954102920723226, "eval_loss": 0.1811586473052961, "eval_runtime": 59.448, "eval_samples_per_second": 699.536, "eval_steps_per_second": 0.353, "step": 150 }, { "epoch": 0.0927213722763097, "grad_norm": 0.12287624180316925, "learning_rate": 9.997482711915927e-05, "loss": 0.1431, "step": 200 }, { "epoch": 0.0927213722763097, "eval_loss": 0.17856710216099736, "eval_runtime": 59.6811, "eval_samples_per_second": 696.803, "eval_steps_per_second": 0.352, "step": 200 }, { "epoch": 0.11590171534538711, "grad_norm": 0.1239687129855156, "learning_rate": 9.99433669591504e-05, "loss": 0.1423, "step": 250 }, { "epoch": 0.11590171534538711, "eval_loss": 0.17694150013393828, "eval_runtime": 59.3601, "eval_samples_per_second": 700.572, "eval_steps_per_second": 0.354, "step": 250 }, { "epoch": 0.13908205841446453, "grad_norm": 0.08375083655118942, "learning_rate": 9.989933382359422e-05, "loss": 0.1413, "step": 300 }, { "epoch": 0.13908205841446453, "eval_loss": 0.17530383900746715, "eval_runtime": 59.6335, "eval_samples_per_second": 697.359, "eval_steps_per_second": 0.352, "step": 300 }, { "epoch": 0.16226240148354196, "grad_norm": 0.07470008730888367, "learning_rate": 9.984273879759713e-05, "loss": 0.1391, "step": 350 }, { "epoch": 0.16226240148354196, "eval_loss": 0.17539924518994512, "eval_runtime": 59.2699, "eval_samples_per_second": 701.638, "eval_steps_per_second": 0.354, "step": 350 }, { "epoch": 0.1854427445526194, "grad_norm": 0.11057748645544052, "learning_rate": 9.977359612865423e-05, "loss": 0.1398, "step": 400 }, { "epoch": 0.1854427445526194, "eval_loss": 0.17613958094562268, "eval_runtime": 59.3258, "eval_samples_per_second": 700.977, "eval_steps_per_second": 0.354, "step": 400 }, { "epoch": 0.2086230876216968, "grad_norm": 0.10229019820690155, "learning_rate": 9.969192322306271e-05, "loss": 0.1398, "step": 450 }, { "epoch": 0.2086230876216968, "eval_loss": 0.17319489228196833, "eval_runtime": 59.4596, "eval_samples_per_second": 699.399, "eval_steps_per_second": 0.353, "step": 450 }, { "epoch": 0.23180343069077422, "grad_norm": 0.10784970223903656, "learning_rate": 9.959774064153977e-05, "loss": 0.1384, "step": 500 }, { "epoch": 0.23180343069077422, "eval_loss": 0.17334065558523068, "eval_runtime": 59.5777, "eval_samples_per_second": 698.012, "eval_steps_per_second": 0.352, "step": 500 }, { "epoch": 0.25498377375985165, "grad_norm": 0.08434706926345825, "learning_rate": 9.949107209404665e-05, "loss": 0.1386, "step": 550 }, { "epoch": 0.25498377375985165, "eval_loss": 0.17196178719739552, "eval_runtime": 59.6138, "eval_samples_per_second": 697.59, "eval_steps_per_second": 0.352, "step": 550 }, { "epoch": 0.27816411682892905, "grad_norm": 0.08924778550863266, "learning_rate": 9.937194443381972e-05, "loss": 0.1377, "step": 600 }, { "epoch": 0.27816411682892905, "eval_loss": 0.1740634102700707, "eval_runtime": 59.422, "eval_samples_per_second": 699.842, "eval_steps_per_second": 0.353, "step": 600 }, { "epoch": 0.3013444598980065, "grad_norm": 0.15332703292369843, "learning_rate": 9.924038765061042e-05, "loss": 0.1372, "step": 650 }, { "epoch": 0.3013444598980065, "eval_loss": 0.17391025863974857, "eval_runtime": 59.5007, "eval_samples_per_second": 698.917, "eval_steps_per_second": 0.353, "step": 650 }, { "epoch": 0.3245248029670839, "grad_norm": 0.08973913639783859, "learning_rate": 9.909643486313533e-05, "loss": 0.1374, "step": 700 }, { "epoch": 0.3245248029670839, "eval_loss": 0.17245501519579134, "eval_runtime": 59.2504, "eval_samples_per_second": 701.868, "eval_steps_per_second": 0.354, "step": 700 }, { "epoch": 0.3477051460361613, "grad_norm": 0.07252663373947144, "learning_rate": 9.894012231073894e-05, "loss": 0.1378, "step": 750 }, { "epoch": 0.3477051460361613, "eval_loss": 0.1731146153162719, "eval_runtime": 59.6278, "eval_samples_per_second": 697.426, "eval_steps_per_second": 0.352, "step": 750 }, { "epoch": 0.3708854891052388, "grad_norm": 0.09351957589387894, "learning_rate": 9.877148934427037e-05, "loss": 0.1371, "step": 800 }, { "epoch": 0.3708854891052388, "eval_loss": 0.17056697699015605, "eval_runtime": 59.4338, "eval_samples_per_second": 699.703, "eval_steps_per_second": 0.353, "step": 800 }, { "epoch": 0.3940658321743162, "grad_norm": 0.06937623023986816, "learning_rate": 9.859057841617709e-05, "loss": 0.1364, "step": 850 }, { "epoch": 0.3940658321743162, "eval_loss": 0.1730773180756858, "eval_runtime": 59.2237, "eval_samples_per_second": 702.185, "eval_steps_per_second": 0.355, "step": 850 }, { "epoch": 0.4172461752433936, "grad_norm": 0.1241346001625061, "learning_rate": 9.839743506981782e-05, "loss": 0.1382, "step": 900 }, { "epoch": 0.4172461752433936, "eval_loss": 0.17300324635270986, "eval_runtime": 59.1648, "eval_samples_per_second": 702.884, "eval_steps_per_second": 0.355, "step": 900 }, { "epoch": 0.44042651831247104, "grad_norm": 0.0649554654955864, "learning_rate": 9.819210792799712e-05, "loss": 0.1369, "step": 950 }, { "epoch": 0.44042651831247104, "eval_loss": 0.17298936761230632, "eval_runtime": 59.4593, "eval_samples_per_second": 699.402, "eval_steps_per_second": 0.353, "step": 950 }, { "epoch": 0.46360686138154844, "grad_norm": 0.07767663151025772, "learning_rate": 9.797464868072488e-05, "loss": 0.1373, "step": 1000 }, { "epoch": 0.46360686138154844, "eval_loss": 0.1722117168758624, "eval_runtime": 59.4433, "eval_samples_per_second": 699.592, "eval_steps_per_second": 0.353, "step": 1000 }, { "epoch": 0.48678720445062584, "grad_norm": 0.09637939929962158, "learning_rate": 9.77451120722037e-05, "loss": 0.1357, "step": 1050 }, { "epoch": 0.48678720445062584, "eval_loss": 0.17295359261954948, "eval_runtime": 59.0076, "eval_samples_per_second": 704.757, "eval_steps_per_second": 0.356, "step": 1050 }, { "epoch": 0.5099675475197033, "grad_norm": 0.0731373056769371, "learning_rate": 9.750355588704727e-05, "loss": 0.135, "step": 1100 }, { "epoch": 0.5099675475197033, "eval_loss": 0.1715334055521701, "eval_runtime": 59.0167, "eval_samples_per_second": 704.648, "eval_steps_per_second": 0.356, "step": 1100 }, { "epoch": 0.5331478905887808, "grad_norm": 0.1365990936756134, "learning_rate": 9.725004093573342e-05, "loss": 0.1357, "step": 1150 }, { "epoch": 0.5331478905887808, "eval_loss": 0.17017831764477356, "eval_runtime": 59.0779, "eval_samples_per_second": 703.918, "eval_steps_per_second": 0.355, "step": 1150 }, { "epoch": 0.5563282336578581, "grad_norm": 0.07747852057218552, "learning_rate": 9.698463103929542e-05, "loss": 0.1366, "step": 1200 }, { "epoch": 0.5563282336578581, "eval_loss": 0.17079754339969364, "eval_runtime": 59.1474, "eval_samples_per_second": 703.091, "eval_steps_per_second": 0.355, "step": 1200 }, { "epoch": 0.5795085767269356, "grad_norm": 0.08369060605764389, "learning_rate": 9.670739301325534e-05, "loss": 0.1352, "step": 1250 }, { "epoch": 0.5795085767269356, "eval_loss": 0.17218272966053694, "eval_runtime": 59.4772, "eval_samples_per_second": 699.192, "eval_steps_per_second": 0.353, "step": 1250 }, { "epoch": 0.602688919796013, "grad_norm": 0.15560708940029144, "learning_rate": 9.641839665080363e-05, "loss": 0.1366, "step": 1300 }, { "epoch": 0.602688919796013, "eval_loss": 0.1698094484306934, "eval_runtime": 59.4226, "eval_samples_per_second": 699.835, "eval_steps_per_second": 0.353, "step": 1300 }, { "epoch": 0.6258692628650904, "grad_norm": 0.1404338777065277, "learning_rate": 9.611771470522908e-05, "loss": 0.1353, "step": 1350 }, { "epoch": 0.6258692628650904, "eval_loss": 0.17023876656477224, "eval_runtime": 59.3422, "eval_samples_per_second": 700.783, "eval_steps_per_second": 0.354, "step": 1350 }, { "epoch": 0.6490496059341678, "grad_norm": 0.07887144386768341, "learning_rate": 9.580542287160348e-05, "loss": 0.1363, "step": 1400 }, { "epoch": 0.6490496059341678, "eval_loss": 0.1706377184753332, "eval_runtime": 59.2598, "eval_samples_per_second": 701.758, "eval_steps_per_second": 0.354, "step": 1400 }, { "epoch": 0.6722299490032453, "grad_norm": 0.09286168217658997, "learning_rate": 9.548159976772592e-05, "loss": 0.1362, "step": 1450 }, { "epoch": 0.6722299490032453, "eval_loss": 0.16891294843072946, "eval_runtime": 59.4024, "eval_samples_per_second": 700.073, "eval_steps_per_second": 0.354, "step": 1450 }, { "epoch": 0.6954102920723226, "grad_norm": 0.08167006820440292, "learning_rate": 9.514632691433107e-05, "loss": 0.1345, "step": 1500 }, { "epoch": 0.6954102920723226, "eval_loss": 0.16790113662592512, "eval_runtime": 60.0378, "eval_samples_per_second": 692.664, "eval_steps_per_second": 0.35, "step": 1500 }, { "epoch": 0.7185906351414001, "grad_norm": 0.09860191494226456, "learning_rate": 9.479968871456679e-05, "loss": 0.1355, "step": 1550 }, { "epoch": 0.7185906351414001, "eval_loss": 0.16903206921067584, "eval_runtime": 59.5789, "eval_samples_per_second": 697.999, "eval_steps_per_second": 0.352, "step": 1550 }, { "epoch": 0.7417709782104775, "grad_norm": 0.06466613709926605, "learning_rate": 9.444177243274618e-05, "loss": 0.135, "step": 1600 }, { "epoch": 0.7417709782104775, "eval_loss": 0.1680566343999807, "eval_runtime": 59.5911, "eval_samples_per_second": 697.856, "eval_steps_per_second": 0.352, "step": 1600 }, { "epoch": 0.7649513212795549, "grad_norm": 0.07864313572645187, "learning_rate": 9.407266817237911e-05, "loss": 0.1348, "step": 1650 }, { "epoch": 0.7649513212795549, "eval_loss": 0.16721375296765553, "eval_runtime": 59.4289, "eval_samples_per_second": 699.76, "eval_steps_per_second": 0.353, "step": 1650 }, { "epoch": 0.7881316643486324, "grad_norm": 0.09288563579320908, "learning_rate": 9.369246885348926e-05, "loss": 0.1343, "step": 1700 }, { "epoch": 0.7881316643486324, "eval_loss": 0.16728526898731283, "eval_runtime": 59.4209, "eval_samples_per_second": 699.855, "eval_steps_per_second": 0.353, "step": 1700 }, { "epoch": 0.8113120074177098, "grad_norm": 0.1111670434474945, "learning_rate": 9.330127018922194e-05, "loss": 0.1342, "step": 1750 }, { "epoch": 0.8113120074177098, "eval_loss": 0.1692570258991495, "eval_runtime": 59.3557, "eval_samples_per_second": 700.624, "eval_steps_per_second": 0.354, "step": 1750 }, { "epoch": 0.8344923504867872, "grad_norm": 0.06098225340247154, "learning_rate": 9.289917066174886e-05, "loss": 0.1334, "step": 1800 }, { "epoch": 0.8344923504867872, "eval_loss": 0.16652011733605857, "eval_runtime": 59.527, "eval_samples_per_second": 698.607, "eval_steps_per_second": 0.353, "step": 1800 }, { "epoch": 0.8576726935558646, "grad_norm": 0.11042412370443344, "learning_rate": 9.248627149747573e-05, "loss": 0.136, "step": 1850 }, { "epoch": 0.8576726935558646, "eval_loss": 0.16714222769914375, "eval_runtime": 59.3645, "eval_samples_per_second": 700.519, "eval_steps_per_second": 0.354, "step": 1850 }, { "epoch": 0.8808530366249421, "grad_norm": 0.09495564550161362, "learning_rate": 9.206267664155907e-05, "loss": 0.1349, "step": 1900 }, { "epoch": 0.8808530366249421, "eval_loss": 0.1690081783682819, "eval_runtime": 59.2609, "eval_samples_per_second": 701.744, "eval_steps_per_second": 0.354, "step": 1900 }, { "epoch": 0.9040333796940194, "grad_norm": 0.08535555005073547, "learning_rate": 9.162849273173857e-05, "loss": 0.1345, "step": 1950 }, { "epoch": 0.9040333796940194, "eval_loss": 0.16719838296653933, "eval_runtime": 59.4328, "eval_samples_per_second": 699.714, "eval_steps_per_second": 0.353, "step": 1950 }, { "epoch": 0.9272137227630969, "grad_norm": 0.08415450155735016, "learning_rate": 9.118382907149165e-05, "loss": 0.1332, "step": 2000 }, { "epoch": 0.9272137227630969, "eval_loss": 0.16692495198886095, "eval_runtime": 59.4174, "eval_samples_per_second": 699.895, "eval_steps_per_second": 0.353, "step": 2000 }, { "epoch": 0.9503940658321743, "grad_norm": 0.07792109996080399, "learning_rate": 9.072879760251679e-05, "loss": 0.1349, "step": 2050 }, { "epoch": 0.9503940658321743, "eval_loss": 0.16853327133732582, "eval_runtime": 59.3211, "eval_samples_per_second": 701.032, "eval_steps_per_second": 0.354, "step": 2050 }, { "epoch": 0.9735744089012517, "grad_norm": 0.09134557843208313, "learning_rate": 9.026351287655294e-05, "loss": 0.1355, "step": 2100 }, { "epoch": 0.9735744089012517, "eval_loss": 0.16782760284485718, "eval_runtime": 59.1119, "eval_samples_per_second": 703.513, "eval_steps_per_second": 0.355, "step": 2100 }, { "epoch": 0.9967547519703291, "grad_norm": 0.11134419590234756, "learning_rate": 8.978809202654162e-05, "loss": 0.134, "step": 2150 }, { "epoch": 0.9967547519703291, "eval_loss": 0.1670381695935501, "eval_runtime": 59.4602, "eval_samples_per_second": 699.393, "eval_steps_per_second": 0.353, "step": 2150 }, { "epoch": 1.0199350950394066, "grad_norm": 0.08943980187177658, "learning_rate": 8.930265473713938e-05, "loss": 0.1345, "step": 2200 }, { "epoch": 1.0199350950394066, "eval_loss": 0.16720885753257103, "eval_runtime": 59.8457, "eval_samples_per_second": 694.887, "eval_steps_per_second": 0.351, "step": 2200 }, { "epoch": 1.043115438108484, "grad_norm": 0.05172237753868103, "learning_rate": 8.880732321458784e-05, "loss": 0.1345, "step": 2250 }, { "epoch": 1.043115438108484, "eval_loss": 0.16808202774068384, "eval_runtime": 59.7591, "eval_samples_per_second": 695.894, "eval_steps_per_second": 0.351, "step": 2250 }, { "epoch": 1.0662957811775615, "grad_norm": 0.08457198739051819, "learning_rate": 8.83022221559489e-05, "loss": 0.1339, "step": 2300 }, { "epoch": 1.0662957811775615, "eval_loss": 0.16620651689588106, "eval_runtime": 59.8615, "eval_samples_per_second": 694.704, "eval_steps_per_second": 0.351, "step": 2300 }, { "epoch": 1.0894761242466389, "grad_norm": 0.08191724866628647, "learning_rate": 8.778747871771292e-05, "loss": 0.1333, "step": 2350 }, { "epoch": 1.0894761242466389, "eval_loss": 0.16742435976845876, "eval_runtime": 59.8699, "eval_samples_per_second": 694.606, "eval_steps_per_second": 0.351, "step": 2350 }, { "epoch": 1.1126564673157162, "grad_norm": 0.08220981061458588, "learning_rate": 8.726322248378775e-05, "loss": 0.1336, "step": 2400 }, { "epoch": 1.1126564673157162, "eval_loss": 0.16507172522149283, "eval_runtime": 59.8591, "eval_samples_per_second": 694.731, "eval_steps_per_second": 0.351, "step": 2400 }, { "epoch": 1.1358368103847938, "grad_norm": 0.11390708386898041, "learning_rate": 8.672958543287666e-05, "loss": 0.1335, "step": 2450 }, { "epoch": 1.1358368103847938, "eval_loss": 0.16567397155304947, "eval_runtime": 59.7629, "eval_samples_per_second": 695.85, "eval_steps_per_second": 0.351, "step": 2450 }, { "epoch": 1.1590171534538711, "grad_norm": 0.06390725821256638, "learning_rate": 8.618670190525352e-05, "loss": 0.1335, "step": 2500 }, { "epoch": 1.1590171534538711, "eval_loss": 0.1671167116531541, "eval_runtime": 59.5093, "eval_samples_per_second": 698.815, "eval_steps_per_second": 0.353, "step": 2500 }, { "epoch": 1.1821974965229485, "grad_norm": 0.06458276510238647, "learning_rate": 8.563470856894316e-05, "loss": 0.1322, "step": 2550 }, { "epoch": 1.1821974965229485, "eval_loss": 0.16552241982155386, "eval_runtime": 59.3646, "eval_samples_per_second": 700.519, "eval_steps_per_second": 0.354, "step": 2550 }, { "epoch": 1.205377839592026, "grad_norm": 0.07258091121912003, "learning_rate": 8.507374438531607e-05, "loss": 0.1333, "step": 2600 }, { "epoch": 1.205377839592026, "eval_loss": 0.16643385319936513, "eval_runtime": 59.7463, "eval_samples_per_second": 696.043, "eval_steps_per_second": 0.351, "step": 2600 }, { "epoch": 1.2285581826611034, "grad_norm": 0.08584043383598328, "learning_rate": 8.450395057410561e-05, "loss": 0.1325, "step": 2650 }, { "epoch": 1.2285581826611034, "eval_loss": 0.16595956749906993, "eval_runtime": 59.7537, "eval_samples_per_second": 695.957, "eval_steps_per_second": 0.351, "step": 2650 }, { "epoch": 1.2517385257301807, "grad_norm": 0.054344214498996735, "learning_rate": 8.392547057785661e-05, "loss": 0.1334, "step": 2700 }, { "epoch": 1.2517385257301807, "eval_loss": 0.165368604673745, "eval_runtime": 59.4339, "eval_samples_per_second": 699.701, "eval_steps_per_second": 0.353, "step": 2700 }, { "epoch": 1.2749188687992583, "grad_norm": 0.07332266122102737, "learning_rate": 8.333845002581458e-05, "loss": 0.1326, "step": 2750 }, { "epoch": 1.2749188687992583, "eval_loss": 0.16569167596925843, "eval_runtime": 59.5544, "eval_samples_per_second": 698.286, "eval_steps_per_second": 0.353, "step": 2750 }, { "epoch": 1.2980992118683357, "grad_norm": 0.07198917865753174, "learning_rate": 8.274303669726426e-05, "loss": 0.1323, "step": 2800 }, { "epoch": 1.2980992118683357, "eval_loss": 0.16580398198626048, "eval_runtime": 59.8451, "eval_samples_per_second": 694.894, "eval_steps_per_second": 0.351, "step": 2800 }, { "epoch": 1.321279554937413, "grad_norm": 0.09278077632188797, "learning_rate": 8.213938048432697e-05, "loss": 0.1324, "step": 2850 }, { "epoch": 1.321279554937413, "eval_loss": 0.16619885882978533, "eval_runtime": 59.6857, "eval_samples_per_second": 696.749, "eval_steps_per_second": 0.352, "step": 2850 }, { "epoch": 1.3444598980064906, "grad_norm": 0.04779389128088951, "learning_rate": 8.152763335422613e-05, "loss": 0.1327, "step": 2900 }, { "epoch": 1.3444598980064906, "eval_loss": 0.16639967239163891, "eval_runtime": 59.5347, "eval_samples_per_second": 698.517, "eval_steps_per_second": 0.353, "step": 2900 }, { "epoch": 1.367640241075568, "grad_norm": 0.0650218203663826, "learning_rate": 8.090794931103026e-05, "loss": 0.1324, "step": 2950 }, { "epoch": 1.367640241075568, "eval_loss": 0.16698249569806287, "eval_runtime": 59.4938, "eval_samples_per_second": 698.997, "eval_steps_per_second": 0.353, "step": 2950 }, { "epoch": 1.3908205841446453, "grad_norm": 0.07800327241420746, "learning_rate": 8.028048435688333e-05, "loss": 0.1325, "step": 3000 }, { "epoch": 1.3908205841446453, "eval_loss": 0.16588903849861533, "eval_runtime": 59.7308, "eval_samples_per_second": 696.223, "eval_steps_per_second": 0.352, "step": 3000 }, { "epoch": 1.4140009272137228, "grad_norm": 0.09477279335260391, "learning_rate": 7.964539645273204e-05, "loss": 0.1318, "step": 3050 }, { "epoch": 1.4140009272137228, "eval_loss": 0.16391722600570544, "eval_runtime": 59.2552, "eval_samples_per_second": 701.812, "eval_steps_per_second": 0.354, "step": 3050 }, { "epoch": 1.4371812702828002, "grad_norm": 0.061748892068862915, "learning_rate": 7.900284547855991e-05, "loss": 0.1328, "step": 3100 }, { "epoch": 1.4371812702828002, "eval_loss": 0.1664695654356475, "eval_runtime": 59.7882, "eval_samples_per_second": 695.556, "eval_steps_per_second": 0.351, "step": 3100 }, { "epoch": 1.4603616133518775, "grad_norm": 0.07277340441942215, "learning_rate": 7.835299319313853e-05, "loss": 0.1332, "step": 3150 }, { "epoch": 1.4603616133518775, "eval_loss": 0.16764915423728274, "eval_runtime": 59.7168, "eval_samples_per_second": 696.387, "eval_steps_per_second": 0.352, "step": 3150 }, { "epoch": 1.483541956420955, "grad_norm": 0.06525903195142746, "learning_rate": 7.769600319330552e-05, "loss": 0.1326, "step": 3200 }, { "epoch": 1.483541956420955, "eval_loss": 0.16491104357870506, "eval_runtime": 59.6126, "eval_samples_per_second": 697.604, "eval_steps_per_second": 0.352, "step": 3200 }, { "epoch": 1.5067222994900324, "grad_norm": 0.06889070570468903, "learning_rate": 7.703204087277988e-05, "loss": 0.1327, "step": 3250 }, { "epoch": 1.5067222994900324, "eval_loss": 0.16643899540149082, "eval_runtime": 60.0, "eval_samples_per_second": 693.1, "eval_steps_per_second": 0.35, "step": 3250 }, { "epoch": 1.5299026425591098, "grad_norm": 0.09515661001205444, "learning_rate": 7.636127338052512e-05, "loss": 0.1332, "step": 3300 }, { "epoch": 1.5299026425591098, "eval_loss": 0.16578109982118125, "eval_runtime": 60.2083, "eval_samples_per_second": 690.703, "eval_steps_per_second": 0.349, "step": 3300 }, { "epoch": 1.5530829856281874, "grad_norm": 0.06826016306877136, "learning_rate": 7.568386957867033e-05, "loss": 0.1321, "step": 3350 }, { "epoch": 1.5530829856281874, "eval_loss": 0.16615711001414799, "eval_runtime": 59.8961, "eval_samples_per_second": 694.303, "eval_steps_per_second": 0.351, "step": 3350 }, { "epoch": 1.5762633286972647, "grad_norm": 0.06259354203939438, "learning_rate": 7.500000000000001e-05, "loss": 0.1324, "step": 3400 }, { "epoch": 1.5762633286972647, "eval_loss": 0.16420639359901218, "eval_runtime": 59.8484, "eval_samples_per_second": 694.856, "eval_steps_per_second": 0.351, "step": 3400 }, { "epoch": 1.599443671766342, "grad_norm": 0.08373662084341049, "learning_rate": 7.430983680502344e-05, "loss": 0.1317, "step": 3450 }, { "epoch": 1.599443671766342, "eval_loss": 0.16580187809904914, "eval_runtime": 59.5295, "eval_samples_per_second": 698.578, "eval_steps_per_second": 0.353, "step": 3450 }, { "epoch": 1.6226240148354196, "grad_norm": 0.052068453282117844, "learning_rate": 7.361355373863414e-05, "loss": 0.1326, "step": 3500 }, { "epoch": 1.6226240148354196, "eval_loss": 0.16511726778477553, "eval_runtime": 59.3774, "eval_samples_per_second": 700.368, "eval_steps_per_second": 0.354, "step": 3500 }, { "epoch": 1.645804357904497, "grad_norm": 0.1084132120013237, "learning_rate": 7.291132608637052e-05, "loss": 0.1328, "step": 3550 }, { "epoch": 1.645804357904497, "eval_loss": 0.16512942482848092, "eval_runtime": 59.7073, "eval_samples_per_second": 696.497, "eval_steps_per_second": 0.352, "step": 3550 }, { "epoch": 1.6689847009735743, "grad_norm": 0.09590224921703339, "learning_rate": 7.220333063028872e-05, "loss": 0.1327, "step": 3600 }, { "epoch": 1.6689847009735743, "eval_loss": 0.1653536906511234, "eval_runtime": 59.8607, "eval_samples_per_second": 694.713, "eval_steps_per_second": 0.351, "step": 3600 }, { "epoch": 1.692165044042652, "grad_norm": 0.09215644001960754, "learning_rate": 7.148974560445859e-05, "loss": 0.1314, "step": 3650 }, { "epoch": 1.692165044042652, "eval_loss": 0.16392036224708054, "eval_runtime": 59.6823, "eval_samples_per_second": 696.79, "eval_steps_per_second": 0.352, "step": 3650 }, { "epoch": 1.7153453871117292, "grad_norm": 0.0847523957490921, "learning_rate": 7.077075065009433e-05, "loss": 0.1319, "step": 3700 }, { "epoch": 1.7153453871117292, "eval_loss": 0.1658360792512092, "eval_runtime": 59.6368, "eval_samples_per_second": 697.322, "eval_steps_per_second": 0.352, "step": 3700 }, { "epoch": 1.7385257301808066, "grad_norm": 0.06882014125585556, "learning_rate": 7.004652677033068e-05, "loss": 0.1308, "step": 3750 }, { "epoch": 1.7385257301808066, "eval_loss": 0.1656867715236748, "eval_runtime": 59.8626, "eval_samples_per_second": 694.691, "eval_steps_per_second": 0.351, "step": 3750 }, { "epoch": 1.7617060732498842, "grad_norm": 0.056948818266391754, "learning_rate": 6.931725628465643e-05, "loss": 0.1322, "step": 3800 }, { "epoch": 1.7617060732498842, "eval_loss": 0.16491998551370737, "eval_runtime": 59.5124, "eval_samples_per_second": 698.779, "eval_steps_per_second": 0.353, "step": 3800 }, { "epoch": 1.7848864163189615, "grad_norm": 0.04779543727636337, "learning_rate": 6.858312278301637e-05, "loss": 0.1315, "step": 3850 }, { "epoch": 1.7848864163189615, "eval_loss": 0.1649495124686108, "eval_runtime": 59.9775, "eval_samples_per_second": 693.36, "eval_steps_per_second": 0.35, "step": 3850 }, { "epoch": 1.8080667593880388, "grad_norm": 0.05969324707984924, "learning_rate": 6.784431107959359e-05, "loss": 0.1316, "step": 3900 }, { "epoch": 1.8080667593880388, "eval_loss": 0.16391757633340012, "eval_runtime": 60.0346, "eval_samples_per_second": 692.7, "eval_steps_per_second": 0.35, "step": 3900 }, { "epoch": 1.8312471024571164, "grad_norm": 0.061390358954668045, "learning_rate": 6.710100716628344e-05, "loss": 0.1312, "step": 3950 }, { "epoch": 1.8312471024571164, "eval_loss": 0.1658972028775054, "eval_runtime": 59.9663, "eval_samples_per_second": 693.489, "eval_steps_per_second": 0.35, "step": 3950 }, { "epoch": 1.8544274455261938, "grad_norm": 0.07332038879394531, "learning_rate": 6.635339816587109e-05, "loss": 0.1323, "step": 4000 }, { "epoch": 1.8544274455261938, "eval_loss": 0.1647820455194368, "eval_runtime": 59.5785, "eval_samples_per_second": 698.004, "eval_steps_per_second": 0.352, "step": 4000 }, { "epoch": 1.877607788595271, "grad_norm": 0.07641714811325073, "learning_rate": 6.560167228492436e-05, "loss": 0.132, "step": 4050 }, { "epoch": 1.877607788595271, "eval_loss": 0.16406535325266738, "eval_runtime": 60.0931, "eval_samples_per_second": 692.026, "eval_steps_per_second": 0.349, "step": 4050 }, { "epoch": 1.9007881316643487, "grad_norm": 0.08891258388757706, "learning_rate": 6.484601876641375e-05, "loss": 0.1308, "step": 4100 }, { "epoch": 1.9007881316643487, "eval_loss": 0.164731109091856, "eval_runtime": 59.8012, "eval_samples_per_second": 695.405, "eval_steps_per_second": 0.351, "step": 4100 }, { "epoch": 1.923968474733426, "grad_norm": 0.0818193256855011, "learning_rate": 6.408662784207149e-05, "loss": 0.1323, "step": 4150 }, { "epoch": 1.923968474733426, "eval_loss": 0.16444408652573528, "eval_runtime": 59.6523, "eval_samples_per_second": 697.14, "eval_steps_per_second": 0.352, "step": 4150 }, { "epoch": 1.9471488178025034, "grad_norm": 0.05766776204109192, "learning_rate": 6.332369068450174e-05, "loss": 0.131, "step": 4200 }, { "epoch": 1.9471488178025034, "eval_loss": 0.1630568549542592, "eval_runtime": 59.9782, "eval_samples_per_second": 693.352, "eval_steps_per_second": 0.35, "step": 4200 }, { "epoch": 1.970329160871581, "grad_norm": 0.07093872129917145, "learning_rate": 6.255739935905396e-05, "loss": 0.1313, "step": 4250 }, { "epoch": 1.970329160871581, "eval_loss": 0.16320942743206068, "eval_runtime": 59.7408, "eval_samples_per_second": 696.107, "eval_steps_per_second": 0.352, "step": 4250 }, { "epoch": 1.9935095039406583, "grad_norm": 0.051636241376399994, "learning_rate": 6.178794677547137e-05, "loss": 0.1309, "step": 4300 }, { "epoch": 1.9935095039406583, "eval_loss": 0.16439976264264172, "eval_runtime": 59.7092, "eval_samples_per_second": 696.476, "eval_steps_per_second": 0.352, "step": 4300 }, { "epoch": 2.0166898470097356, "grad_norm": 0.05819587782025337, "learning_rate": 6.1015526639327035e-05, "loss": 0.1319, "step": 4350 }, { "epoch": 2.0166898470097356, "eval_loss": 0.16432355870633325, "eval_runtime": 59.2592, "eval_samples_per_second": 701.765, "eval_steps_per_second": 0.354, "step": 4350 }, { "epoch": 2.039870190078813, "grad_norm": 0.07939411699771881, "learning_rate": 6.024033340325954e-05, "loss": 0.1316, "step": 4400 }, { "epoch": 2.039870190078813, "eval_loss": 0.1641168338494948, "eval_runtime": 59.9534, "eval_samples_per_second": 693.639, "eval_steps_per_second": 0.35, "step": 4400 }, { "epoch": 2.0630505331478908, "grad_norm": 0.07020165026187897, "learning_rate": 5.946256221802051e-05, "loss": 0.1312, "step": 4450 }, { "epoch": 2.0630505331478908, "eval_loss": 0.1633037564118911, "eval_runtime": 60.3433, "eval_samples_per_second": 689.157, "eval_steps_per_second": 0.348, "step": 4450 }, { "epoch": 2.086230876216968, "grad_norm": 0.07000721246004105, "learning_rate": 5.868240888334653e-05, "loss": 0.1313, "step": 4500 }, { "epoch": 2.086230876216968, "eval_loss": 0.1646367282392535, "eval_runtime": 60.5726, "eval_samples_per_second": 686.548, "eval_steps_per_second": 0.347, "step": 4500 }, { "epoch": 2.1094112192860455, "grad_norm": 0.06988826394081116, "learning_rate": 5.79000697986675e-05, "loss": 0.1316, "step": 4550 }, { "epoch": 2.1094112192860455, "eval_loss": 0.16286425765036744, "eval_runtime": 60.2061, "eval_samples_per_second": 690.727, "eval_steps_per_second": 0.349, "step": 4550 }, { "epoch": 2.132591562355123, "grad_norm": 0.0749220922589302, "learning_rate": 5.7115741913664264e-05, "loss": 0.1306, "step": 4600 }, { "epoch": 2.132591562355123, "eval_loss": 0.1643572569196068, "eval_runtime": 59.9586, "eval_samples_per_second": 693.579, "eval_steps_per_second": 0.35, "step": 4600 }, { "epoch": 2.1557719054242, "grad_norm": 0.06533892452716827, "learning_rate": 5.6329622678687463e-05, "loss": 0.1313, "step": 4650 }, { "epoch": 2.1557719054242, "eval_loss": 0.1635978048832001, "eval_runtime": 59.6271, "eval_samples_per_second": 697.435, "eval_steps_per_second": 0.352, "step": 4650 }, { "epoch": 2.1789522484932777, "grad_norm": 0.07881616055965424, "learning_rate": 5.5541909995050554e-05, "loss": 0.131, "step": 4700 }, { "epoch": 2.1789522484932777, "eval_loss": 0.1634715372028324, "eval_runtime": 59.564, "eval_samples_per_second": 698.173, "eval_steps_per_second": 0.353, "step": 4700 }, { "epoch": 2.2021325915623553, "grad_norm": 0.05812694877386093, "learning_rate": 5.475280216520913e-05, "loss": 0.1311, "step": 4750 }, { "epoch": 2.2021325915623553, "eval_loss": 0.1636915707335646, "eval_runtime": 59.9343, "eval_samples_per_second": 693.86, "eval_steps_per_second": 0.35, "step": 4750 }, { "epoch": 2.2253129346314324, "grad_norm": 0.09842361509799957, "learning_rate": 5.396249784283942e-05, "loss": 0.1315, "step": 4800 }, { "epoch": 2.2253129346314324, "eval_loss": 0.16410182317726912, "eval_runtime": 60.4431, "eval_samples_per_second": 688.019, "eval_steps_per_second": 0.347, "step": 4800 }, { "epoch": 2.24849327770051, "grad_norm": 0.05664157494902611, "learning_rate": 5.317119598282823e-05, "loss": 0.1314, "step": 4850 }, { "epoch": 2.24849327770051, "eval_loss": 0.16405877684845893, "eval_runtime": 60.2757, "eval_samples_per_second": 689.93, "eval_steps_per_second": 0.348, "step": 4850 }, { "epoch": 2.2716736207695876, "grad_norm": 0.08323252946138382, "learning_rate": 5.2379095791187124e-05, "loss": 0.1306, "step": 4900 }, { "epoch": 2.2716736207695876, "eval_loss": 0.16356865120524391, "eval_runtime": 60.2036, "eval_samples_per_second": 690.756, "eval_steps_per_second": 0.349, "step": 4900 }, { "epoch": 2.2948539638386647, "grad_norm": 0.07163384556770325, "learning_rate": 5.158639667490339e-05, "loss": 0.1314, "step": 4950 }, { "epoch": 2.2948539638386647, "eval_loss": 0.16350787082313517, "eval_runtime": 59.6657, "eval_samples_per_second": 696.983, "eval_steps_per_second": 0.352, "step": 4950 }, { "epoch": 2.3180343069077423, "grad_norm": 0.07729226350784302, "learning_rate": 5.0793298191740404e-05, "loss": 0.1321, "step": 5000 }, { "epoch": 2.3180343069077423, "eval_loss": 0.16284041257465698, "eval_runtime": 60.3671, "eval_samples_per_second": 688.886, "eval_steps_per_second": 0.348, "step": 5000 }, { "epoch": 2.34121464997682, "grad_norm": 0.07920071482658386, "learning_rate": 5e-05, "loss": 0.13, "step": 5050 }, { "epoch": 2.34121464997682, "eval_loss": 0.16350252303966548, "eval_runtime": 60.0663, "eval_samples_per_second": 692.335, "eval_steps_per_second": 0.35, "step": 5050 }, { "epoch": 2.364394993045897, "grad_norm": 0.05213838815689087, "learning_rate": 4.92067018082596e-05, "loss": 0.1315, "step": 5100 }, { "epoch": 2.364394993045897, "eval_loss": 0.1640868928554377, "eval_runtime": 60.1323, "eval_samples_per_second": 691.575, "eval_steps_per_second": 0.349, "step": 5100 }, { "epoch": 2.3875753361149745, "grad_norm": 0.06551820039749146, "learning_rate": 4.841360332509663e-05, "loss": 0.1311, "step": 5150 }, { "epoch": 2.3875753361149745, "eval_loss": 0.16375304166425866, "eval_runtime": 60.0889, "eval_samples_per_second": 692.074, "eval_steps_per_second": 0.349, "step": 5150 }, { "epoch": 2.410755679184052, "grad_norm": 0.06602519750595093, "learning_rate": 4.762090420881289e-05, "loss": 0.1304, "step": 5200 }, { "epoch": 2.410755679184052, "eval_loss": 0.1646718036775546, "eval_runtime": 60.1839, "eval_samples_per_second": 690.982, "eval_steps_per_second": 0.349, "step": 5200 }, { "epoch": 2.433936022253129, "grad_norm": 0.050050172954797745, "learning_rate": 4.6828804017171776e-05, "loss": 0.131, "step": 5250 }, { "epoch": 2.433936022253129, "eval_loss": 0.16238808458996815, "eval_runtime": 60.2346, "eval_samples_per_second": 690.401, "eval_steps_per_second": 0.349, "step": 5250 }, { "epoch": 2.457116365322207, "grad_norm": 0.06192226707935333, "learning_rate": 4.603750215716057e-05, "loss": 0.131, "step": 5300 }, { "epoch": 2.457116365322207, "eval_loss": 0.1633245686996306, "eval_runtime": 59.5691, "eval_samples_per_second": 698.114, "eval_steps_per_second": 0.353, "step": 5300 }, { "epoch": 2.4802967083912844, "grad_norm": 0.07729701697826385, "learning_rate": 4.5247197834790876e-05, "loss": 0.1308, "step": 5350 }, { "epoch": 2.4802967083912844, "eval_loss": 0.16388068444979556, "eval_runtime": 60.3853, "eval_samples_per_second": 688.677, "eval_steps_per_second": 0.348, "step": 5350 }, { "epoch": 2.5034770514603615, "grad_norm": 0.07346878945827484, "learning_rate": 4.445809000494946e-05, "loss": 0.1314, "step": 5400 }, { "epoch": 2.5034770514603615, "eval_loss": 0.16427215786452162, "eval_runtime": 60.0462, "eval_samples_per_second": 692.567, "eval_steps_per_second": 0.35, "step": 5400 }, { "epoch": 2.526657394529439, "grad_norm": 0.08765513449907303, "learning_rate": 4.3670377321312535e-05, "loss": 0.1307, "step": 5450 }, { "epoch": 2.526657394529439, "eval_loss": 0.16308954695612046, "eval_runtime": 59.7344, "eval_samples_per_second": 696.181, "eval_steps_per_second": 0.352, "step": 5450 }, { "epoch": 2.5498377375985166, "grad_norm": 0.04856225475668907, "learning_rate": 4.288425808633575e-05, "loss": 0.1314, "step": 5500 }, { "epoch": 2.5498377375985166, "eval_loss": 0.1634677289958684, "eval_runtime": 60.5651, "eval_samples_per_second": 686.633, "eval_steps_per_second": 0.347, "step": 5500 }, { "epoch": 2.5730180806675937, "grad_norm": 0.07033301144838333, "learning_rate": 4.20999302013325e-05, "loss": 0.1303, "step": 5550 }, { "epoch": 2.5730180806675937, "eval_loss": 0.16350203952668135, "eval_runtime": 59.7363, "eval_samples_per_second": 696.16, "eval_steps_per_second": 0.352, "step": 5550 }, { "epoch": 2.5961984237366713, "grad_norm": 0.07352133840322495, "learning_rate": 4.131759111665349e-05, "loss": 0.1304, "step": 5600 }, { "epoch": 2.5961984237366713, "eval_loss": 0.16306162076252775, "eval_runtime": 60.0517, "eval_samples_per_second": 692.503, "eval_steps_per_second": 0.35, "step": 5600 }, { "epoch": 2.619378766805749, "grad_norm": 0.05432264879345894, "learning_rate": 4.0537437781979506e-05, "loss": 0.1298, "step": 5650 }, { "epoch": 2.619378766805749, "eval_loss": 0.16234816348528708, "eval_runtime": 60.3645, "eval_samples_per_second": 688.915, "eval_steps_per_second": 0.348, "step": 5650 }, { "epoch": 2.642559109874826, "grad_norm": 0.04657018184661865, "learning_rate": 3.9759666596740476e-05, "loss": 0.1305, "step": 5700 }, { "epoch": 2.642559109874826, "eval_loss": 0.16270628350418626, "eval_runtime": 60.0809, "eval_samples_per_second": 692.167, "eval_steps_per_second": 0.35, "step": 5700 }, { "epoch": 2.6657394529439036, "grad_norm": 0.04448065161705017, "learning_rate": 3.898447336067297e-05, "loss": 0.1308, "step": 5750 }, { "epoch": 2.6657394529439036, "eval_loss": 0.162430409584318, "eval_runtime": 59.8634, "eval_samples_per_second": 694.682, "eval_steps_per_second": 0.351, "step": 5750 }, { "epoch": 2.688919796012981, "grad_norm": 0.047300901263952255, "learning_rate": 3.821205322452863e-05, "loss": 0.1306, "step": 5800 }, { "epoch": 2.688919796012981, "eval_loss": 0.163914834923588, "eval_runtime": 59.9699, "eval_samples_per_second": 693.447, "eval_steps_per_second": 0.35, "step": 5800 }, { "epoch": 2.7121001390820583, "grad_norm": 0.09371935576200485, "learning_rate": 3.744260064094604e-05, "loss": 0.1303, "step": 5850 }, { "epoch": 2.7121001390820583, "eval_loss": 0.16325797910827158, "eval_runtime": 60.1596, "eval_samples_per_second": 691.261, "eval_steps_per_second": 0.349, "step": 5850 }, { "epoch": 2.735280482151136, "grad_norm": 0.0451604500412941, "learning_rate": 3.6676309315498256e-05, "loss": 0.131, "step": 5900 }, { "epoch": 2.735280482151136, "eval_loss": 0.16252548129222377, "eval_runtime": 60.0104, "eval_samples_per_second": 692.98, "eval_steps_per_second": 0.35, "step": 5900 }, { "epoch": 2.7584608252202134, "grad_norm": 0.058029964566230774, "learning_rate": 3.591337215792852e-05, "loss": 0.1305, "step": 5950 }, { "epoch": 2.7584608252202134, "eval_loss": 0.16366348885138793, "eval_runtime": 60.372, "eval_samples_per_second": 688.83, "eval_steps_per_second": 0.348, "step": 5950 }, { "epoch": 2.7816411682892905, "grad_norm": 0.09429273754358292, "learning_rate": 3.515398123358627e-05, "loss": 0.1307, "step": 6000 }, { "epoch": 2.7816411682892905, "eval_loss": 0.1623218584160889, "eval_runtime": 59.5435, "eval_samples_per_second": 698.413, "eval_steps_per_second": 0.353, "step": 6000 }, { "epoch": 2.804821511358368, "grad_norm": 0.05752315744757652, "learning_rate": 3.439832771507565e-05, "loss": 0.1296, "step": 6050 }, { "epoch": 2.804821511358368, "eval_loss": 0.16326439732289802, "eval_runtime": 59.8306, "eval_samples_per_second": 695.063, "eval_steps_per_second": 0.351, "step": 6050 }, { "epoch": 2.8280018544274457, "grad_norm": 0.07225628942251205, "learning_rate": 3.364660183412892e-05, "loss": 0.1312, "step": 6100 }, { "epoch": 2.8280018544274457, "eval_loss": 0.16322279137718054, "eval_runtime": 59.8418, "eval_samples_per_second": 694.932, "eval_steps_per_second": 0.351, "step": 6100 }, { "epoch": 2.851182197496523, "grad_norm": 0.06712605059146881, "learning_rate": 3.289899283371657e-05, "loss": 0.1305, "step": 6150 }, { "epoch": 2.851182197496523, "eval_loss": 0.16403909400299824, "eval_runtime": 59.4766, "eval_samples_per_second": 699.199, "eval_steps_per_second": 0.353, "step": 6150 }, { "epoch": 2.8743625405656004, "grad_norm": 0.0743350014090538, "learning_rate": 3.215568892040641e-05, "loss": 0.1303, "step": 6200 }, { "epoch": 2.8743625405656004, "eval_loss": 0.16315215653435175, "eval_runtime": 60.0127, "eval_samples_per_second": 692.953, "eval_steps_per_second": 0.35, "step": 6200 }, { "epoch": 2.897542883634678, "grad_norm": 0.07467668503522873, "learning_rate": 3.141687721698363e-05, "loss": 0.1302, "step": 6250 }, { "epoch": 2.897542883634678, "eval_loss": 0.16213396084813272, "eval_runtime": 59.8922, "eval_samples_per_second": 694.348, "eval_steps_per_second": 0.351, "step": 6250 }, { "epoch": 2.920723226703755, "grad_norm": 0.050527870655059814, "learning_rate": 3.0682743715343564e-05, "loss": 0.1298, "step": 6300 }, { "epoch": 2.920723226703755, "eval_loss": 0.16243251733829123, "eval_runtime": 60.3601, "eval_samples_per_second": 688.965, "eval_steps_per_second": 0.348, "step": 6300 }, { "epoch": 2.9439035697728326, "grad_norm": 0.05331522971391678, "learning_rate": 2.9953473229669328e-05, "loss": 0.1313, "step": 6350 }, { "epoch": 2.9439035697728326, "eval_loss": 0.16321332234047015, "eval_runtime": 60.2034, "eval_samples_per_second": 690.759, "eval_steps_per_second": 0.349, "step": 6350 }, { "epoch": 2.96708391284191, "grad_norm": 0.0566866509616375, "learning_rate": 2.9229249349905684e-05, "loss": 0.1304, "step": 6400 }, { "epoch": 2.96708391284191, "eval_loss": 0.1623781732971581, "eval_runtime": 60.2575, "eval_samples_per_second": 690.138, "eval_steps_per_second": 0.349, "step": 6400 }, { "epoch": 2.9902642559109873, "grad_norm": 0.0674847662448883, "learning_rate": 2.851025439554142e-05, "loss": 0.13, "step": 6450 }, { "epoch": 2.9902642559109873, "eval_loss": 0.163704374422533, "eval_runtime": 60.1942, "eval_samples_per_second": 690.864, "eval_steps_per_second": 0.349, "step": 6450 }, { "epoch": 3.013444598980065, "grad_norm": 0.05663591995835304, "learning_rate": 2.7796669369711294e-05, "loss": 0.1313, "step": 6500 }, { "epoch": 3.013444598980065, "eval_loss": 0.16296213660440473, "eval_runtime": 60.9459, "eval_samples_per_second": 682.343, "eval_steps_per_second": 0.345, "step": 6500 }, { "epoch": 3.0366249420491425, "grad_norm": 0.06456530839204788, "learning_rate": 2.708867391362948e-05, "loss": 0.131, "step": 6550 }, { "epoch": 3.0366249420491425, "eval_loss": 0.16119627636966075, "eval_runtime": 60.6451, "eval_samples_per_second": 685.727, "eval_steps_per_second": 0.346, "step": 6550 }, { "epoch": 3.0598052851182196, "grad_norm": 0.05969541519880295, "learning_rate": 2.638644626136587e-05, "loss": 0.1311, "step": 6600 }, { "epoch": 3.0598052851182196, "eval_loss": 0.16205494320222197, "eval_runtime": 60.4532, "eval_samples_per_second": 687.904, "eval_steps_per_second": 0.347, "step": 6600 }, { "epoch": 3.082985628187297, "grad_norm": 0.06604834645986557, "learning_rate": 2.5690163194976575e-05, "loss": 0.1301, "step": 6650 }, { "epoch": 3.082985628187297, "eval_loss": 0.16191228875556468, "eval_runtime": 60.3489, "eval_samples_per_second": 689.093, "eval_steps_per_second": 0.348, "step": 6650 }, { "epoch": 3.1061659712563747, "grad_norm": 0.06501331180334091, "learning_rate": 2.500000000000001e-05, "loss": 0.1298, "step": 6700 }, { "epoch": 3.1061659712563747, "eval_loss": 0.16219026561577268, "eval_runtime": 60.2703, "eval_samples_per_second": 689.992, "eval_steps_per_second": 0.348, "step": 6700 }, { "epoch": 3.129346314325452, "grad_norm": 0.056004952639341354, "learning_rate": 2.4316130421329697e-05, "loss": 0.1302, "step": 6750 }, { "epoch": 3.129346314325452, "eval_loss": 0.16085075220051892, "eval_runtime": 60.336, "eval_samples_per_second": 689.24, "eval_steps_per_second": 0.348, "step": 6750 }, { "epoch": 3.1525266573945294, "grad_norm": 0.06331496685743332, "learning_rate": 2.363872661947488e-05, "loss": 0.1311, "step": 6800 }, { "epoch": 3.1525266573945294, "eval_loss": 0.16229801712553438, "eval_runtime": 59.8727, "eval_samples_per_second": 694.573, "eval_steps_per_second": 0.351, "step": 6800 }, { "epoch": 3.175707000463607, "grad_norm": 0.05851437896490097, "learning_rate": 2.296795912722014e-05, "loss": 0.1304, "step": 6850 }, { "epoch": 3.175707000463607, "eval_loss": 0.1624837018550472, "eval_runtime": 60.0768, "eval_samples_per_second": 692.214, "eval_steps_per_second": 0.35, "step": 6850 }, { "epoch": 3.198887343532684, "grad_norm": 0.06251411885023117, "learning_rate": 2.2303996806694488e-05, "loss": 0.1306, "step": 6900 }, { "epoch": 3.198887343532684, "eval_loss": 0.16152431005864756, "eval_runtime": 60.0137, "eval_samples_per_second": 692.942, "eval_steps_per_second": 0.35, "step": 6900 }, { "epoch": 3.2220676866017617, "grad_norm": 0.055478889495134354, "learning_rate": 2.164700680686147e-05, "loss": 0.1302, "step": 6950 }, { "epoch": 3.2220676866017617, "eval_loss": 0.16217289975188992, "eval_runtime": 59.7201, "eval_samples_per_second": 696.349, "eval_steps_per_second": 0.352, "step": 6950 }, { "epoch": 3.2452480296708393, "grad_norm": 0.04695391282439232, "learning_rate": 2.09971545214401e-05, "loss": 0.1307, "step": 7000 }, { "epoch": 3.2452480296708393, "eval_loss": 0.16233282789861117, "eval_runtime": 60.2382, "eval_samples_per_second": 690.359, "eval_steps_per_second": 0.349, "step": 7000 }, { "epoch": 3.2684283727399164, "grad_norm": 0.05719252675771713, "learning_rate": 2.0354603547267985e-05, "loss": 0.1302, "step": 7050 }, { "epoch": 3.2684283727399164, "eval_loss": 0.16257561894818798, "eval_runtime": 59.9661, "eval_samples_per_second": 693.491, "eval_steps_per_second": 0.35, "step": 7050 }, { "epoch": 3.291608715808994, "grad_norm": 0.05995924398303032, "learning_rate": 1.9719515643116674e-05, "loss": 0.1296, "step": 7100 }, { "epoch": 3.291608715808994, "eval_loss": 0.1621910867534911, "eval_runtime": 59.9872, "eval_samples_per_second": 693.248, "eval_steps_per_second": 0.35, "step": 7100 }, { "epoch": 3.3147890588780715, "grad_norm": 0.06421925872564316, "learning_rate": 1.9092050688969738e-05, "loss": 0.1321, "step": 7150 }, { "epoch": 3.3147890588780715, "eval_loss": 0.16221412998892937, "eval_runtime": 59.9186, "eval_samples_per_second": 694.042, "eval_steps_per_second": 0.35, "step": 7150 }, { "epoch": 3.3379694019471486, "grad_norm": 0.04900297895073891, "learning_rate": 1.847236664577389e-05, "loss": 0.1307, "step": 7200 }, { "epoch": 3.3379694019471486, "eval_loss": 0.16276321033314364, "eval_runtime": 59.8713, "eval_samples_per_second": 694.59, "eval_steps_per_second": 0.351, "step": 7200 }, { "epoch": 3.361149745016226, "grad_norm": 0.06865038722753525, "learning_rate": 1.7860619515673033e-05, "loss": 0.1301, "step": 7250 }, { "epoch": 3.361149745016226, "eval_loss": 0.16246198975876286, "eval_runtime": 60.1366, "eval_samples_per_second": 691.525, "eval_steps_per_second": 0.349, "step": 7250 }, { "epoch": 3.384330088085304, "grad_norm": 0.060604266822338104, "learning_rate": 1.725696330273575e-05, "loss": 0.1307, "step": 7300 }, { "epoch": 3.384330088085304, "eval_loss": 0.16321014850841353, "eval_runtime": 60.2328, "eval_samples_per_second": 690.421, "eval_steps_per_second": 0.349, "step": 7300 }, { "epoch": 3.407510431154381, "grad_norm": 0.061620261520147324, "learning_rate": 1.6661549974185424e-05, "loss": 0.1305, "step": 7350 }, { "epoch": 3.407510431154381, "eval_loss": 0.1627398211288196, "eval_runtime": 60.2249, "eval_samples_per_second": 690.512, "eval_steps_per_second": 0.349, "step": 7350 }, { "epoch": 3.4306907742234585, "grad_norm": 0.046630218625068665, "learning_rate": 1.60745294221434e-05, "loss": 0.1303, "step": 7400 }, { "epoch": 3.4306907742234585, "eval_loss": 0.16263843892878527, "eval_runtime": 59.9369, "eval_samples_per_second": 693.829, "eval_steps_per_second": 0.35, "step": 7400 }, { "epoch": 3.453871117292536, "grad_norm": 0.06071937829256058, "learning_rate": 1.549604942589441e-05, "loss": 0.13, "step": 7450 }, { "epoch": 3.453871117292536, "eval_loss": 0.1624999877883929, "eval_runtime": 59.843, "eval_samples_per_second": 694.919, "eval_steps_per_second": 0.351, "step": 7450 }, { "epoch": 3.477051460361613, "grad_norm": 0.0633426085114479, "learning_rate": 1.4926255614683932e-05, "loss": 0.1288, "step": 7500 }, { "epoch": 3.477051460361613, "eval_loss": 0.1632884555568049, "eval_runtime": 59.8153, "eval_samples_per_second": 695.24, "eval_steps_per_second": 0.351, "step": 7500 }, { "epoch": 3.5002318034306907, "grad_norm": 0.06753742694854736, "learning_rate": 1.4365291431056871e-05, "loss": 0.1301, "step": 7550 }, { "epoch": 3.5002318034306907, "eval_loss": 0.16175284084180716, "eval_runtime": 59.9226, "eval_samples_per_second": 693.995, "eval_steps_per_second": 0.35, "step": 7550 }, { "epoch": 3.5234121464997683, "grad_norm": 0.05140328034758568, "learning_rate": 1.3813298094746491e-05, "loss": 0.1304, "step": 7600 }, { "epoch": 3.5234121464997683, "eval_loss": 0.16199897513596326, "eval_runtime": 59.9917, "eval_samples_per_second": 693.196, "eval_steps_per_second": 0.35, "step": 7600 }, { "epoch": 3.5465924895688454, "grad_norm": 0.054956089705228806, "learning_rate": 1.327041456712334e-05, "loss": 0.1303, "step": 7650 }, { "epoch": 3.5465924895688454, "eval_loss": 0.16214041701821919, "eval_runtime": 59.9306, "eval_samples_per_second": 693.903, "eval_steps_per_second": 0.35, "step": 7650 }, { "epoch": 3.569772832637923, "grad_norm": 0.059684716165065765, "learning_rate": 1.2736777516212266e-05, "loss": 0.1308, "step": 7700 }, { "epoch": 3.569772832637923, "eval_loss": 0.16299972612079205, "eval_runtime": 59.9509, "eval_samples_per_second": 693.668, "eval_steps_per_second": 0.35, "step": 7700 }, { "epoch": 3.5929531757070006, "grad_norm": 0.059858404099941254, "learning_rate": 1.2212521282287092e-05, "loss": 0.1297, "step": 7750 }, { "epoch": 3.5929531757070006, "eval_loss": 0.1621026389214657, "eval_runtime": 60.4063, "eval_samples_per_second": 688.438, "eval_steps_per_second": 0.348, "step": 7750 }, { "epoch": 3.6161335187760777, "grad_norm": 0.07229738682508469, "learning_rate": 1.1697777844051105e-05, "loss": 0.13, "step": 7800 }, { "epoch": 3.6161335187760777, "eval_loss": 0.16179662531772324, "eval_runtime": 60.0141, "eval_samples_per_second": 692.937, "eval_steps_per_second": 0.35, "step": 7800 }, { "epoch": 3.6393138618451553, "grad_norm": 0.058062318712472916, "learning_rate": 1.1192676785412154e-05, "loss": 0.1305, "step": 7850 }, { "epoch": 3.6393138618451553, "eval_loss": 0.16283961568372932, "eval_runtime": 59.7657, "eval_samples_per_second": 695.817, "eval_steps_per_second": 0.351, "step": 7850 }, { "epoch": 3.662494204914233, "grad_norm": 0.053812187165021896, "learning_rate": 1.0697345262860636e-05, "loss": 0.1314, "step": 7900 }, { "epoch": 3.662494204914233, "eval_loss": 0.16244345922930356, "eval_runtime": 60.3156, "eval_samples_per_second": 689.474, "eval_steps_per_second": 0.348, "step": 7900 }, { "epoch": 3.68567454798331, "grad_norm": 0.05528152361512184, "learning_rate": 1.021190797345839e-05, "loss": 0.1299, "step": 7950 }, { "epoch": 3.68567454798331, "eval_loss": 0.1616077000723995, "eval_runtime": 60.1023, "eval_samples_per_second": 691.92, "eval_steps_per_second": 0.349, "step": 7950 }, { "epoch": 3.7088548910523875, "grad_norm": 0.04686369001865387, "learning_rate": 9.73648712344707e-06, "loss": 0.1294, "step": 8000 }, { "epoch": 3.7088548910523875, "eval_loss": 0.16104961568942824, "eval_runtime": 60.1871, "eval_samples_per_second": 690.946, "eval_steps_per_second": 0.349, "step": 8000 }, { "epoch": 3.732035234121465, "grad_norm": 0.04791761189699173, "learning_rate": 9.271202397483215e-06, "loss": 0.1293, "step": 8050 }, { "epoch": 3.732035234121465, "eval_loss": 0.16180993744676672, "eval_runtime": 60.3514, "eval_samples_per_second": 689.064, "eval_steps_per_second": 0.348, "step": 8050 }, { "epoch": 3.755215577190542, "grad_norm": 0.0580659918487072, "learning_rate": 8.816170928508365e-06, "loss": 0.1303, "step": 8100 }, { "epoch": 3.755215577190542, "eval_loss": 0.16161086084498935, "eval_runtime": 60.0527, "eval_samples_per_second": 692.491, "eval_steps_per_second": 0.35, "step": 8100 }, { "epoch": 3.77839592025962, "grad_norm": 0.0652560144662857, "learning_rate": 8.371507268261437e-06, "loss": 0.1318, "step": 8150 }, { "epoch": 3.77839592025962, "eval_loss": 0.16206722540467366, "eval_runtime": 60.2482, "eval_samples_per_second": 690.244, "eval_steps_per_second": 0.349, "step": 8150 }, { "epoch": 3.8015762633286974, "grad_norm": 0.07411529868841171, "learning_rate": 7.937323358440935e-06, "loss": 0.1295, "step": 8200 }, { "epoch": 3.8015762633286974, "eval_loss": 0.1613134364148254, "eval_runtime": 60.1488, "eval_samples_per_second": 691.385, "eval_steps_per_second": 0.349, "step": 8200 }, { "epoch": 3.8247566063977745, "grad_norm": 0.05504234880208969, "learning_rate": 7.513728502524286e-06, "loss": 0.1309, "step": 8250 }, { "epoch": 3.8247566063977745, "eval_loss": 0.16200784640385216, "eval_runtime": 60.4444, "eval_samples_per_second": 688.004, "eval_steps_per_second": 0.347, "step": 8250 }, { "epoch": 3.847936949466852, "grad_norm": 0.053017448633909225, "learning_rate": 7.100829338251147e-06, "loss": 0.1288, "step": 8300 }, { "epoch": 3.847936949466852, "eval_loss": 0.1614959925734419, "eval_runtime": 60.1621, "eval_samples_per_second": 691.232, "eval_steps_per_second": 0.349, "step": 8300 }, { "epoch": 3.8711172925359296, "grad_norm": 0.055434294044971466, "learning_rate": 6.698729810778065e-06, "loss": 0.1296, "step": 8350 }, { "epoch": 3.8711172925359296, "eval_loss": 0.16227277423563163, "eval_runtime": 60.4168, "eval_samples_per_second": 688.318, "eval_steps_per_second": 0.348, "step": 8350 }, { "epoch": 3.8942976356050067, "grad_norm": 0.06720498204231262, "learning_rate": 6.3075311465107535e-06, "loss": 0.1302, "step": 8400 }, { "epoch": 3.8942976356050067, "eval_loss": 0.16212167182684745, "eval_runtime": 60.4209, "eval_samples_per_second": 688.271, "eval_steps_per_second": 0.348, "step": 8400 }, { "epoch": 3.9174779786740843, "grad_norm": 0.061678655445575714, "learning_rate": 5.927331827620903e-06, "loss": 0.1303, "step": 8450 }, { "epoch": 3.9174779786740843, "eval_loss": 0.16245438240537732, "eval_runtime": 60.3802, "eval_samples_per_second": 688.735, "eval_steps_per_second": 0.348, "step": 8450 }, { "epoch": 3.940658321743162, "grad_norm": 0.05170401930809021, "learning_rate": 5.558227567253832e-06, "loss": 0.1296, "step": 8500 }, { "epoch": 3.940658321743162, "eval_loss": 0.16238415050768779, "eval_runtime": 59.8171, "eval_samples_per_second": 695.22, "eval_steps_per_second": 0.351, "step": 8500 }, { "epoch": 3.963838664812239, "grad_norm": 0.047940943390131, "learning_rate": 5.200311285433213e-06, "loss": 0.1302, "step": 8550 }, { "epoch": 3.963838664812239, "eval_loss": 0.1614615212760627, "eval_runtime": 60.3377, "eval_samples_per_second": 689.221, "eval_steps_per_second": 0.348, "step": 8550 }, { "epoch": 3.9870190078813166, "grad_norm": 0.05732366070151329, "learning_rate": 4.853673085668947e-06, "loss": 0.1311, "step": 8600 }, { "epoch": 3.9870190078813166, "eval_loss": 0.16182338333614685, "eval_runtime": 59.83, "eval_samples_per_second": 695.07, "eval_steps_per_second": 0.351, "step": 8600 }, { "epoch": 4.010199350950394, "grad_norm": 0.04801890626549721, "learning_rate": 4.5184002322740785e-06, "loss": 0.13, "step": 8650 }, { "epoch": 4.010199350950394, "eval_loss": 0.16167779009605182, "eval_runtime": 60.1344, "eval_samples_per_second": 691.551, "eval_steps_per_second": 0.349, "step": 8650 }, { "epoch": 4.033379694019471, "grad_norm": 0.04426449164748192, "learning_rate": 4.19457712839652e-06, "loss": 0.1299, "step": 8700 }, { "epoch": 4.033379694019471, "eval_loss": 0.16225696461065126, "eval_runtime": 60.1286, "eval_samples_per_second": 691.618, "eval_steps_per_second": 0.349, "step": 8700 }, { "epoch": 4.056560037088549, "grad_norm": 0.04997009411454201, "learning_rate": 3.8822852947709375e-06, "loss": 0.1302, "step": 8750 }, { "epoch": 4.056560037088549, "eval_loss": 0.1626912588154907, "eval_runtime": 60.4545, "eval_samples_per_second": 687.889, "eval_steps_per_second": 0.347, "step": 8750 }, { "epoch": 4.079740380157626, "grad_norm": 0.05177464708685875, "learning_rate": 3.581603349196372e-06, "loss": 0.1302, "step": 8800 }, { "epoch": 4.079740380157626, "eval_loss": 0.16124445235835394, "eval_runtime": 60.5756, "eval_samples_per_second": 686.514, "eval_steps_per_second": 0.347, "step": 8800 }, { "epoch": 4.1029207232267035, "grad_norm": 0.050131019204854965, "learning_rate": 3.2926069867446675e-06, "loss": 0.1308, "step": 8850 }, { "epoch": 4.1029207232267035, "eval_loss": 0.16266127792106785, "eval_runtime": 60.3334, "eval_samples_per_second": 689.27, "eval_steps_per_second": 0.348, "step": 8850 }, { "epoch": 4.1261010662957815, "grad_norm": 0.05185890197753906, "learning_rate": 3.0153689607045845e-06, "loss": 0.1298, "step": 8900 }, { "epoch": 4.1261010662957815, "eval_loss": 0.16332698150424974, "eval_runtime": 59.9427, "eval_samples_per_second": 693.763, "eval_steps_per_second": 0.35, "step": 8900 }, { "epoch": 4.149281409364859, "grad_norm": 0.040892358869314194, "learning_rate": 2.7499590642665774e-06, "loss": 0.1297, "step": 8950 }, { "epoch": 4.149281409364859, "eval_loss": 0.16260406271159317, "eval_runtime": 60.3006, "eval_samples_per_second": 689.645, "eval_steps_per_second": 0.348, "step": 8950 }, { "epoch": 4.172461752433936, "grad_norm": 0.05322985723614693, "learning_rate": 2.496444112952734e-06, "loss": 0.1298, "step": 9000 }, { "epoch": 4.172461752433936, "eval_loss": 0.16165919748334914, "eval_runtime": 59.7702, "eval_samples_per_second": 695.765, "eval_steps_per_second": 0.351, "step": 9000 }, { "epoch": 4.195642095503014, "grad_norm": 0.04688135161995888, "learning_rate": 2.2548879277963064e-06, "loss": 0.1304, "step": 9050 }, { "epoch": 4.195642095503014, "eval_loss": 0.16230118168852276, "eval_runtime": 59.7759, "eval_samples_per_second": 695.699, "eval_steps_per_second": 0.351, "step": 9050 }, { "epoch": 4.218822438572091, "grad_norm": 0.056906215846538544, "learning_rate": 2.0253513192751373e-06, "loss": 0.1302, "step": 9100 }, { "epoch": 4.218822438572091, "eval_loss": 0.16160732294835795, "eval_runtime": 59.9195, "eval_samples_per_second": 694.031, "eval_steps_per_second": 0.35, "step": 9100 }, { "epoch": 4.242002781641168, "grad_norm": 0.05124938115477562, "learning_rate": 1.807892072002898e-06, "loss": 0.1298, "step": 9150 }, { "epoch": 4.242002781641168, "eval_loss": 0.16257827555791163, "eval_runtime": 59.809, "eval_samples_per_second": 695.314, "eval_steps_per_second": 0.351, "step": 9150 }, { "epoch": 4.265183124710246, "grad_norm": 0.05366729572415352, "learning_rate": 1.6025649301821876e-06, "loss": 0.1294, "step": 9200 }, { "epoch": 4.265183124710246, "eval_loss": 0.1625148541687181, "eval_runtime": 60.1293, "eval_samples_per_second": 691.61, "eval_steps_per_second": 0.349, "step": 9200 }, { "epoch": 4.288363467779323, "grad_norm": 0.04244421049952507, "learning_rate": 1.4094215838229176e-06, "loss": 0.1308, "step": 9250 }, { "epoch": 4.288363467779323, "eval_loss": 0.16209612657051437, "eval_runtime": 59.9231, "eval_samples_per_second": 693.989, "eval_steps_per_second": 0.35, "step": 9250 }, { "epoch": 4.3115438108484, "grad_norm": 0.048628535121679306, "learning_rate": 1.2285106557296477e-06, "loss": 0.1302, "step": 9300 }, { "epoch": 4.3115438108484, "eval_loss": 0.16243464161006987, "eval_runtime": 59.5999, "eval_samples_per_second": 697.753, "eval_steps_per_second": 0.352, "step": 9300 }, { "epoch": 4.334724153917478, "grad_norm": 0.0497569814324379, "learning_rate": 1.0598776892610685e-06, "loss": 0.1311, "step": 9350 }, { "epoch": 4.334724153917478, "eval_loss": 0.16128818092832087, "eval_runtime": 59.8848, "eval_samples_per_second": 694.433, "eval_steps_per_second": 0.351, "step": 9350 }, { "epoch": 4.3579044969865555, "grad_norm": 0.07471216470003128, "learning_rate": 9.035651368646648e-07, "loss": 0.1304, "step": 9400 }, { "epoch": 4.3579044969865555, "eval_loss": 0.16288733155633187, "eval_runtime": 59.8227, "eval_samples_per_second": 695.154, "eval_steps_per_second": 0.351, "step": 9400 }, { "epoch": 4.381084840055633, "grad_norm": 0.058552809059619904, "learning_rate": 7.596123493895991e-07, "loss": 0.13, "step": 9450 }, { "epoch": 4.381084840055633, "eval_loss": 0.1634707775926499, "eval_runtime": 59.9789, "eval_samples_per_second": 693.344, "eval_steps_per_second": 0.35, "step": 9450 }, { "epoch": 4.404265183124711, "grad_norm": 0.05357597768306732, "learning_rate": 6.280555661802856e-07, "loss": 0.1295, "step": 9500 }, { "epoch": 4.404265183124711, "eval_loss": 0.1615680252075211, "eval_runtime": 60.1682, "eval_samples_per_second": 691.163, "eval_steps_per_second": 0.349, "step": 9500 }, { "epoch": 4.427445526193788, "grad_norm": 0.05787508189678192, "learning_rate": 5.089279059533658e-07, "loss": 0.1305, "step": 9550 }, { "epoch": 4.427445526193788, "eval_loss": 0.16174036094333355, "eval_runtime": 60.1289, "eval_samples_per_second": 691.615, "eval_steps_per_second": 0.349, "step": 9550 }, { "epoch": 4.450625869262865, "grad_norm": 0.049546804279088974, "learning_rate": 4.02259358460233e-07, "loss": 0.13, "step": 9600 }, { "epoch": 4.450625869262865, "eval_loss": 0.16296962879417173, "eval_runtime": 60.1209, "eval_samples_per_second": 691.706, "eval_steps_per_second": 0.349, "step": 9600 }, { "epoch": 4.473806212331943, "grad_norm": 0.05137551948428154, "learning_rate": 3.080767769372939e-07, "loss": 0.1297, "step": 9650 }, { "epoch": 4.473806212331943, "eval_loss": 0.16134209315513928, "eval_runtime": 60.0886, "eval_samples_per_second": 692.078, "eval_steps_per_second": 0.349, "step": 9650 }, { "epoch": 4.49698655540102, "grad_norm": 0.05584505572915077, "learning_rate": 2.2640387134577058e-07, "loss": 0.13, "step": 9700 }, { "epoch": 4.49698655540102, "eval_loss": 0.1621784231504334, "eval_runtime": 59.7716, "eval_samples_per_second": 695.749, "eval_steps_per_second": 0.351, "step": 9700 }, { "epoch": 4.520166898470097, "grad_norm": 0.0450916662812233, "learning_rate": 1.5726120240288634e-07, "loss": 0.1302, "step": 9750 }, { "epoch": 4.520166898470097, "eval_loss": 0.16172961751477263, "eval_runtime": 59.9065, "eval_samples_per_second": 694.182, "eval_steps_per_second": 0.351, "step": 9750 }, { "epoch": 4.543347241539175, "grad_norm": 0.0475350059568882, "learning_rate": 1.0066617640578368e-07, "loss": 0.1305, "step": 9800 }, { "epoch": 4.543347241539175, "eval_loss": 0.16216248300305847, "eval_runtime": 60.3498, "eval_samples_per_second": 689.083, "eval_steps_per_second": 0.348, "step": 9800 }, { "epoch": 4.566527584608252, "grad_norm": 0.057694341987371445, "learning_rate": 5.663304084960186e-08, "loss": 0.1299, "step": 9850 }, { "epoch": 4.566527584608252, "eval_loss": 0.16307967354038033, "eval_runtime": 59.9352, "eval_samples_per_second": 693.849, "eval_steps_per_second": 0.35, "step": 9850 }, { "epoch": 4.589707927677329, "grad_norm": 0.06310451030731201, "learning_rate": 2.5172880840745873e-08, "loss": 0.1299, "step": 9900 }, { "epoch": 4.589707927677329, "eval_loss": 0.16178384342894997, "eval_runtime": 60.0389, "eval_samples_per_second": 692.651, "eval_steps_per_second": 0.35, "step": 9900 }, { "epoch": 4.612888270746407, "grad_norm": 0.041533030569553375, "learning_rate": 6.293616306246586e-09, "loss": 0.1307, "step": 9950 }, { "epoch": 4.612888270746407, "eval_loss": 0.1629453700829326, "eval_runtime": 59.9874, "eval_samples_per_second": 693.246, "eval_steps_per_second": 0.35, "step": 9950 }, { "epoch": 4.6360686138154845, "grad_norm": 0.051685914397239685, "learning_rate": 0.0, "loss": 0.1293, "step": 10000 }, { "epoch": 4.6360686138154845, "eval_loss": 0.161578423628233, "eval_runtime": 60.1396, "eval_samples_per_second": 691.491, "eval_steps_per_second": 0.349, "step": 10000 }, { "epoch": 4.6360686138154845, "step": 10000, "total_flos": 1.2082504232914125e+17, "train_loss": 0.134784215593338, "train_runtime": 38606.1249, "train_samples_per_second": 530.486, "train_steps_per_second": 0.259 } ], "logging_steps": 50, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "total_flos": 1.2082504232914125e+17, "train_batch_size": 2048, "trial_name": null, "trial_params": null }