| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 40.0, | |
| "eval_steps": 500, | |
| "global_step": 160320, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.124750499001996, | |
| "grad_norm": 13.176804542541504, | |
| "learning_rate": 1.9937624750499e-06, | |
| "loss": 0.2137, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.249500998003992, | |
| "grad_norm": 52.68854904174805, | |
| "learning_rate": 1.9875249500998005e-06, | |
| "loss": 0.2463, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.37425149700598803, | |
| "grad_norm": 9.197150230407715, | |
| "learning_rate": 1.9812874251497004e-06, | |
| "loss": 0.2316, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.499001996007984, | |
| "grad_norm": 23.94010353088379, | |
| "learning_rate": 1.9750499001996007e-06, | |
| "loss": 0.2095, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6237524950099801, | |
| "grad_norm": 25.69223976135254, | |
| "learning_rate": 1.968812375249501e-06, | |
| "loss": 0.2102, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7485029940119761, | |
| "grad_norm": 14.870789527893066, | |
| "learning_rate": 1.9625748502994013e-06, | |
| "loss": 0.2335, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.873253493013972, | |
| "grad_norm": 19.752464294433594, | |
| "learning_rate": 1.9563373253493016e-06, | |
| "loss": 0.2065, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.998003992015968, | |
| "grad_norm": 7.356762409210205, | |
| "learning_rate": 1.9500998003992014e-06, | |
| "loss": 0.215, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.4418031871318817, | |
| "eval_runtime": 50.9423, | |
| "eval_samples_per_second": 62.934, | |
| "eval_steps_per_second": 15.743, | |
| "step": 4008 | |
| }, | |
| { | |
| "epoch": 1.122754491017964, | |
| "grad_norm": 1.049210786819458, | |
| "learning_rate": 1.9438622754491017e-06, | |
| "loss": 0.1786, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.24750499001996, | |
| "grad_norm": 33.95945358276367, | |
| "learning_rate": 1.937624750499002e-06, | |
| "loss": 0.2107, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.372255489021956, | |
| "grad_norm": 3.9420273303985596, | |
| "learning_rate": 1.931387225548902e-06, | |
| "loss": 0.1877, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.4970059880239521, | |
| "grad_norm": 15.459404945373535, | |
| "learning_rate": 1.925149700598802e-06, | |
| "loss": 0.1808, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.621756487025948, | |
| "grad_norm": 0.35231631994247437, | |
| "learning_rate": 1.9189121756487025e-06, | |
| "loss": 0.1842, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.746506986027944, | |
| "grad_norm": 22.17848014831543, | |
| "learning_rate": 1.9126746506986028e-06, | |
| "loss": 0.2165, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.8712574850299402, | |
| "grad_norm": 31.21565055847168, | |
| "learning_rate": 1.906437125748503e-06, | |
| "loss": 0.1879, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.996007984031936, | |
| "grad_norm": 1.7563763856887817, | |
| "learning_rate": 1.9001996007984032e-06, | |
| "loss": 0.1913, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.4956786632537842, | |
| "eval_runtime": 48.4339, | |
| "eval_samples_per_second": 66.193, | |
| "eval_steps_per_second": 16.559, | |
| "step": 8016 | |
| }, | |
| { | |
| "epoch": 2.1207584830339323, | |
| "grad_norm": 0.0910625234246254, | |
| "learning_rate": 1.8939620758483032e-06, | |
| "loss": 0.1712, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.245508982035928, | |
| "grad_norm": 30.4615421295166, | |
| "learning_rate": 1.8877245508982035e-06, | |
| "loss": 0.1579, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.370259481037924, | |
| "grad_norm": 29.169662475585938, | |
| "learning_rate": 1.8814870259481036e-06, | |
| "loss": 0.1701, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.49500998003992, | |
| "grad_norm": 0.9950535893440247, | |
| "learning_rate": 1.875249500998004e-06, | |
| "loss": 0.1717, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.6197604790419162, | |
| "grad_norm": 0.30978772044181824, | |
| "learning_rate": 1.8690119760479042e-06, | |
| "loss": 0.1778, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.744510978043912, | |
| "grad_norm": 1.4617693424224854, | |
| "learning_rate": 1.8627744510978043e-06, | |
| "loss": 0.1772, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.8692614770459084, | |
| "grad_norm": 13.257425308227539, | |
| "learning_rate": 1.8565369261477044e-06, | |
| "loss": 0.1697, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.9940119760479043, | |
| "grad_norm": 11.522214889526367, | |
| "learning_rate": 1.8502994011976047e-06, | |
| "loss": 0.1651, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.4591982960700989, | |
| "eval_runtime": 48.8032, | |
| "eval_samples_per_second": 65.692, | |
| "eval_steps_per_second": 16.433, | |
| "step": 12024 | |
| }, | |
| { | |
| "epoch": 3.1187624750499, | |
| "grad_norm": 20.58974266052246, | |
| "learning_rate": 1.844061876247505e-06, | |
| "loss": 0.1607, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.243512974051896, | |
| "grad_norm": 54.52241516113281, | |
| "learning_rate": 1.8378243512974053e-06, | |
| "loss": 0.1527, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.3682634730538923, | |
| "grad_norm": 12.846843719482422, | |
| "learning_rate": 1.8315868263473054e-06, | |
| "loss": 0.1484, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.493013972055888, | |
| "grad_norm": 0.6479830145835876, | |
| "learning_rate": 1.8253493013972054e-06, | |
| "loss": 0.1621, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.6177644710578845, | |
| "grad_norm": 0.7256312370300293, | |
| "learning_rate": 1.8191117764471057e-06, | |
| "loss": 0.1428, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 3.7425149700598803, | |
| "grad_norm": 12.274479866027832, | |
| "learning_rate": 1.8128742514970058e-06, | |
| "loss": 0.1433, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 3.867265469061876, | |
| "grad_norm": 35.40715408325195, | |
| "learning_rate": 1.8066367265469061e-06, | |
| "loss": 0.161, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 3.992015968063872, | |
| "grad_norm": 0.78450608253479, | |
| "learning_rate": 1.8003992015968064e-06, | |
| "loss": 0.171, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.4495457410812378, | |
| "eval_runtime": 49.2624, | |
| "eval_samples_per_second": 65.08, | |
| "eval_steps_per_second": 16.28, | |
| "step": 16032 | |
| }, | |
| { | |
| "epoch": 4.116766467065868, | |
| "grad_norm": 0.06905636936426163, | |
| "learning_rate": 1.7941616766467065e-06, | |
| "loss": 0.1475, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.241516966067865, | |
| "grad_norm": 34.77931213378906, | |
| "learning_rate": 1.7879241516966066e-06, | |
| "loss": 0.1365, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.3662674650698605, | |
| "grad_norm": 0.5809102058410645, | |
| "learning_rate": 1.7816866267465069e-06, | |
| "loss": 0.1366, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 4.491017964071856, | |
| "grad_norm": 66.70156860351562, | |
| "learning_rate": 1.775449101796407e-06, | |
| "loss": 0.1526, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 4.615768463073852, | |
| "grad_norm": 29.423938751220703, | |
| "learning_rate": 1.7692115768463075e-06, | |
| "loss": 0.135, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 4.740518962075848, | |
| "grad_norm": 0.48827868700027466, | |
| "learning_rate": 1.7629740518962075e-06, | |
| "loss": 0.1444, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 4.865269461077844, | |
| "grad_norm": 8.966581344604492, | |
| "learning_rate": 1.7567365269461076e-06, | |
| "loss": 0.1295, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 4.99001996007984, | |
| "grad_norm": 3.6332414150238037, | |
| "learning_rate": 1.750499001996008e-06, | |
| "loss": 0.1407, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.5054113268852234, | |
| "eval_runtime": 46.022, | |
| "eval_samples_per_second": 69.662, | |
| "eval_steps_per_second": 17.426, | |
| "step": 20040 | |
| }, | |
| { | |
| "epoch": 5.114770459081837, | |
| "grad_norm": 26.99722671508789, | |
| "learning_rate": 1.744261477045908e-06, | |
| "loss": 0.1307, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.2395209580838324, | |
| "grad_norm": 0.7371481657028198, | |
| "learning_rate": 1.7380239520958083e-06, | |
| "loss": 0.1153, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 5.364271457085828, | |
| "grad_norm": 0.3232800364494324, | |
| "learning_rate": 1.7317864271457086e-06, | |
| "loss": 0.1154, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 5.489021956087824, | |
| "grad_norm": 1.8309438228607178, | |
| "learning_rate": 1.7255489021956087e-06, | |
| "loss": 0.1331, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 5.61377245508982, | |
| "grad_norm": 0.4226222038269043, | |
| "learning_rate": 1.719311377245509e-06, | |
| "loss": 0.1206, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 5.738522954091817, | |
| "grad_norm": 1.4337540864944458, | |
| "learning_rate": 1.713073852295409e-06, | |
| "loss": 0.13, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 5.863273453093813, | |
| "grad_norm": 47.5312614440918, | |
| "learning_rate": 1.7068363273453091e-06, | |
| "loss": 0.1285, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 5.9880239520958085, | |
| "grad_norm": 1.092816710472107, | |
| "learning_rate": 1.7005988023952097e-06, | |
| "loss": 0.1412, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.4939550459384918, | |
| "eval_runtime": 45.0298, | |
| "eval_samples_per_second": 71.197, | |
| "eval_steps_per_second": 17.81, | |
| "step": 24048 | |
| }, | |
| { | |
| "epoch": 6.112774451097804, | |
| "grad_norm": 0.03936842083930969, | |
| "learning_rate": 1.6943612774451097e-06, | |
| "loss": 0.1134, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 6.2375249500998, | |
| "grad_norm": 3.047616481781006, | |
| "learning_rate": 1.6881237524950098e-06, | |
| "loss": 0.1066, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 6.362275449101796, | |
| "grad_norm": 16.7564754486084, | |
| "learning_rate": 1.6818862275449101e-06, | |
| "loss": 0.1615, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 6.487025948103792, | |
| "grad_norm": 21.36778450012207, | |
| "learning_rate": 1.6756487025948102e-06, | |
| "loss": 0.1645, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 6.611776447105789, | |
| "grad_norm": 78.45208740234375, | |
| "learning_rate": 1.6694111776447105e-06, | |
| "loss": 0.1675, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 6.736526946107785, | |
| "grad_norm": 7.212148666381836, | |
| "learning_rate": 1.6631736526946108e-06, | |
| "loss": 0.146, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 6.86127744510978, | |
| "grad_norm": 9.503207206726074, | |
| "learning_rate": 1.6569361277445109e-06, | |
| "loss": 0.1606, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 6.986027944111776, | |
| "grad_norm": 0.4464740753173828, | |
| "learning_rate": 1.6506986027944112e-06, | |
| "loss": 0.1429, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.4717544615268707, | |
| "eval_runtime": 47.386, | |
| "eval_samples_per_second": 67.657, | |
| "eval_steps_per_second": 16.925, | |
| "step": 28056 | |
| }, | |
| { | |
| "epoch": 7.110778443113772, | |
| "grad_norm": 0.42686018347740173, | |
| "learning_rate": 1.6444610778443113e-06, | |
| "loss": 0.1207, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 7.235528942115769, | |
| "grad_norm": 24.92848014831543, | |
| "learning_rate": 1.6382235528942113e-06, | |
| "loss": 0.1351, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 7.360279441117765, | |
| "grad_norm": 7.397327423095703, | |
| "learning_rate": 1.6319860279441118e-06, | |
| "loss": 0.1543, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 7.485029940119761, | |
| "grad_norm": 0.43539106845855713, | |
| "learning_rate": 1.625748502994012e-06, | |
| "loss": 0.1494, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 7.6097804391217565, | |
| "grad_norm": 14.456055641174316, | |
| "learning_rate": 1.619510978043912e-06, | |
| "loss": 0.1419, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 7.734530938123752, | |
| "grad_norm": 9.563997268676758, | |
| "learning_rate": 1.6132734530938123e-06, | |
| "loss": 0.1357, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 7.859281437125748, | |
| "grad_norm": 1.7568217515945435, | |
| "learning_rate": 1.6070359281437124e-06, | |
| "loss": 0.1369, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 7.984031936127744, | |
| "grad_norm": 2.780186653137207, | |
| "learning_rate": 1.600798403193613e-06, | |
| "loss": 0.1451, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.45947006344795227, | |
| "eval_runtime": 44.2941, | |
| "eval_samples_per_second": 72.38, | |
| "eval_steps_per_second": 18.106, | |
| "step": 32064 | |
| }, | |
| { | |
| "epoch": 8.10878243512974, | |
| "grad_norm": 10.451217651367188, | |
| "learning_rate": 1.594560878243513e-06, | |
| "loss": 0.1136, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 8.233532934131736, | |
| "grad_norm": 0.18200552463531494, | |
| "learning_rate": 1.588323353293413e-06, | |
| "loss": 0.1259, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 8.358283433133732, | |
| "grad_norm": 1.9428528547286987, | |
| "learning_rate": 1.5820858283433134e-06, | |
| "loss": 0.1279, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 8.48303393213573, | |
| "grad_norm": 1.7016535997390747, | |
| "learning_rate": 1.5758483033932135e-06, | |
| "loss": 0.1231, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 8.607784431137725, | |
| "grad_norm": 0.7158037424087524, | |
| "learning_rate": 1.5696107784431135e-06, | |
| "loss": 0.1446, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 8.732534930139721, | |
| "grad_norm": 0.4712078273296356, | |
| "learning_rate": 1.563373253493014e-06, | |
| "loss": 0.1344, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 8.857285429141717, | |
| "grad_norm": 24.5105037689209, | |
| "learning_rate": 1.5571357285429141e-06, | |
| "loss": 0.1331, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 8.982035928143713, | |
| "grad_norm": 27.750621795654297, | |
| "learning_rate": 1.5508982035928142e-06, | |
| "loss": 0.1296, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.47351646423339844, | |
| "eval_runtime": 41.3902, | |
| "eval_samples_per_second": 77.458, | |
| "eval_steps_per_second": 19.377, | |
| "step": 36072 | |
| }, | |
| { | |
| "epoch": 9.106786427145709, | |
| "grad_norm": 28.095134735107422, | |
| "learning_rate": 1.5446606786427145e-06, | |
| "loss": 0.119, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 9.231536926147704, | |
| "grad_norm": 0.07204411178827286, | |
| "learning_rate": 1.5384231536926146e-06, | |
| "loss": 0.1098, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 9.3562874251497, | |
| "grad_norm": 0.2767297327518463, | |
| "learning_rate": 1.532185628742515e-06, | |
| "loss": 0.1224, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 9.481037924151696, | |
| "grad_norm": 0.14060889184474945, | |
| "learning_rate": 1.5259481037924152e-06, | |
| "loss": 0.1247, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 9.605788423153692, | |
| "grad_norm": 32.673011779785156, | |
| "learning_rate": 1.5197105788423153e-06, | |
| "loss": 0.122, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 9.730538922155688, | |
| "grad_norm": 0.21247480809688568, | |
| "learning_rate": 1.5134730538922156e-06, | |
| "loss": 0.1233, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 9.855289421157684, | |
| "grad_norm": 0.4861377775669098, | |
| "learning_rate": 1.5072355289421156e-06, | |
| "loss": 0.1286, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 9.980039920159681, | |
| "grad_norm": 11.489697456359863, | |
| "learning_rate": 1.5009980039920157e-06, | |
| "loss": 0.1203, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.44174808263778687, | |
| "eval_runtime": 40.4714, | |
| "eval_samples_per_second": 79.216, | |
| "eval_steps_per_second": 19.816, | |
| "step": 40080 | |
| }, | |
| { | |
| "epoch": 10.104790419161677, | |
| "grad_norm": 0.06284382939338684, | |
| "learning_rate": 1.4947604790419162e-06, | |
| "loss": 0.1176, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 10.229540918163673, | |
| "grad_norm": 0.8282334804534912, | |
| "learning_rate": 1.4885229540918163e-06, | |
| "loss": 0.1133, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 10.354291417165669, | |
| "grad_norm": 0.675163984298706, | |
| "learning_rate": 1.4822854291417164e-06, | |
| "loss": 0.0977, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 10.479041916167665, | |
| "grad_norm": 6.970102310180664, | |
| "learning_rate": 1.4760479041916167e-06, | |
| "loss": 0.1113, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 10.60379241516966, | |
| "grad_norm": 8.85517406463623, | |
| "learning_rate": 1.4698103792415168e-06, | |
| "loss": 0.1164, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 10.728542914171657, | |
| "grad_norm": 0.9282238483428955, | |
| "learning_rate": 1.4635728542914173e-06, | |
| "loss": 0.1167, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 10.853293413173652, | |
| "grad_norm": 9.984148979187012, | |
| "learning_rate": 1.4573353293413174e-06, | |
| "loss": 0.1261, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 10.978043912175648, | |
| "grad_norm": 0.20773719251155853, | |
| "learning_rate": 1.4510978043912175e-06, | |
| "loss": 0.1132, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.49900639057159424, | |
| "eval_runtime": 43.241, | |
| "eval_samples_per_second": 74.143, | |
| "eval_steps_per_second": 18.547, | |
| "step": 44088 | |
| }, | |
| { | |
| "epoch": 11.102794411177644, | |
| "grad_norm": 12.603593826293945, | |
| "learning_rate": 1.4448602794411178e-06, | |
| "loss": 0.1061, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 11.22754491017964, | |
| "grad_norm": 51.32432174682617, | |
| "learning_rate": 1.4386227544910178e-06, | |
| "loss": 0.1079, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 11.352295409181636, | |
| "grad_norm": 10.22624397277832, | |
| "learning_rate": 1.432385229540918e-06, | |
| "loss": 0.1166, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 11.477045908183634, | |
| "grad_norm": 11.041003227233887, | |
| "learning_rate": 1.4261477045908184e-06, | |
| "loss": 0.105, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 11.60179640718563, | |
| "grad_norm": 35.79409408569336, | |
| "learning_rate": 1.4199101796407185e-06, | |
| "loss": 0.1124, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 11.726546906187625, | |
| "grad_norm": 0.18676696717739105, | |
| "learning_rate": 1.4136726546906188e-06, | |
| "loss": 0.0928, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 11.851297405189621, | |
| "grad_norm": 1.4925884008407593, | |
| "learning_rate": 1.4074351297405189e-06, | |
| "loss": 0.1098, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 11.976047904191617, | |
| "grad_norm": 0.32953181862831116, | |
| "learning_rate": 1.401197604790419e-06, | |
| "loss": 0.1117, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.4872562289237976, | |
| "eval_runtime": 41.6872, | |
| "eval_samples_per_second": 76.906, | |
| "eval_steps_per_second": 19.239, | |
| "step": 48096 | |
| }, | |
| { | |
| "epoch": 12.100798403193613, | |
| "grad_norm": 0.027937307953834534, | |
| "learning_rate": 1.3949600798403195e-06, | |
| "loss": 0.0992, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 12.225548902195609, | |
| "grad_norm": 0.29068148136138916, | |
| "learning_rate": 1.3887225548902196e-06, | |
| "loss": 0.0921, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 12.350299401197605, | |
| "grad_norm": 0.127395898103714, | |
| "learning_rate": 1.3824850299401197e-06, | |
| "loss": 0.0933, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 12.4750499001996, | |
| "grad_norm": 0.09435238689184189, | |
| "learning_rate": 1.37624750499002e-06, | |
| "loss": 0.116, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 12.599800399201596, | |
| "grad_norm": 39.19729232788086, | |
| "learning_rate": 1.37000998003992e-06, | |
| "loss": 0.1052, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 12.724550898203592, | |
| "grad_norm": 0.28930047154426575, | |
| "learning_rate": 1.3637724550898201e-06, | |
| "loss": 0.1038, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 12.849301397205588, | |
| "grad_norm": 0.15510033071041107, | |
| "learning_rate": 1.3575349301397206e-06, | |
| "loss": 0.0983, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 12.974051896207584, | |
| "grad_norm": 81.58076477050781, | |
| "learning_rate": 1.3512974051896207e-06, | |
| "loss": 0.1117, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.45387548208236694, | |
| "eval_runtime": 43.4012, | |
| "eval_samples_per_second": 73.869, | |
| "eval_steps_per_second": 18.479, | |
| "step": 52104 | |
| }, | |
| { | |
| "epoch": 13.098802395209582, | |
| "grad_norm": 4.060844421386719, | |
| "learning_rate": 1.345059880239521e-06, | |
| "loss": 0.0983, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 13.223552894211577, | |
| "grad_norm": 33.315853118896484, | |
| "learning_rate": 1.338822355289421e-06, | |
| "loss": 0.0941, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 13.348303393213573, | |
| "grad_norm": 0.1183587834239006, | |
| "learning_rate": 1.3325848303393212e-06, | |
| "loss": 0.0973, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 13.47305389221557, | |
| "grad_norm": 40.30908966064453, | |
| "learning_rate": 1.3263473053892215e-06, | |
| "loss": 0.0871, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 13.597804391217565, | |
| "grad_norm": 0.619777262210846, | |
| "learning_rate": 1.3201097804391218e-06, | |
| "loss": 0.1001, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 13.72255489021956, | |
| "grad_norm": 0.2705942392349243, | |
| "learning_rate": 1.3138722554890218e-06, | |
| "loss": 0.0983, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 13.847305389221557, | |
| "grad_norm": 6.151524066925049, | |
| "learning_rate": 1.3076347305389221e-06, | |
| "loss": 0.0793, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 13.972055888223553, | |
| "grad_norm": 2.340573787689209, | |
| "learning_rate": 1.3013972055888222e-06, | |
| "loss": 0.099, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.47363531589508057, | |
| "eval_runtime": 42.3279, | |
| "eval_samples_per_second": 75.742, | |
| "eval_steps_per_second": 18.947, | |
| "step": 56112 | |
| }, | |
| { | |
| "epoch": 14.096806387225548, | |
| "grad_norm": 2.052589178085327, | |
| "learning_rate": 1.2951596806387225e-06, | |
| "loss": 0.0875, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 14.221556886227544, | |
| "grad_norm": 1.2925941944122314, | |
| "learning_rate": 1.2889221556886228e-06, | |
| "loss": 0.0812, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 14.34630738522954, | |
| "grad_norm": 0.062304213643074036, | |
| "learning_rate": 1.282684630738523e-06, | |
| "loss": 0.1017, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 14.471057884231538, | |
| "grad_norm": 0.1741693764925003, | |
| "learning_rate": 1.2764471057884232e-06, | |
| "loss": 0.0836, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 14.595808383233534, | |
| "grad_norm": 0.6444254517555237, | |
| "learning_rate": 1.2702095808383233e-06, | |
| "loss": 0.0804, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 14.72055888223553, | |
| "grad_norm": 2.0034759044647217, | |
| "learning_rate": 1.2639720558882234e-06, | |
| "loss": 0.0953, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 14.845309381237525, | |
| "grad_norm": 52.82548522949219, | |
| "learning_rate": 1.2577345309381237e-06, | |
| "loss": 0.0996, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 14.970059880239521, | |
| "grad_norm": 6.955111503601074, | |
| "learning_rate": 1.251497005988024e-06, | |
| "loss": 0.0857, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.45942702889442444, | |
| "eval_runtime": 43.409, | |
| "eval_samples_per_second": 73.856, | |
| "eval_steps_per_second": 18.475, | |
| "step": 60120 | |
| }, | |
| { | |
| "epoch": 15.094810379241517, | |
| "grad_norm": 3.2324092388153076, | |
| "learning_rate": 1.245259481037924e-06, | |
| "loss": 0.0849, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 15.219560878243513, | |
| "grad_norm": 61.83153533935547, | |
| "learning_rate": 1.2390219560878243e-06, | |
| "loss": 0.0798, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 15.344311377245509, | |
| "grad_norm": 0.015876924619078636, | |
| "learning_rate": 1.2327844311377244e-06, | |
| "loss": 0.0785, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 15.469061876247505, | |
| "grad_norm": 3.0025134086608887, | |
| "learning_rate": 1.2265469061876247e-06, | |
| "loss": 0.0881, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 15.5938123752495, | |
| "grad_norm": 12.912367820739746, | |
| "learning_rate": 1.220309381237525e-06, | |
| "loss": 0.0802, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 15.718562874251496, | |
| "grad_norm": 0.3600245714187622, | |
| "learning_rate": 1.214071856287425e-06, | |
| "loss": 0.0849, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 15.843313373253492, | |
| "grad_norm": 0.21024100482463837, | |
| "learning_rate": 1.2078343313373254e-06, | |
| "loss": 0.078, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 15.968063872255488, | |
| "grad_norm": 9.392132759094238, | |
| "learning_rate": 1.2015968063872255e-06, | |
| "loss": 0.0865, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.48642057180404663, | |
| "eval_runtime": 46.7976, | |
| "eval_samples_per_second": 68.508, | |
| "eval_steps_per_second": 17.138, | |
| "step": 64128 | |
| }, | |
| { | |
| "epoch": 16.092814371257486, | |
| "grad_norm": 0.5227041244506836, | |
| "learning_rate": 1.1953592814371256e-06, | |
| "loss": 0.0722, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 16.21756487025948, | |
| "grad_norm": 25.282564163208008, | |
| "learning_rate": 1.1891217564870259e-06, | |
| "loss": 0.0981, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 16.342315369261478, | |
| "grad_norm": 0.6670591235160828, | |
| "learning_rate": 1.1828842315369261e-06, | |
| "loss": 0.0787, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 16.46706586826347, | |
| "grad_norm": 22.668352127075195, | |
| "learning_rate": 1.1766467065868262e-06, | |
| "loss": 0.0764, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 16.59181636726547, | |
| "grad_norm": 0.22597374022006989, | |
| "learning_rate": 1.1704091816367265e-06, | |
| "loss": 0.078, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 16.716566866267463, | |
| "grad_norm": 21.123409271240234, | |
| "learning_rate": 1.1641716566866266e-06, | |
| "loss": 0.0766, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 16.84131736526946, | |
| "grad_norm": 0.04259370267391205, | |
| "learning_rate": 1.157934131736527e-06, | |
| "loss": 0.0765, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 16.96606786427146, | |
| "grad_norm": 0.021560240536928177, | |
| "learning_rate": 1.1516966067864272e-06, | |
| "loss": 0.0785, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.4793809652328491, | |
| "eval_runtime": 45.0906, | |
| "eval_samples_per_second": 71.101, | |
| "eval_steps_per_second": 17.786, | |
| "step": 68136 | |
| }, | |
| { | |
| "epoch": 17.090818363273453, | |
| "grad_norm": 9.094868659973145, | |
| "learning_rate": 1.1454590818363273e-06, | |
| "loss": 0.0647, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 17.21556886227545, | |
| "grad_norm": 0.195833221077919, | |
| "learning_rate": 1.1392215568862276e-06, | |
| "loss": 0.0698, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 17.340319361277444, | |
| "grad_norm": 0.18507197499275208, | |
| "learning_rate": 1.1329840319361277e-06, | |
| "loss": 0.0712, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 17.465069860279442, | |
| "grad_norm": 0.9911601543426514, | |
| "learning_rate": 1.1267465069860278e-06, | |
| "loss": 0.0752, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 17.589820359281436, | |
| "grad_norm": 1.9703953266143799, | |
| "learning_rate": 1.120508982035928e-06, | |
| "loss": 0.0675, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 17.714570858283434, | |
| "grad_norm": 41.10940933227539, | |
| "learning_rate": 1.1142714570858283e-06, | |
| "loss": 0.0705, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 17.839321357285428, | |
| "grad_norm": 15.87336254119873, | |
| "learning_rate": 1.1080339321357286e-06, | |
| "loss": 0.0763, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 17.964071856287426, | |
| "grad_norm": 0.060888275504112244, | |
| "learning_rate": 1.1017964071856287e-06, | |
| "loss": 0.0784, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.4715409278869629, | |
| "eval_runtime": 44.1035, | |
| "eval_samples_per_second": 72.693, | |
| "eval_steps_per_second": 18.184, | |
| "step": 72144 | |
| }, | |
| { | |
| "epoch": 18.08882235528942, | |
| "grad_norm": 2.47182035446167, | |
| "learning_rate": 1.0955588822355288e-06, | |
| "loss": 0.0747, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 18.213572854291417, | |
| "grad_norm": 40.5880126953125, | |
| "learning_rate": 1.089321357285429e-06, | |
| "loss": 0.0678, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 18.338323353293415, | |
| "grad_norm": 0.4340246915817261, | |
| "learning_rate": 1.0830838323353294e-06, | |
| "loss": 0.0713, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 18.46307385229541, | |
| "grad_norm": 4.4763312339782715, | |
| "learning_rate": 1.0768463073852295e-06, | |
| "loss": 0.065, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 18.587824351297407, | |
| "grad_norm": 0.1397508829832077, | |
| "learning_rate": 1.0706087824351298e-06, | |
| "loss": 0.0727, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 18.7125748502994, | |
| "grad_norm": 7.134496212005615, | |
| "learning_rate": 1.0643712574850299e-06, | |
| "loss": 0.0605, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 18.8373253493014, | |
| "grad_norm": 0.05227530747652054, | |
| "learning_rate": 1.05813373253493e-06, | |
| "loss": 0.0764, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 18.962075848303392, | |
| "grad_norm": 17.22441864013672, | |
| "learning_rate": 1.0518962075848302e-06, | |
| "loss": 0.0696, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.4802711308002472, | |
| "eval_runtime": 45.7109, | |
| "eval_samples_per_second": 70.136, | |
| "eval_steps_per_second": 17.545, | |
| "step": 76152 | |
| }, | |
| { | |
| "epoch": 19.08682634730539, | |
| "grad_norm": 0.02889215387403965, | |
| "learning_rate": 1.0456586826347305e-06, | |
| "loss": 0.0625, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 19.211576846307384, | |
| "grad_norm": 128.0497283935547, | |
| "learning_rate": 1.0394211576846308e-06, | |
| "loss": 0.0548, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 19.336327345309382, | |
| "grad_norm": 0.22108981013298035, | |
| "learning_rate": 1.033183632734531e-06, | |
| "loss": 0.0695, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 19.461077844311376, | |
| "grad_norm": 55.13557815551758, | |
| "learning_rate": 1.026946107784431e-06, | |
| "loss": 0.0679, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 19.585828343313374, | |
| "grad_norm": 3.5990562438964844, | |
| "learning_rate": 1.0207085828343313e-06, | |
| "loss": 0.0697, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 19.710578842315368, | |
| "grad_norm": 3.9640650749206543, | |
| "learning_rate": 1.0144710578842316e-06, | |
| "loss": 0.0699, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 19.835329341317365, | |
| "grad_norm": 0.3529013395309448, | |
| "learning_rate": 1.0082335329341317e-06, | |
| "loss": 0.0676, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 19.960079840319363, | |
| "grad_norm": 1.3875175714492798, | |
| "learning_rate": 1.001996007984032e-06, | |
| "loss": 0.0683, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.5128437280654907, | |
| "eval_runtime": 46.0282, | |
| "eval_samples_per_second": 69.653, | |
| "eval_steps_per_second": 17.424, | |
| "step": 80160 | |
| }, | |
| { | |
| "epoch": 20.084830339321357, | |
| "grad_norm": 6.171479225158691, | |
| "learning_rate": 9.95758483033932e-07, | |
| "loss": 0.0698, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 20.209580838323355, | |
| "grad_norm": 0.012239497154951096, | |
| "learning_rate": 9.895209580838323e-07, | |
| "loss": 0.0532, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 20.33433133732535, | |
| "grad_norm": 7.920960426330566, | |
| "learning_rate": 9.832834331337324e-07, | |
| "loss": 0.0609, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 20.459081836327346, | |
| "grad_norm": 59.41933822631836, | |
| "learning_rate": 9.770459081836327e-07, | |
| "loss": 0.0653, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 20.58383233532934, | |
| "grad_norm": 0.10031065344810486, | |
| "learning_rate": 9.708083832335328e-07, | |
| "loss": 0.0497, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 20.708582834331338, | |
| "grad_norm": 5.42900276184082, | |
| "learning_rate": 9.645708582834331e-07, | |
| "loss": 0.061, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 20.833333333333332, | |
| "grad_norm": 20.380285263061523, | |
| "learning_rate": 9.583333333333334e-07, | |
| "loss": 0.0717, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 20.95808383233533, | |
| "grad_norm": 0.10651753097772598, | |
| "learning_rate": 9.520958083832335e-07, | |
| "loss": 0.0638, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 0.4833807945251465, | |
| "eval_runtime": 46.5592, | |
| "eval_samples_per_second": 68.859, | |
| "eval_steps_per_second": 17.225, | |
| "step": 84168 | |
| }, | |
| { | |
| "epoch": 21.082834331337324, | |
| "grad_norm": 0.3842374086380005, | |
| "learning_rate": 9.458582834331337e-07, | |
| "loss": 0.0603, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 21.20758483033932, | |
| "grad_norm": 51.563140869140625, | |
| "learning_rate": 9.396207584830339e-07, | |
| "loss": 0.06, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 21.33233532934132, | |
| "grad_norm": 0.037806153297424316, | |
| "learning_rate": 9.333832335329342e-07, | |
| "loss": 0.0612, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 21.457085828343313, | |
| "grad_norm": 0.11586946994066238, | |
| "learning_rate": 9.271457085828342e-07, | |
| "loss": 0.0664, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 21.58183632734531, | |
| "grad_norm": 0.34262338280677795, | |
| "learning_rate": 9.209081836327344e-07, | |
| "loss": 0.0602, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 21.706586826347305, | |
| "grad_norm": 0.11894870549440384, | |
| "learning_rate": 9.146706586826347e-07, | |
| "loss": 0.0522, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 21.831337325349303, | |
| "grad_norm": 0.1180167868733406, | |
| "learning_rate": 9.084331337325349e-07, | |
| "loss": 0.0616, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 21.956087824351297, | |
| "grad_norm": 0.09437087923288345, | |
| "learning_rate": 9.02195608782435e-07, | |
| "loss": 0.0607, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 0.4958905279636383, | |
| "eval_runtime": 44.4581, | |
| "eval_samples_per_second": 72.113, | |
| "eval_steps_per_second": 18.039, | |
| "step": 88176 | |
| }, | |
| { | |
| "epoch": 22.080838323353294, | |
| "grad_norm": 0.5892271399497986, | |
| "learning_rate": 8.959580838323353e-07, | |
| "loss": 0.058, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 22.20558882235529, | |
| "grad_norm": 1.0569002628326416, | |
| "learning_rate": 8.897205588822355e-07, | |
| "loss": 0.0559, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 22.330339321357286, | |
| "grad_norm": 50.68812561035156, | |
| "learning_rate": 8.834830339321357e-07, | |
| "loss": 0.05, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 22.45508982035928, | |
| "grad_norm": 0.08090469241142273, | |
| "learning_rate": 8.772455089820359e-07, | |
| "loss": 0.0595, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 22.579840319361278, | |
| "grad_norm": 14.62991714477539, | |
| "learning_rate": 8.710079840319361e-07, | |
| "loss": 0.059, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 22.704590818363272, | |
| "grad_norm": 0.2893312871456146, | |
| "learning_rate": 8.647704590818364e-07, | |
| "loss": 0.0518, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 22.82934131736527, | |
| "grad_norm": 22.239938735961914, | |
| "learning_rate": 8.585329341317364e-07, | |
| "loss": 0.0493, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 22.954091816367267, | |
| "grad_norm": 0.09933929890394211, | |
| "learning_rate": 8.522954091816366e-07, | |
| "loss": 0.0536, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 0.48672357201576233, | |
| "eval_runtime": 44.6856, | |
| "eval_samples_per_second": 71.746, | |
| "eval_steps_per_second": 17.948, | |
| "step": 92184 | |
| }, | |
| { | |
| "epoch": 23.07884231536926, | |
| "grad_norm": 0.821902871131897, | |
| "learning_rate": 8.460578842315369e-07, | |
| "loss": 0.0553, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 23.20359281437126, | |
| "grad_norm": 0.2537296414375305, | |
| "learning_rate": 8.398203592814371e-07, | |
| "loss": 0.046, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 23.328343313373253, | |
| "grad_norm": 0.198989599943161, | |
| "learning_rate": 8.335828343313372e-07, | |
| "loss": 0.0496, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 23.45309381237525, | |
| "grad_norm": 14.523540496826172, | |
| "learning_rate": 8.273453093812375e-07, | |
| "loss": 0.0465, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 23.577844311377245, | |
| "grad_norm": 0.3473449945449829, | |
| "learning_rate": 8.211077844311377e-07, | |
| "loss": 0.048, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 23.702594810379242, | |
| "grad_norm": 4.4253129959106445, | |
| "learning_rate": 8.14870259481038e-07, | |
| "loss": 0.0489, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 23.827345309381236, | |
| "grad_norm": 159.51025390625, | |
| "learning_rate": 8.086327345309381e-07, | |
| "loss": 0.0552, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 23.952095808383234, | |
| "grad_norm": 0.31450316309928894, | |
| "learning_rate": 8.023952095808383e-07, | |
| "loss": 0.0537, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 0.5026536583900452, | |
| "eval_runtime": 46.0362, | |
| "eval_samples_per_second": 69.641, | |
| "eval_steps_per_second": 17.421, | |
| "step": 96192 | |
| }, | |
| { | |
| "epoch": 24.076846307385228, | |
| "grad_norm": 1.8670942783355713, | |
| "learning_rate": 7.961576846307386e-07, | |
| "loss": 0.0556, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 24.201596806387226, | |
| "grad_norm": 0.4119631052017212, | |
| "learning_rate": 7.899201596806386e-07, | |
| "loss": 0.0427, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 24.32634730538922, | |
| "grad_norm": 4.47167444229126, | |
| "learning_rate": 7.836826347305388e-07, | |
| "loss": 0.0579, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 24.451097804391217, | |
| "grad_norm": 0.940743625164032, | |
| "learning_rate": 7.774451097804391e-07, | |
| "loss": 0.0462, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 24.575848303393215, | |
| "grad_norm": 4.091241359710693, | |
| "learning_rate": 7.712075848303393e-07, | |
| "loss": 0.0524, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 24.70059880239521, | |
| "grad_norm": 11.099757194519043, | |
| "learning_rate": 7.649700598802394e-07, | |
| "loss": 0.0549, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 24.825349301397207, | |
| "grad_norm": 2.001067876815796, | |
| "learning_rate": 7.587325349301397e-07, | |
| "loss": 0.0485, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 24.9500998003992, | |
| "grad_norm": 0.15496690571308136, | |
| "learning_rate": 7.524950099800399e-07, | |
| "loss": 0.0537, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 0.48970088362693787, | |
| "eval_runtime": 48.0502, | |
| "eval_samples_per_second": 66.722, | |
| "eval_steps_per_second": 16.691, | |
| "step": 100200 | |
| }, | |
| { | |
| "epoch": 25.0748502994012, | |
| "grad_norm": 5.718461513519287, | |
| "learning_rate": 7.462574850299402e-07, | |
| "loss": 0.0471, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 25.199600798403193, | |
| "grad_norm": 53.097293853759766, | |
| "learning_rate": 7.400199600798403e-07, | |
| "loss": 0.0467, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 25.32435129740519, | |
| "grad_norm": 70.51046752929688, | |
| "learning_rate": 7.337824351297404e-07, | |
| "loss": 0.0464, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 25.449101796407184, | |
| "grad_norm": 6.485039234161377, | |
| "learning_rate": 7.275449101796407e-07, | |
| "loss": 0.0501, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 25.573852295409182, | |
| "grad_norm": 0.2076825648546219, | |
| "learning_rate": 7.213073852295409e-07, | |
| "loss": 0.05, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 25.698602794411176, | |
| "grad_norm": 40.60255432128906, | |
| "learning_rate": 7.15069860279441e-07, | |
| "loss": 0.0374, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 25.823353293413174, | |
| "grad_norm": 1.1958940029144287, | |
| "learning_rate": 7.088323353293413e-07, | |
| "loss": 0.0533, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 25.948103792415168, | |
| "grad_norm": 11.201072692871094, | |
| "learning_rate": 7.025948103792415e-07, | |
| "loss": 0.0388, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_loss": 0.48730549216270447, | |
| "eval_runtime": 48.7336, | |
| "eval_samples_per_second": 65.786, | |
| "eval_steps_per_second": 16.457, | |
| "step": 104208 | |
| }, | |
| { | |
| "epoch": 26.072854291417165, | |
| "grad_norm": 0.08899884670972824, | |
| "learning_rate": 6.963572854291417e-07, | |
| "loss": 0.0482, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 26.197604790419163, | |
| "grad_norm": 0.08736108243465424, | |
| "learning_rate": 6.901197604790419e-07, | |
| "loss": 0.042, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 26.322355289421157, | |
| "grad_norm": 0.050059039145708084, | |
| "learning_rate": 6.838822355289421e-07, | |
| "loss": 0.0443, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 26.447105788423155, | |
| "grad_norm": 0.3098917603492737, | |
| "learning_rate": 6.776447105788423e-07, | |
| "loss": 0.0431, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 26.57185628742515, | |
| "grad_norm": 0.601845920085907, | |
| "learning_rate": 6.714071856287425e-07, | |
| "loss": 0.0474, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 26.696606786427147, | |
| "grad_norm": 43.90340805053711, | |
| "learning_rate": 6.651696606786426e-07, | |
| "loss": 0.0546, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 26.82135728542914, | |
| "grad_norm": 0.1658441424369812, | |
| "learning_rate": 6.589321357285429e-07, | |
| "loss": 0.0463, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 26.94610778443114, | |
| "grad_norm": 0.7097954154014587, | |
| "learning_rate": 6.526946107784431e-07, | |
| "loss": 0.0413, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_loss": 0.49195966124534607, | |
| "eval_runtime": 48.5815, | |
| "eval_samples_per_second": 65.992, | |
| "eval_steps_per_second": 16.508, | |
| "step": 108216 | |
| }, | |
| { | |
| "epoch": 27.070858283433132, | |
| "grad_norm": 0.12945351004600525, | |
| "learning_rate": 6.464570858283432e-07, | |
| "loss": 0.0514, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 27.19560878243513, | |
| "grad_norm": 0.09241262078285217, | |
| "learning_rate": 6.402195608782435e-07, | |
| "loss": 0.0454, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 27.320359281437124, | |
| "grad_norm": 0.07145562022924423, | |
| "learning_rate": 6.339820359281437e-07, | |
| "loss": 0.0381, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 27.44510978043912, | |
| "grad_norm": 0.003607134334743023, | |
| "learning_rate": 6.277445109780439e-07, | |
| "loss": 0.0476, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 27.56986027944112, | |
| "grad_norm": 10.220846176147461, | |
| "learning_rate": 6.215069860279441e-07, | |
| "loss": 0.0441, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 27.694610778443113, | |
| "grad_norm": 0.18386581540107727, | |
| "learning_rate": 6.152694610778443e-07, | |
| "loss": 0.0461, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 27.81936127744511, | |
| "grad_norm": 0.26254481077194214, | |
| "learning_rate": 6.090319361277445e-07, | |
| "loss": 0.0367, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 27.944111776447105, | |
| "grad_norm": 68.7042007446289, | |
| "learning_rate": 6.027944111776448e-07, | |
| "loss": 0.0471, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_loss": 0.4870954751968384, | |
| "eval_runtime": 45.0714, | |
| "eval_samples_per_second": 71.132, | |
| "eval_steps_per_second": 17.794, | |
| "step": 112224 | |
| }, | |
| { | |
| "epoch": 28.068862275449103, | |
| "grad_norm": 0.0271464716643095, | |
| "learning_rate": 5.965568862275448e-07, | |
| "loss": 0.0433, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 28.193612774451097, | |
| "grad_norm": 0.0086235161870718, | |
| "learning_rate": 5.903193612774451e-07, | |
| "loss": 0.0475, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 28.318363273453095, | |
| "grad_norm": 0.11506126821041107, | |
| "learning_rate": 5.840818363273453e-07, | |
| "loss": 0.0353, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 28.44311377245509, | |
| "grad_norm": 10.355070114135742, | |
| "learning_rate": 5.778443113772454e-07, | |
| "loss": 0.0416, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 28.567864271457086, | |
| "grad_norm": 0.2200528234243393, | |
| "learning_rate": 5.716067864271457e-07, | |
| "loss": 0.0325, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 28.69261477045908, | |
| "grad_norm": 0.05802537873387337, | |
| "learning_rate": 5.653692614770459e-07, | |
| "loss": 0.0468, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 28.817365269461078, | |
| "grad_norm": 0.10829133540391922, | |
| "learning_rate": 5.591317365269461e-07, | |
| "loss": 0.042, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 28.942115768463076, | |
| "grad_norm": 0.162460595369339, | |
| "learning_rate": 5.528942115768463e-07, | |
| "loss": 0.049, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_loss": 0.4795687198638916, | |
| "eval_runtime": 45.1647, | |
| "eval_samples_per_second": 70.985, | |
| "eval_steps_per_second": 17.757, | |
| "step": 116232 | |
| }, | |
| { | |
| "epoch": 29.06686626746507, | |
| "grad_norm": 134.6587677001953, | |
| "learning_rate": 5.466566866267465e-07, | |
| "loss": 0.0416, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 29.191616766467067, | |
| "grad_norm": 0.09312257915735245, | |
| "learning_rate": 5.404191616766467e-07, | |
| "loss": 0.0287, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 29.31636726546906, | |
| "grad_norm": 0.3530866503715515, | |
| "learning_rate": 5.341816367265469e-07, | |
| "loss": 0.0384, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 29.44111776447106, | |
| "grad_norm": 0.033993642777204514, | |
| "learning_rate": 5.27944111776447e-07, | |
| "loss": 0.043, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 29.565868263473053, | |
| "grad_norm": 0.3124711513519287, | |
| "learning_rate": 5.217065868263473e-07, | |
| "loss": 0.04, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 29.69061876247505, | |
| "grad_norm": 10.49288272857666, | |
| "learning_rate": 5.154690618762475e-07, | |
| "loss": 0.0463, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 29.815369261477045, | |
| "grad_norm": 0.024224599823355675, | |
| "learning_rate": 5.092315369261477e-07, | |
| "loss": 0.0411, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 29.940119760479043, | |
| "grad_norm": 3.9215731620788574, | |
| "learning_rate": 5.029940119760479e-07, | |
| "loss": 0.0408, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 0.492553174495697, | |
| "eval_runtime": 46.2042, | |
| "eval_samples_per_second": 69.388, | |
| "eval_steps_per_second": 17.358, | |
| "step": 120240 | |
| }, | |
| { | |
| "epoch": 30.064870259481037, | |
| "grad_norm": 0.021667474880814552, | |
| "learning_rate": 4.967564870259481e-07, | |
| "loss": 0.0374, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 30.189620758483034, | |
| "grad_norm": 0.5888983011245728, | |
| "learning_rate": 4.905189620758483e-07, | |
| "loss": 0.0463, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 30.31437125748503, | |
| "grad_norm": 0.09637131541967392, | |
| "learning_rate": 4.842814371257485e-07, | |
| "loss": 0.033, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 30.439121756487026, | |
| "grad_norm": 0.23179832100868225, | |
| "learning_rate": 4.780439121756487e-07, | |
| "loss": 0.0402, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 30.563872255489024, | |
| "grad_norm": 0.14170564711093903, | |
| "learning_rate": 4.718063872255489e-07, | |
| "loss": 0.0395, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 30.688622754491018, | |
| "grad_norm": 0.006093321368098259, | |
| "learning_rate": 4.6556886227544903e-07, | |
| "loss": 0.0356, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 30.813373253493015, | |
| "grad_norm": 0.1018219068646431, | |
| "learning_rate": 4.593313373253493e-07, | |
| "loss": 0.0419, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 30.93812375249501, | |
| "grad_norm": 2.9131383895874023, | |
| "learning_rate": 4.5309381237524947e-07, | |
| "loss": 0.0378, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_loss": 0.5052226781845093, | |
| "eval_runtime": 43.1611, | |
| "eval_samples_per_second": 74.28, | |
| "eval_steps_per_second": 18.582, | |
| "step": 124248 | |
| }, | |
| { | |
| "epoch": 31.062874251497007, | |
| "grad_norm": 11.588695526123047, | |
| "learning_rate": 4.468562874251497e-07, | |
| "loss": 0.0346, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 31.187624750499, | |
| "grad_norm": 0.2488149255514145, | |
| "learning_rate": 4.4061876247504985e-07, | |
| "loss": 0.0351, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 31.312375249501, | |
| "grad_norm": 12.691544532775879, | |
| "learning_rate": 4.343812375249501e-07, | |
| "loss": 0.0323, | |
| "step": 125500 | |
| }, | |
| { | |
| "epoch": 31.437125748502993, | |
| "grad_norm": 0.004168800078332424, | |
| "learning_rate": 4.281437125748503e-07, | |
| "loss": 0.033, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 31.56187624750499, | |
| "grad_norm": 0.042690277099609375, | |
| "learning_rate": 4.219061876247505e-07, | |
| "loss": 0.039, | |
| "step": 126500 | |
| }, | |
| { | |
| "epoch": 31.686626746506985, | |
| "grad_norm": 1.1096973419189453, | |
| "learning_rate": 4.1566866267465066e-07, | |
| "loss": 0.0349, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 31.811377245508982, | |
| "grad_norm": 0.2642970085144043, | |
| "learning_rate": 4.094311377245509e-07, | |
| "loss": 0.0338, | |
| "step": 127500 | |
| }, | |
| { | |
| "epoch": 31.936127744510976, | |
| "grad_norm": 0.21338249742984772, | |
| "learning_rate": 4.031936127744511e-07, | |
| "loss": 0.0349, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_loss": 0.4927305281162262, | |
| "eval_runtime": 43.7641, | |
| "eval_samples_per_second": 73.256, | |
| "eval_steps_per_second": 18.326, | |
| "step": 128256 | |
| }, | |
| { | |
| "epoch": 32.060878243512974, | |
| "grad_norm": 0.1497274786233902, | |
| "learning_rate": 3.969560878243513e-07, | |
| "loss": 0.0403, | |
| "step": 128500 | |
| }, | |
| { | |
| "epoch": 32.18562874251497, | |
| "grad_norm": 0.5848351120948792, | |
| "learning_rate": 3.9071856287425147e-07, | |
| "loss": 0.037, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 32.31037924151697, | |
| "grad_norm": 0.11372077465057373, | |
| "learning_rate": 3.8448103792415166e-07, | |
| "loss": 0.0383, | |
| "step": 129500 | |
| }, | |
| { | |
| "epoch": 32.43512974051896, | |
| "grad_norm": 0.1047956719994545, | |
| "learning_rate": 3.782435129740519e-07, | |
| "loss": 0.0315, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 32.55988023952096, | |
| "grad_norm": 0.2975727617740631, | |
| "learning_rate": 3.7200598802395204e-07, | |
| "loss": 0.0264, | |
| "step": 130500 | |
| }, | |
| { | |
| "epoch": 32.684630738522955, | |
| "grad_norm": 0.2123280167579651, | |
| "learning_rate": 3.657684630738523e-07, | |
| "loss": 0.0341, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 32.80938123752495, | |
| "grad_norm": 27.63080596923828, | |
| "learning_rate": 3.5953093812375247e-07, | |
| "loss": 0.0368, | |
| "step": 131500 | |
| }, | |
| { | |
| "epoch": 32.93413173652694, | |
| "grad_norm": 0.034935545176267624, | |
| "learning_rate": 3.5329341317365266e-07, | |
| "loss": 0.0394, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_loss": 0.4937605559825897, | |
| "eval_runtime": 39.9355, | |
| "eval_samples_per_second": 80.279, | |
| "eval_steps_per_second": 20.082, | |
| "step": 132264 | |
| }, | |
| { | |
| "epoch": 33.05888223552894, | |
| "grad_norm": 0.003380158683285117, | |
| "learning_rate": 3.4705588822355285e-07, | |
| "loss": 0.0394, | |
| "step": 132500 | |
| }, | |
| { | |
| "epoch": 33.18363273453094, | |
| "grad_norm": 2.721451997756958, | |
| "learning_rate": 3.408183632734531e-07, | |
| "loss": 0.0365, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 33.308383233532936, | |
| "grad_norm": 0.4309988021850586, | |
| "learning_rate": 3.345808383233533e-07, | |
| "loss": 0.0302, | |
| "step": 133500 | |
| }, | |
| { | |
| "epoch": 33.43313373253493, | |
| "grad_norm": 0.24694228172302246, | |
| "learning_rate": 3.283433133732535e-07, | |
| "loss": 0.037, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 33.557884231536924, | |
| "grad_norm": 0.34988418221473694, | |
| "learning_rate": 3.2210578842315366e-07, | |
| "loss": 0.0258, | |
| "step": 134500 | |
| }, | |
| { | |
| "epoch": 33.68263473053892, | |
| "grad_norm": 0.19452495872974396, | |
| "learning_rate": 3.158682634730539e-07, | |
| "loss": 0.035, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 33.80738522954092, | |
| "grad_norm": 0.006651519797742367, | |
| "learning_rate": 3.096307385229541e-07, | |
| "loss": 0.0368, | |
| "step": 135500 | |
| }, | |
| { | |
| "epoch": 33.93213572854292, | |
| "grad_norm": 0.04128989204764366, | |
| "learning_rate": 3.033932135728543e-07, | |
| "loss": 0.0301, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_loss": 0.4872666597366333, | |
| "eval_runtime": 37.6035, | |
| "eval_samples_per_second": 85.258, | |
| "eval_steps_per_second": 21.328, | |
| "step": 136272 | |
| }, | |
| { | |
| "epoch": 34.05688622754491, | |
| "grad_norm": 0.05333876982331276, | |
| "learning_rate": 2.971556886227545e-07, | |
| "loss": 0.0349, | |
| "step": 136500 | |
| }, | |
| { | |
| "epoch": 34.181636726546905, | |
| "grad_norm": 1.3579726219177246, | |
| "learning_rate": 2.909181636726547e-07, | |
| "loss": 0.0285, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 34.3063872255489, | |
| "grad_norm": 0.6725994348526001, | |
| "learning_rate": 2.8468063872255486e-07, | |
| "loss": 0.0361, | |
| "step": 137500 | |
| }, | |
| { | |
| "epoch": 34.4311377245509, | |
| "grad_norm": 0.03919246420264244, | |
| "learning_rate": 2.7844311377245504e-07, | |
| "loss": 0.0274, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 34.55588822355289, | |
| "grad_norm": 35.5837287902832, | |
| "learning_rate": 2.722055888223553e-07, | |
| "loss": 0.0363, | |
| "step": 138500 | |
| }, | |
| { | |
| "epoch": 34.68063872255489, | |
| "grad_norm": 0.007728968746960163, | |
| "learning_rate": 2.659680638722555e-07, | |
| "loss": 0.0391, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 34.80538922155689, | |
| "grad_norm": 0.07272203266620636, | |
| "learning_rate": 2.5973053892215567e-07, | |
| "loss": 0.0268, | |
| "step": 139500 | |
| }, | |
| { | |
| "epoch": 34.930139720558884, | |
| "grad_norm": 0.33094656467437744, | |
| "learning_rate": 2.5349301397205586e-07, | |
| "loss": 0.0365, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_loss": 0.4920032024383545, | |
| "eval_runtime": 40.4781, | |
| "eval_samples_per_second": 79.203, | |
| "eval_steps_per_second": 19.813, | |
| "step": 140280 | |
| }, | |
| { | |
| "epoch": 35.054890219560875, | |
| "grad_norm": 191.99266052246094, | |
| "learning_rate": 2.472554890219561e-07, | |
| "loss": 0.0333, | |
| "step": 140500 | |
| }, | |
| { | |
| "epoch": 35.17964071856287, | |
| "grad_norm": 0.002573936013504863, | |
| "learning_rate": 2.410179640718563e-07, | |
| "loss": 0.0327, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 35.30439121756487, | |
| "grad_norm": 0.04750495404005051, | |
| "learning_rate": 2.3478043912175645e-07, | |
| "loss": 0.0345, | |
| "step": 141500 | |
| }, | |
| { | |
| "epoch": 35.42914171656687, | |
| "grad_norm": 193.8626251220703, | |
| "learning_rate": 2.2854291417165667e-07, | |
| "loss": 0.0321, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 35.553892215568865, | |
| "grad_norm": 0.0009173236903734505, | |
| "learning_rate": 2.2230538922155686e-07, | |
| "loss": 0.0359, | |
| "step": 142500 | |
| }, | |
| { | |
| "epoch": 35.678642714570856, | |
| "grad_norm": 0.12355954945087433, | |
| "learning_rate": 2.1606786427145708e-07, | |
| "loss": 0.0347, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 35.80339321357285, | |
| "grad_norm": 0.24140344560146332, | |
| "learning_rate": 2.0983033932135726e-07, | |
| "loss": 0.031, | |
| "step": 143500 | |
| }, | |
| { | |
| "epoch": 35.92814371257485, | |
| "grad_norm": 0.007129414472728968, | |
| "learning_rate": 2.0359281437125748e-07, | |
| "loss": 0.0214, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_loss": 0.4941750466823578, | |
| "eval_runtime": 38.7085, | |
| "eval_samples_per_second": 82.824, | |
| "eval_steps_per_second": 20.719, | |
| "step": 144288 | |
| }, | |
| { | |
| "epoch": 36.05289421157685, | |
| "grad_norm": 0.27973344922065735, | |
| "learning_rate": 1.9735528942115767e-07, | |
| "loss": 0.0331, | |
| "step": 144500 | |
| }, | |
| { | |
| "epoch": 36.17764471057884, | |
| "grad_norm": 0.05331612005829811, | |
| "learning_rate": 1.911177644710579e-07, | |
| "loss": 0.0303, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 36.30239520958084, | |
| "grad_norm": 1.8135106563568115, | |
| "learning_rate": 1.8488023952095808e-07, | |
| "loss": 0.0349, | |
| "step": 145500 | |
| }, | |
| { | |
| "epoch": 36.427145708582835, | |
| "grad_norm": 0.13009090721607208, | |
| "learning_rate": 1.7864271457085827e-07, | |
| "loss": 0.0405, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 36.55189620758483, | |
| "grad_norm": 0.07144490629434586, | |
| "learning_rate": 1.7240518962075848e-07, | |
| "loss": 0.0377, | |
| "step": 146500 | |
| }, | |
| { | |
| "epoch": 36.67664670658683, | |
| "grad_norm": 74.39689636230469, | |
| "learning_rate": 1.6616766467065867e-07, | |
| "loss": 0.0278, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 36.80139720558882, | |
| "grad_norm": 0.08526595681905746, | |
| "learning_rate": 1.599301397205589e-07, | |
| "loss": 0.0306, | |
| "step": 147500 | |
| }, | |
| { | |
| "epoch": 36.92614770459082, | |
| "grad_norm": 12.262850761413574, | |
| "learning_rate": 1.5369261477045908e-07, | |
| "loss": 0.0314, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_loss": 0.49442577362060547, | |
| "eval_runtime": 42.7404, | |
| "eval_samples_per_second": 75.011, | |
| "eval_steps_per_second": 18.764, | |
| "step": 148296 | |
| }, | |
| { | |
| "epoch": 37.050898203592816, | |
| "grad_norm": 0.02493446320295334, | |
| "learning_rate": 1.474550898203593e-07, | |
| "loss": 0.0262, | |
| "step": 148500 | |
| }, | |
| { | |
| "epoch": 37.17564870259481, | |
| "grad_norm": 0.14130648970603943, | |
| "learning_rate": 1.4121756487025949e-07, | |
| "loss": 0.0281, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 37.300399201596804, | |
| "grad_norm": 0.035768117755651474, | |
| "learning_rate": 1.3498003992015965e-07, | |
| "loss": 0.0255, | |
| "step": 149500 | |
| }, | |
| { | |
| "epoch": 37.4251497005988, | |
| "grad_norm": 0.18820720911026, | |
| "learning_rate": 1.2874251497005986e-07, | |
| "loss": 0.032, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 37.5499001996008, | |
| "grad_norm": 0.37001463770866394, | |
| "learning_rate": 1.2250499001996008e-07, | |
| "loss": 0.0301, | |
| "step": 150500 | |
| }, | |
| { | |
| "epoch": 37.6746506986028, | |
| "grad_norm": 0.06626907736063004, | |
| "learning_rate": 1.1626746506986028e-07, | |
| "loss": 0.0238, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 37.79940119760479, | |
| "grad_norm": 19.17169189453125, | |
| "learning_rate": 1.1002994011976049e-07, | |
| "loss": 0.0385, | |
| "step": 151500 | |
| }, | |
| { | |
| "epoch": 37.924151696606785, | |
| "grad_norm": 4.972864627838135, | |
| "learning_rate": 1.0379241516966066e-07, | |
| "loss": 0.0337, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_loss": 0.48605817556381226, | |
| "eval_runtime": 40.1954, | |
| "eval_samples_per_second": 79.76, | |
| "eval_steps_per_second": 19.953, | |
| "step": 152304 | |
| }, | |
| { | |
| "epoch": 38.04890219560878, | |
| "grad_norm": 0.002587054157629609, | |
| "learning_rate": 9.755489021956087e-08, | |
| "loss": 0.0334, | |
| "step": 152500 | |
| }, | |
| { | |
| "epoch": 38.17365269461078, | |
| "grad_norm": 70.7108383178711, | |
| "learning_rate": 9.131736526946107e-08, | |
| "loss": 0.0319, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 38.29840319361278, | |
| "grad_norm": 0.5694107413291931, | |
| "learning_rate": 8.507984031936127e-08, | |
| "loss": 0.0313, | |
| "step": 153500 | |
| }, | |
| { | |
| "epoch": 38.42315369261477, | |
| "grad_norm": 0.003176228841766715, | |
| "learning_rate": 7.884231536926148e-08, | |
| "loss": 0.0298, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 38.547904191616766, | |
| "grad_norm": 0.004230498801916838, | |
| "learning_rate": 7.260479041916168e-08, | |
| "loss": 0.0284, | |
| "step": 154500 | |
| }, | |
| { | |
| "epoch": 38.672654690618764, | |
| "grad_norm": 0.13844607770442963, | |
| "learning_rate": 6.636726546906188e-08, | |
| "loss": 0.0305, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 38.79740518962076, | |
| "grad_norm": 0.05394995957612991, | |
| "learning_rate": 6.012974051896207e-08, | |
| "loss": 0.0269, | |
| "step": 155500 | |
| }, | |
| { | |
| "epoch": 38.92215568862275, | |
| "grad_norm": 0.11763022094964981, | |
| "learning_rate": 5.3892215568862274e-08, | |
| "loss": 0.0279, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_loss": 0.4873499870300293, | |
| "eval_runtime": 44.0281, | |
| "eval_samples_per_second": 72.817, | |
| "eval_steps_per_second": 18.216, | |
| "step": 156312 | |
| }, | |
| { | |
| "epoch": 39.04690618762475, | |
| "grad_norm": 0.22139760851860046, | |
| "learning_rate": 4.765469061876248e-08, | |
| "loss": 0.0255, | |
| "step": 156500 | |
| }, | |
| { | |
| "epoch": 39.17165668662675, | |
| "grad_norm": 0.002428988926112652, | |
| "learning_rate": 4.1417165668662674e-08, | |
| "loss": 0.0302, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 39.296407185628745, | |
| "grad_norm": 0.07879871129989624, | |
| "learning_rate": 3.517964071856287e-08, | |
| "loss": 0.027, | |
| "step": 157500 | |
| }, | |
| { | |
| "epoch": 39.421157684630735, | |
| "grad_norm": 0.03594490885734558, | |
| "learning_rate": 2.8942115768463073e-08, | |
| "loss": 0.033, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 39.54590818363273, | |
| "grad_norm": 0.12444847822189331, | |
| "learning_rate": 2.2704590818363273e-08, | |
| "loss": 0.0271, | |
| "step": 158500 | |
| }, | |
| { | |
| "epoch": 39.67065868263473, | |
| "grad_norm": 47.82669448852539, | |
| "learning_rate": 1.6467065868263473e-08, | |
| "loss": 0.0276, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 39.79540918163673, | |
| "grad_norm": 0.1385308802127838, | |
| "learning_rate": 1.0229540918163672e-08, | |
| "loss": 0.03, | |
| "step": 159500 | |
| }, | |
| { | |
| "epoch": 39.920159680638726, | |
| "grad_norm": 0.1429419070482254, | |
| "learning_rate": 3.992015968063871e-09, | |
| "loss": 0.0303, | |
| "step": 160000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 160320, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 40, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.21770798769152e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |