| { | |
| "best_metric": 0.6869426704202687, | |
| "best_model_checkpoint": "/userstorage/modernbert-llm-grader/checkpoint-31216", | |
| "epoch": 4.0, | |
| "eval_steps": 500, | |
| "global_step": 31216, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.012813941568426449, | |
| "grad_norm": 5.321617603302002, | |
| "learning_rate": 4.987186058431574e-05, | |
| "loss": 1.4033, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.025627883136852898, | |
| "grad_norm": 3.621730089187622, | |
| "learning_rate": 4.974372116863147e-05, | |
| "loss": 1.3035, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03844182470527934, | |
| "grad_norm": 7.95962381362915, | |
| "learning_rate": 4.961558175294721e-05, | |
| "loss": 1.2506, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.051255766273705795, | |
| "grad_norm": 3.631398916244507, | |
| "learning_rate": 4.9487442337262944e-05, | |
| "loss": 1.2354, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.06406970784213224, | |
| "grad_norm": 2.6680333614349365, | |
| "learning_rate": 4.935930292157868e-05, | |
| "loss": 1.2397, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07688364941055868, | |
| "grad_norm": 6.042360305786133, | |
| "learning_rate": 4.9231163505894415e-05, | |
| "loss": 1.1811, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.08969759097898514, | |
| "grad_norm": 6.7501959800720215, | |
| "learning_rate": 4.9103024090210154e-05, | |
| "loss": 1.1947, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.10251153254741159, | |
| "grad_norm": 3.4089736938476562, | |
| "learning_rate": 4.8974884674525886e-05, | |
| "loss": 1.1812, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.11532547411583803, | |
| "grad_norm": 5.0775604248046875, | |
| "learning_rate": 4.884674525884162e-05, | |
| "loss": 1.1752, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.12813941568426448, | |
| "grad_norm": 4.529630184173584, | |
| "learning_rate": 4.8718605843157357e-05, | |
| "loss": 1.1867, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.14095335725269093, | |
| "grad_norm": 4.961220741271973, | |
| "learning_rate": 4.859046642747309e-05, | |
| "loss": 1.2129, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.15376729882111737, | |
| "grad_norm": 4.113813400268555, | |
| "learning_rate": 4.846232701178883e-05, | |
| "loss": 1.1293, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.16658124038954383, | |
| "grad_norm": 7.25917387008667, | |
| "learning_rate": 4.8334187596104566e-05, | |
| "loss": 1.1008, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.17939518195797027, | |
| "grad_norm": 5.579372882843018, | |
| "learning_rate": 4.82060481804203e-05, | |
| "loss": 1.1327, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.1922091235263967, | |
| "grad_norm": 9.794898986816406, | |
| "learning_rate": 4.807790876473604e-05, | |
| "loss": 1.1355, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.20502306509482318, | |
| "grad_norm": 9.875951766967773, | |
| "learning_rate": 4.794976934905177e-05, | |
| "loss": 1.0057, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.21783700666324962, | |
| "grad_norm": 7.271333694458008, | |
| "learning_rate": 4.782162993336751e-05, | |
| "loss": 1.1101, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.23065094823167606, | |
| "grad_norm": 6.730026721954346, | |
| "learning_rate": 4.769349051768324e-05, | |
| "loss": 1.0762, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2434648898001025, | |
| "grad_norm": 5.596224784851074, | |
| "learning_rate": 4.756535110199898e-05, | |
| "loss": 1.0413, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.25627883136852897, | |
| "grad_norm": 4.591865539550781, | |
| "learning_rate": 4.743721168631472e-05, | |
| "loss": 1.0593, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.2690927729369554, | |
| "grad_norm": 6.357232570648193, | |
| "learning_rate": 4.730907227063045e-05, | |
| "loss": 1.0434, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.28190671450538185, | |
| "grad_norm": 5.185873508453369, | |
| "learning_rate": 4.718093285494619e-05, | |
| "loss": 1.021, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.2947206560738083, | |
| "grad_norm": 8.19482135772705, | |
| "learning_rate": 4.705279343926192e-05, | |
| "loss": 1.1102, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.30753459764223473, | |
| "grad_norm": 6.1499176025390625, | |
| "learning_rate": 4.692465402357765e-05, | |
| "loss": 1.0115, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.3203485392106612, | |
| "grad_norm": 11.092570304870605, | |
| "learning_rate": 4.679651460789339e-05, | |
| "loss": 0.9576, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.33316248077908767, | |
| "grad_norm": 5.10243034362793, | |
| "learning_rate": 4.666837519220912e-05, | |
| "loss": 1.0674, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.3459764223475141, | |
| "grad_norm": 4.633431434631348, | |
| "learning_rate": 4.654023577652486e-05, | |
| "loss": 1.0004, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.35879036391594055, | |
| "grad_norm": 5.507874488830566, | |
| "learning_rate": 4.6412096360840594e-05, | |
| "loss": 1.0439, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.371604305484367, | |
| "grad_norm": 5.591798305511475, | |
| "learning_rate": 4.628395694515633e-05, | |
| "loss": 1.0509, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.3844182470527934, | |
| "grad_norm": 4.341959476470947, | |
| "learning_rate": 4.6155817529472065e-05, | |
| "loss": 0.9864, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.3972321886212199, | |
| "grad_norm": 3.6542482376098633, | |
| "learning_rate": 4.6027678113787804e-05, | |
| "loss": 0.9897, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.41004613018964636, | |
| "grad_norm": 6.769758701324463, | |
| "learning_rate": 4.589953869810354e-05, | |
| "loss": 1.0528, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.4228600717580728, | |
| "grad_norm": 5.762277603149414, | |
| "learning_rate": 4.5771399282419274e-05, | |
| "loss": 1.0036, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.43567401332649924, | |
| "grad_norm": 7.389179229736328, | |
| "learning_rate": 4.564325986673501e-05, | |
| "loss": 1.0304, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.44848795489492566, | |
| "grad_norm": 3.9039294719696045, | |
| "learning_rate": 4.5515120451050745e-05, | |
| "loss": 0.9962, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.4613018964633521, | |
| "grad_norm": 3.0561447143554688, | |
| "learning_rate": 4.5386981035366484e-05, | |
| "loss": 0.9777, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.4741158380317786, | |
| "grad_norm": 6.340303897857666, | |
| "learning_rate": 4.5258841619682216e-05, | |
| "loss": 0.9603, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.486929779600205, | |
| "grad_norm": 9.058144569396973, | |
| "learning_rate": 4.5130702203997955e-05, | |
| "loss": 0.9658, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.49974372116863147, | |
| "grad_norm": 8.219672203063965, | |
| "learning_rate": 4.500256278831369e-05, | |
| "loss": 0.9856, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5125576627370579, | |
| "grad_norm": 3.6466543674468994, | |
| "learning_rate": 4.487442337262942e-05, | |
| "loss": 0.9734, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5253716043054844, | |
| "grad_norm": 7.289781093597412, | |
| "learning_rate": 4.474628395694516e-05, | |
| "loss": 0.9045, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.5381855458739108, | |
| "grad_norm": 6.18227481842041, | |
| "learning_rate": 4.461814454126089e-05, | |
| "loss": 0.9679, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.5509994874423373, | |
| "grad_norm": 3.994476318359375, | |
| "learning_rate": 4.449000512557663e-05, | |
| "loss": 0.8958, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.5638134290107637, | |
| "grad_norm": 3.913896322250366, | |
| "learning_rate": 4.436186570989236e-05, | |
| "loss": 0.9453, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.5766273705791901, | |
| "grad_norm": 4.39192008972168, | |
| "learning_rate": 4.42337262942081e-05, | |
| "loss": 0.9132, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.5894413121476166, | |
| "grad_norm": 5.574671745300293, | |
| "learning_rate": 4.410558687852384e-05, | |
| "loss": 0.9069, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.602255253716043, | |
| "grad_norm": 4.218778610229492, | |
| "learning_rate": 4.397744746283957e-05, | |
| "loss": 0.9631, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.6150691952844695, | |
| "grad_norm": 7.804980754852295, | |
| "learning_rate": 4.384930804715531e-05, | |
| "loss": 0.9121, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.627883136852896, | |
| "grad_norm": 7.064172744750977, | |
| "learning_rate": 4.372116863147104e-05, | |
| "loss": 0.9387, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.6406970784213224, | |
| "grad_norm": 5.293111324310303, | |
| "learning_rate": 4.359302921578678e-05, | |
| "loss": 0.9264, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.6535110199897488, | |
| "grad_norm": 7.019448757171631, | |
| "learning_rate": 4.346488980010251e-05, | |
| "loss": 0.9452, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.6663249615581753, | |
| "grad_norm": 6.714709758758545, | |
| "learning_rate": 4.333675038441825e-05, | |
| "loss": 0.8648, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.6791389031266017, | |
| "grad_norm": 8.232748031616211, | |
| "learning_rate": 4.320861096873399e-05, | |
| "loss": 0.904, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.6919528446950282, | |
| "grad_norm": 9.853933334350586, | |
| "learning_rate": 4.308047155304972e-05, | |
| "loss": 0.8895, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.7047667862634547, | |
| "grad_norm": 6.8710455894470215, | |
| "learning_rate": 4.2952332137365454e-05, | |
| "loss": 0.86, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.7175807278318811, | |
| "grad_norm": 6.45287561416626, | |
| "learning_rate": 4.2824192721681186e-05, | |
| "loss": 0.8718, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.7303946694003075, | |
| "grad_norm": 5.772899627685547, | |
| "learning_rate": 4.2696053305996924e-05, | |
| "loss": 0.8477, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.743208610968734, | |
| "grad_norm": 6.193540573120117, | |
| "learning_rate": 4.256791389031266e-05, | |
| "loss": 0.9184, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.7560225525371604, | |
| "grad_norm": 2.5397393703460693, | |
| "learning_rate": 4.2439774474628395e-05, | |
| "loss": 0.9537, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.7688364941055869, | |
| "grad_norm": 8.280569076538086, | |
| "learning_rate": 4.2311635058944134e-05, | |
| "loss": 0.8988, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.7816504356740134, | |
| "grad_norm": 10.563502311706543, | |
| "learning_rate": 4.2183495643259866e-05, | |
| "loss": 0.8518, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.7944643772424398, | |
| "grad_norm": 3.090008497238159, | |
| "learning_rate": 4.2055356227575605e-05, | |
| "loss": 0.8731, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.8072783188108662, | |
| "grad_norm": 4.051167011260986, | |
| "learning_rate": 4.192721681189134e-05, | |
| "loss": 0.8713, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.8200922603792927, | |
| "grad_norm": 7.207763671875, | |
| "learning_rate": 4.1799077396207076e-05, | |
| "loss": 0.8781, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.8329062019477191, | |
| "grad_norm": 6.396823883056641, | |
| "learning_rate": 4.1670937980522815e-05, | |
| "loss": 0.8231, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.8457201435161456, | |
| "grad_norm": 6.260582447052002, | |
| "learning_rate": 4.1542798564838547e-05, | |
| "loss": 0.8658, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.858534085084572, | |
| "grad_norm": 8.35356616973877, | |
| "learning_rate": 4.1414659149154285e-05, | |
| "loss": 0.8637, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.8713480266529985, | |
| "grad_norm": 7.236725330352783, | |
| "learning_rate": 4.128651973347002e-05, | |
| "loss": 0.8525, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.8841619682214249, | |
| "grad_norm": 14.001522064208984, | |
| "learning_rate": 4.1158380317785756e-05, | |
| "loss": 0.8628, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.8969759097898513, | |
| "grad_norm": 4.257541179656982, | |
| "learning_rate": 4.103024090210149e-05, | |
| "loss": 0.8443, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.9097898513582778, | |
| "grad_norm": 5.065970420837402, | |
| "learning_rate": 4.090210148641722e-05, | |
| "loss": 0.8329, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.9226037929267042, | |
| "grad_norm": 6.647068977355957, | |
| "learning_rate": 4.077396207073296e-05, | |
| "loss": 0.8585, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.9354177344951307, | |
| "grad_norm": 8.440242767333984, | |
| "learning_rate": 4.064582265504869e-05, | |
| "loss": 0.8749, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.9482316760635572, | |
| "grad_norm": 7.684078216552734, | |
| "learning_rate": 4.051768323936443e-05, | |
| "loss": 0.7771, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.9610456176319836, | |
| "grad_norm": 6.4709577560424805, | |
| "learning_rate": 4.038954382368016e-05, | |
| "loss": 0.8597, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.97385955920041, | |
| "grad_norm": 4.3970489501953125, | |
| "learning_rate": 4.02614044079959e-05, | |
| "loss": 0.7852, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.9866735007688365, | |
| "grad_norm": 9.167794227600098, | |
| "learning_rate": 4.013326499231164e-05, | |
| "loss": 0.8563, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.9994874423372629, | |
| "grad_norm": 6.251096248626709, | |
| "learning_rate": 4.000512557662737e-05, | |
| "loss": 0.8243, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_f1": 0.640692076906927, | |
| "eval_loss": 0.8794865608215332, | |
| "eval_runtime": 744.6214, | |
| "eval_samples_per_second": 10.48, | |
| "eval_steps_per_second": 2.62, | |
| "step": 7804 | |
| }, | |
| { | |
| "epoch": 1.0123013839056894, | |
| "grad_norm": 5.928829669952393, | |
| "learning_rate": 3.987698616094311e-05, | |
| "loss": 0.7046, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.0251153254741159, | |
| "grad_norm": 2.885106086730957, | |
| "learning_rate": 3.974884674525884e-05, | |
| "loss": 0.7663, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.0379292670425422, | |
| "grad_norm": 5.951350212097168, | |
| "learning_rate": 3.962070732957458e-05, | |
| "loss": 0.7374, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.0507432086109687, | |
| "grad_norm": 2.5160486698150635, | |
| "learning_rate": 3.949256791389031e-05, | |
| "loss": 0.7126, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.0635571501793952, | |
| "grad_norm": 6.847401142120361, | |
| "learning_rate": 3.936442849820605e-05, | |
| "loss": 0.6785, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.0763710917478215, | |
| "grad_norm": 4.729136943817139, | |
| "learning_rate": 3.923628908252179e-05, | |
| "loss": 0.7085, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.089185033316248, | |
| "grad_norm": 5.535890102386475, | |
| "learning_rate": 3.910814966683752e-05, | |
| "loss": 0.7548, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.1019989748846746, | |
| "grad_norm": 6.188892364501953, | |
| "learning_rate": 3.8980010251153255e-05, | |
| "loss": 0.7193, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.1148129164531009, | |
| "grad_norm": 5.806282997131348, | |
| "learning_rate": 3.885187083546899e-05, | |
| "loss": 0.7143, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.1276268580215274, | |
| "grad_norm": 10.726571083068848, | |
| "learning_rate": 3.8723731419784726e-05, | |
| "loss": 0.6892, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.140440799589954, | |
| "grad_norm": 7.0307512283325195, | |
| "learning_rate": 3.8595592004100465e-05, | |
| "loss": 0.7264, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.1532547411583802, | |
| "grad_norm": 20.715412139892578, | |
| "learning_rate": 3.8467452588416197e-05, | |
| "loss": 0.6987, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.1660686827268067, | |
| "grad_norm": 6.620629787445068, | |
| "learning_rate": 3.8339313172731935e-05, | |
| "loss": 0.7041, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.1788826242952333, | |
| "grad_norm": 5.27125883102417, | |
| "learning_rate": 3.821117375704767e-05, | |
| "loss": 0.67, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.1916965658636596, | |
| "grad_norm": 6.010765552520752, | |
| "learning_rate": 3.8083034341363406e-05, | |
| "loss": 0.6737, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.204510507432086, | |
| "grad_norm": 14.393863677978516, | |
| "learning_rate": 3.795489492567914e-05, | |
| "loss": 0.7097, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.2173244490005126, | |
| "grad_norm": 6.37823486328125, | |
| "learning_rate": 3.782675550999488e-05, | |
| "loss": 0.7157, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.230138390568939, | |
| "grad_norm": 11.626152992248535, | |
| "learning_rate": 3.7698616094310616e-05, | |
| "loss": 0.7066, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.2429523321373654, | |
| "grad_norm": 5.520190238952637, | |
| "learning_rate": 3.757047667862635e-05, | |
| "loss": 0.7303, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.255766273705792, | |
| "grad_norm": 9.865089416503906, | |
| "learning_rate": 3.744233726294209e-05, | |
| "loss": 0.7559, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.2685802152742183, | |
| "grad_norm": 7.075952529907227, | |
| "learning_rate": 3.731419784725782e-05, | |
| "loss": 0.6941, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.2813941568426448, | |
| "grad_norm": 3.4892656803131104, | |
| "learning_rate": 3.718605843157356e-05, | |
| "loss": 0.7164, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.2942080984110713, | |
| "grad_norm": 9.843413352966309, | |
| "learning_rate": 3.705791901588929e-05, | |
| "loss": 0.695, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.3070220399794976, | |
| "grad_norm": 12.128110885620117, | |
| "learning_rate": 3.692977960020502e-05, | |
| "loss": 0.6563, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.3198359815479241, | |
| "grad_norm": 11.26876163482666, | |
| "learning_rate": 3.680164018452076e-05, | |
| "loss": 0.6803, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.3326499231163507, | |
| "grad_norm": 12.95758056640625, | |
| "learning_rate": 3.667350076883649e-05, | |
| "loss": 0.6864, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.345463864684777, | |
| "grad_norm": 4.91602897644043, | |
| "learning_rate": 3.654536135315223e-05, | |
| "loss": 0.7184, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.3582778062532035, | |
| "grad_norm": 4.799069881439209, | |
| "learning_rate": 3.641722193746796e-05, | |
| "loss": 0.7558, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.37109174782163, | |
| "grad_norm": 64.9485855102539, | |
| "learning_rate": 3.62890825217837e-05, | |
| "loss": 0.7292, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.3839056893900563, | |
| "grad_norm": 6.147428512573242, | |
| "learning_rate": 3.616094310609944e-05, | |
| "loss": 0.6623, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.3967196309584828, | |
| "grad_norm": 7.638481140136719, | |
| "learning_rate": 3.603280369041517e-05, | |
| "loss": 0.6981, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.4095335725269091, | |
| "grad_norm": 4.798500061035156, | |
| "learning_rate": 3.590466427473091e-05, | |
| "loss": 0.7569, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.4223475140953357, | |
| "grad_norm": 4.413691520690918, | |
| "learning_rate": 3.5776524859046644e-05, | |
| "loss": 0.6391, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.4351614556637622, | |
| "grad_norm": 6.2526421546936035, | |
| "learning_rate": 3.564838544336238e-05, | |
| "loss": 0.7045, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.4479753972321885, | |
| "grad_norm": 6.3732805252075195, | |
| "learning_rate": 3.5520246027678114e-05, | |
| "loss": 0.6916, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.460789338800615, | |
| "grad_norm": 25.24698829650879, | |
| "learning_rate": 3.539210661199385e-05, | |
| "loss": 0.7652, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.4736032803690415, | |
| "grad_norm": 4.716599941253662, | |
| "learning_rate": 3.5263967196309585e-05, | |
| "loss": 0.7199, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.4864172219374678, | |
| "grad_norm": 13.750917434692383, | |
| "learning_rate": 3.5135827780625324e-05, | |
| "loss": 0.7032, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.4992311635058944, | |
| "grad_norm": 3.6678273677825928, | |
| "learning_rate": 3.500768836494106e-05, | |
| "loss": 0.6821, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.5120451050743209, | |
| "grad_norm": 7.891080856323242, | |
| "learning_rate": 3.487954894925679e-05, | |
| "loss": 0.7301, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.5248590466427472, | |
| "grad_norm": 3.25317645072937, | |
| "learning_rate": 3.475140953357253e-05, | |
| "loss": 0.6665, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.5376729882111737, | |
| "grad_norm": 12.75395679473877, | |
| "learning_rate": 3.462327011788826e-05, | |
| "loss": 0.733, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.5504869297796002, | |
| "grad_norm": 10.9820556640625, | |
| "learning_rate": 3.4495130702204e-05, | |
| "loss": 0.7064, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.5633008713480265, | |
| "grad_norm": 6.558383941650391, | |
| "learning_rate": 3.4366991286519737e-05, | |
| "loss": 0.7105, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.576114812916453, | |
| "grad_norm": 8.5501070022583, | |
| "learning_rate": 3.423885187083547e-05, | |
| "loss": 0.706, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.5889287544848796, | |
| "grad_norm": 5.319694995880127, | |
| "learning_rate": 3.411071245515121e-05, | |
| "loss": 0.7239, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.6017426960533059, | |
| "grad_norm": 5.92519474029541, | |
| "learning_rate": 3.398257303946694e-05, | |
| "loss": 0.7043, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.6145566376217324, | |
| "grad_norm": 8.853275299072266, | |
| "learning_rate": 3.385443362378268e-05, | |
| "loss": 0.6831, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.627370579190159, | |
| "grad_norm": 9.30588150024414, | |
| "learning_rate": 3.372629420809841e-05, | |
| "loss": 0.6756, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.6401845207585852, | |
| "grad_norm": 5.903197288513184, | |
| "learning_rate": 3.359815479241415e-05, | |
| "loss": 0.725, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.6529984623270118, | |
| "grad_norm": 5.500326156616211, | |
| "learning_rate": 3.347001537672989e-05, | |
| "loss": 0.6801, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.6658124038954383, | |
| "grad_norm": 7.896096229553223, | |
| "learning_rate": 3.334187596104562e-05, | |
| "loss": 0.6975, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.6786263454638646, | |
| "grad_norm": 6.674001216888428, | |
| "learning_rate": 3.321373654536136e-05, | |
| "loss": 0.6681, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.691440287032291, | |
| "grad_norm": 21.74435806274414, | |
| "learning_rate": 3.308559712967709e-05, | |
| "loss": 0.7045, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.7042542286007176, | |
| "grad_norm": 6.329532146453857, | |
| "learning_rate": 3.295745771399282e-05, | |
| "loss": 0.6885, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.717068170169144, | |
| "grad_norm": 24.047470092773438, | |
| "learning_rate": 3.282931829830856e-05, | |
| "loss": 0.7003, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.7298821117375704, | |
| "grad_norm": 7.407759666442871, | |
| "learning_rate": 3.2701178882624294e-05, | |
| "loss": 0.6856, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.742696053305997, | |
| "grad_norm": 5.755215167999268, | |
| "learning_rate": 3.257303946694003e-05, | |
| "loss": 0.7005, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.7555099948744233, | |
| "grad_norm": 11.444562911987305, | |
| "learning_rate": 3.2444900051255764e-05, | |
| "loss": 0.7136, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.7683239364428498, | |
| "grad_norm": 8.267853736877441, | |
| "learning_rate": 3.23167606355715e-05, | |
| "loss": 0.7029, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.7811378780112763, | |
| "grad_norm": 6.73785924911499, | |
| "learning_rate": 3.2188621219887235e-05, | |
| "loss": 0.6572, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.7939518195797026, | |
| "grad_norm": 5.369395732879639, | |
| "learning_rate": 3.2060481804202974e-05, | |
| "loss": 0.6617, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.8067657611481291, | |
| "grad_norm": 2.288243293762207, | |
| "learning_rate": 3.193234238851871e-05, | |
| "loss": 0.6688, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.8195797027165557, | |
| "grad_norm": 14.942804336547852, | |
| "learning_rate": 3.1804202972834445e-05, | |
| "loss": 0.6792, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.832393644284982, | |
| "grad_norm": 8.988631248474121, | |
| "learning_rate": 3.1676063557150184e-05, | |
| "loss": 0.6518, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.8452075858534085, | |
| "grad_norm": 7.9590630531311035, | |
| "learning_rate": 3.1547924141465916e-05, | |
| "loss": 0.6503, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.858021527421835, | |
| "grad_norm": 9.33973503112793, | |
| "learning_rate": 3.1419784725781655e-05, | |
| "loss": 0.6647, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.8708354689902613, | |
| "grad_norm": 9.39842700958252, | |
| "learning_rate": 3.1291645310097387e-05, | |
| "loss": 0.6515, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.8836494105586878, | |
| "grad_norm": 10.142439842224121, | |
| "learning_rate": 3.1163505894413125e-05, | |
| "loss": 0.6794, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.8964633521271144, | |
| "grad_norm": 11.658042907714844, | |
| "learning_rate": 3.1035366478728864e-05, | |
| "loss": 0.6931, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.9092772936955407, | |
| "grad_norm": 8.672663688659668, | |
| "learning_rate": 3.090722706304459e-05, | |
| "loss": 0.6377, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.9220912352639672, | |
| "grad_norm": 6.620725631713867, | |
| "learning_rate": 3.077908764736033e-05, | |
| "loss": 0.7044, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.9349051768323937, | |
| "grad_norm": 8.3103609085083, | |
| "learning_rate": 3.065094823167606e-05, | |
| "loss": 0.641, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.94771911840082, | |
| "grad_norm": 8.163315773010254, | |
| "learning_rate": 3.05228088159918e-05, | |
| "loss": 0.7094, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.9605330599692465, | |
| "grad_norm": 3.6365621089935303, | |
| "learning_rate": 3.0394669400307534e-05, | |
| "loss": 0.7022, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.973347001537673, | |
| "grad_norm": 4.264801502227783, | |
| "learning_rate": 3.026652998462327e-05, | |
| "loss": 0.6833, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.9861609431060994, | |
| "grad_norm": 6.547428131103516, | |
| "learning_rate": 3.0138390568939005e-05, | |
| "loss": 0.6126, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.9989748846745259, | |
| "grad_norm": 6.155936241149902, | |
| "learning_rate": 3.0010251153254744e-05, | |
| "loss": 0.6851, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_f1": 0.6772664805551888, | |
| "eval_loss": 0.781230092048645, | |
| "eval_runtime": 778.3436, | |
| "eval_samples_per_second": 10.026, | |
| "eval_steps_per_second": 2.507, | |
| "step": 15608 | |
| }, | |
| { | |
| "epoch": 2.0117888262429524, | |
| "grad_norm": 8.777030944824219, | |
| "learning_rate": 2.988211173757048e-05, | |
| "loss": 0.4707, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.0246027678113787, | |
| "grad_norm": 4.798321723937988, | |
| "learning_rate": 2.9753972321886215e-05, | |
| "loss": 0.4366, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.037416709379805, | |
| "grad_norm": 2.5244762897491455, | |
| "learning_rate": 2.962583290620195e-05, | |
| "loss": 0.504, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.0502306509482318, | |
| "grad_norm": 15.636524200439453, | |
| "learning_rate": 2.9497693490517686e-05, | |
| "loss": 0.4234, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.063044592516658, | |
| "grad_norm": 8.811060905456543, | |
| "learning_rate": 2.936955407483342e-05, | |
| "loss": 0.3911, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.0758585340850844, | |
| "grad_norm": 4.1310930252075195, | |
| "learning_rate": 2.9241414659149157e-05, | |
| "loss": 0.4538, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.088672475653511, | |
| "grad_norm": 9.516937255859375, | |
| "learning_rate": 2.9113275243464892e-05, | |
| "loss": 0.4461, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.1014864172219374, | |
| "grad_norm": 4.6523756980896, | |
| "learning_rate": 2.8985135827780624e-05, | |
| "loss": 0.4808, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.1143003587903637, | |
| "grad_norm": 4.160647392272949, | |
| "learning_rate": 2.885699641209636e-05, | |
| "loss": 0.4879, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.1271143003587905, | |
| "grad_norm": 11.32701587677002, | |
| "learning_rate": 2.8728856996412095e-05, | |
| "loss": 0.4544, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 2.1399282419272168, | |
| "grad_norm": 4.703444004058838, | |
| "learning_rate": 2.860071758072783e-05, | |
| "loss": 0.466, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 2.152742183495643, | |
| "grad_norm": 8.985660552978516, | |
| "learning_rate": 2.847257816504357e-05, | |
| "loss": 0.4734, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.16555612506407, | |
| "grad_norm": 12.306890487670898, | |
| "learning_rate": 2.8344438749359304e-05, | |
| "loss": 0.4287, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 2.178370066632496, | |
| "grad_norm": 5.025609016418457, | |
| "learning_rate": 2.821629933367504e-05, | |
| "loss": 0.4657, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.1911840082009224, | |
| "grad_norm": 31.554025650024414, | |
| "learning_rate": 2.8088159917990775e-05, | |
| "loss": 0.4378, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 2.203997949769349, | |
| "grad_norm": 9.015434265136719, | |
| "learning_rate": 2.796002050230651e-05, | |
| "loss": 0.4538, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 2.2168118913377755, | |
| "grad_norm": 15.61099624633789, | |
| "learning_rate": 2.7831881086622246e-05, | |
| "loss": 0.4134, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 2.2296258329062018, | |
| "grad_norm": 10.191957473754883, | |
| "learning_rate": 2.770374167093798e-05, | |
| "loss": 0.5188, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 2.2424397744746285, | |
| "grad_norm": 2.2506730556488037, | |
| "learning_rate": 2.7575602255253717e-05, | |
| "loss": 0.4028, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.255253716043055, | |
| "grad_norm": 23.088764190673828, | |
| "learning_rate": 2.7447462839569456e-05, | |
| "loss": 0.4814, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 2.268067657611481, | |
| "grad_norm": 4.473659515380859, | |
| "learning_rate": 2.731932342388519e-05, | |
| "loss": 0.4822, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 2.280881599179908, | |
| "grad_norm": 2.1489970684051514, | |
| "learning_rate": 2.7191184008200927e-05, | |
| "loss": 0.4934, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 2.293695540748334, | |
| "grad_norm": 1.4255170822143555, | |
| "learning_rate": 2.7063044592516662e-05, | |
| "loss": 0.4314, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 2.3065094823167605, | |
| "grad_norm": 4.612204074859619, | |
| "learning_rate": 2.693490517683239e-05, | |
| "loss": 0.4322, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.319323423885187, | |
| "grad_norm": 3.1022679805755615, | |
| "learning_rate": 2.680676576114813e-05, | |
| "loss": 0.424, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 2.3321373654536135, | |
| "grad_norm": 3.745171070098877, | |
| "learning_rate": 2.6678626345463865e-05, | |
| "loss": 0.4269, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 2.34495130702204, | |
| "grad_norm": 4.0442328453063965, | |
| "learning_rate": 2.65504869297796e-05, | |
| "loss": 0.4698, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 2.3577652485904665, | |
| "grad_norm": 21.303607940673828, | |
| "learning_rate": 2.6422347514095336e-05, | |
| "loss": 0.4909, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 2.370579190158893, | |
| "grad_norm": 9.175422668457031, | |
| "learning_rate": 2.629420809841107e-05, | |
| "loss": 0.4598, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 2.383393131727319, | |
| "grad_norm": 5.787283420562744, | |
| "learning_rate": 2.6166068682726807e-05, | |
| "loss": 0.4409, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 2.396207073295746, | |
| "grad_norm": 7.338250637054443, | |
| "learning_rate": 2.6037929267042542e-05, | |
| "loss": 0.4157, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 2.409021014864172, | |
| "grad_norm": 13.879666328430176, | |
| "learning_rate": 2.590978985135828e-05, | |
| "loss": 0.4584, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 2.4218349564325985, | |
| "grad_norm": 9.484577178955078, | |
| "learning_rate": 2.5781650435674016e-05, | |
| "loss": 0.4914, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 2.4346488980010252, | |
| "grad_norm": 10.865300178527832, | |
| "learning_rate": 2.565351101998975e-05, | |
| "loss": 0.4259, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.4474628395694515, | |
| "grad_norm": 16.69988441467285, | |
| "learning_rate": 2.5525371604305487e-05, | |
| "loss": 0.4563, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 2.460276781137878, | |
| "grad_norm": 19.711631774902344, | |
| "learning_rate": 2.5397232188621222e-05, | |
| "loss": 0.4159, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 2.4730907227063046, | |
| "grad_norm": 13.3755521774292, | |
| "learning_rate": 2.5269092772936958e-05, | |
| "loss": 0.537, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 2.485904664274731, | |
| "grad_norm": 6.953076362609863, | |
| "learning_rate": 2.5140953357252693e-05, | |
| "loss": 0.4288, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 2.498718605843157, | |
| "grad_norm": 47.91322708129883, | |
| "learning_rate": 2.5012813941568432e-05, | |
| "loss": 0.5049, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.511532547411584, | |
| "grad_norm": 1.6553832292556763, | |
| "learning_rate": 2.4884674525884164e-05, | |
| "loss": 0.4779, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 2.5243464889800102, | |
| "grad_norm": 12.199808120727539, | |
| "learning_rate": 2.47565351101999e-05, | |
| "loss": 0.4246, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 2.5371604305484365, | |
| "grad_norm": 11.326825141906738, | |
| "learning_rate": 2.4628395694515635e-05, | |
| "loss": 0.4482, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 2.5499743721168633, | |
| "grad_norm": 9.247246742248535, | |
| "learning_rate": 2.450025627883137e-05, | |
| "loss": 0.4656, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 2.5627883136852896, | |
| "grad_norm": 1.773540735244751, | |
| "learning_rate": 2.4372116863147106e-05, | |
| "loss": 0.4776, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.575602255253716, | |
| "grad_norm": 7.454749584197998, | |
| "learning_rate": 2.424397744746284e-05, | |
| "loss": 0.4161, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 2.5884161968221426, | |
| "grad_norm": 19.77891731262207, | |
| "learning_rate": 2.4115838031778577e-05, | |
| "loss": 0.4609, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 2.601230138390569, | |
| "grad_norm": 12.208200454711914, | |
| "learning_rate": 2.3987698616094312e-05, | |
| "loss": 0.453, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 2.6140440799589952, | |
| "grad_norm": 11.438812255859375, | |
| "learning_rate": 2.3859559200410047e-05, | |
| "loss": 0.4439, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 2.626858021527422, | |
| "grad_norm": 1.6863147020339966, | |
| "learning_rate": 2.3731419784725783e-05, | |
| "loss": 0.3987, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.6396719630958483, | |
| "grad_norm": 1.3637946844100952, | |
| "learning_rate": 2.3603280369041518e-05, | |
| "loss": 0.4523, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 2.6524859046642746, | |
| "grad_norm": 21.555208206176758, | |
| "learning_rate": 2.3475140953357254e-05, | |
| "loss": 0.4624, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 2.6652998462327013, | |
| "grad_norm": 8.768684387207031, | |
| "learning_rate": 2.334700153767299e-05, | |
| "loss": 0.4585, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 2.6781137878011276, | |
| "grad_norm": 3.2959704399108887, | |
| "learning_rate": 2.3218862121988724e-05, | |
| "loss": 0.4579, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 2.690927729369554, | |
| "grad_norm": 16.97565269470215, | |
| "learning_rate": 2.309072270630446e-05, | |
| "loss": 0.4132, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.7037416709379807, | |
| "grad_norm": 14.613641738891602, | |
| "learning_rate": 2.2962583290620195e-05, | |
| "loss": 0.4297, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 2.716555612506407, | |
| "grad_norm": 28.61090087890625, | |
| "learning_rate": 2.283444387493593e-05, | |
| "loss": 0.4479, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 2.7293695540748333, | |
| "grad_norm": 9.84257984161377, | |
| "learning_rate": 2.2706304459251666e-05, | |
| "loss": 0.4428, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 2.74218349564326, | |
| "grad_norm": 8.199345588684082, | |
| "learning_rate": 2.2578165043567405e-05, | |
| "loss": 0.3999, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 2.7549974372116863, | |
| "grad_norm": 15.411248207092285, | |
| "learning_rate": 2.2450025627883137e-05, | |
| "loss": 0.4423, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.7678113787801126, | |
| "grad_norm": 7.122200012207031, | |
| "learning_rate": 2.2321886212198872e-05, | |
| "loss": 0.4675, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 2.7806253203485394, | |
| "grad_norm": 11.358266830444336, | |
| "learning_rate": 2.2193746796514608e-05, | |
| "loss": 0.4885, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 2.7934392619169657, | |
| "grad_norm": 9.456644058227539, | |
| "learning_rate": 2.2065607380830343e-05, | |
| "loss": 0.4973, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 2.806253203485392, | |
| "grad_norm": 28.7235164642334, | |
| "learning_rate": 2.193746796514608e-05, | |
| "loss": 0.429, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 2.8190671450538183, | |
| "grad_norm": 14.859136581420898, | |
| "learning_rate": 2.1809328549461817e-05, | |
| "loss": 0.4867, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.831881086622245, | |
| "grad_norm": 3.089897394180298, | |
| "learning_rate": 2.1681189133777553e-05, | |
| "loss": 0.4249, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 2.8446950281906713, | |
| "grad_norm": 14.606719970703125, | |
| "learning_rate": 2.1553049718093288e-05, | |
| "loss": 0.4429, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 2.857508969759098, | |
| "grad_norm": 7.761451244354248, | |
| "learning_rate": 2.142491030240902e-05, | |
| "loss": 0.4639, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 2.8703229113275244, | |
| "grad_norm": 6.9101362228393555, | |
| "learning_rate": 2.1296770886724756e-05, | |
| "loss": 0.4606, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 2.8831368528959507, | |
| "grad_norm": 6.754969120025635, | |
| "learning_rate": 2.116863147104049e-05, | |
| "loss": 0.4784, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.895950794464377, | |
| "grad_norm": 20.884119033813477, | |
| "learning_rate": 2.104049205535623e-05, | |
| "loss": 0.4625, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 2.9087647360328037, | |
| "grad_norm": 18.428529739379883, | |
| "learning_rate": 2.0912352639671965e-05, | |
| "loss": 0.4121, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 2.92157867760123, | |
| "grad_norm": 9.211915969848633, | |
| "learning_rate": 2.07842132239877e-05, | |
| "loss": 0.457, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 2.9343926191696568, | |
| "grad_norm": 5.744906425476074, | |
| "learning_rate": 2.0656073808303436e-05, | |
| "loss": 0.4169, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 2.947206560738083, | |
| "grad_norm": 10.679366111755371, | |
| "learning_rate": 2.052793439261917e-05, | |
| "loss": 0.4719, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.9600205023065094, | |
| "grad_norm": 8.72630500793457, | |
| "learning_rate": 2.0399794976934904e-05, | |
| "loss": 0.4743, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 2.9728344438749357, | |
| "grad_norm": 5.53284215927124, | |
| "learning_rate": 2.0271655561250642e-05, | |
| "loss": 0.4592, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 2.9856483854433624, | |
| "grad_norm": 10.75283432006836, | |
| "learning_rate": 2.0143516145566378e-05, | |
| "loss": 0.3971, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 2.9984623270117887, | |
| "grad_norm": 10.634764671325684, | |
| "learning_rate": 2.0015376729882113e-05, | |
| "loss": 0.4295, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_f1": 0.6853715205850849, | |
| "eval_loss": 1.0191140174865723, | |
| "eval_runtime": 837.6905, | |
| "eval_samples_per_second": 9.316, | |
| "eval_steps_per_second": 2.329, | |
| "step": 23412 | |
| }, | |
| { | |
| "epoch": 3.0112762685802155, | |
| "grad_norm": 3.43902587890625, | |
| "learning_rate": 1.988723731419785e-05, | |
| "loss": 0.2448, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 3.0240902101486418, | |
| "grad_norm": 0.5649552941322327, | |
| "learning_rate": 1.9759097898513584e-05, | |
| "loss": 0.1908, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 3.036904151717068, | |
| "grad_norm": 1.3035610914230347, | |
| "learning_rate": 1.963095848282932e-05, | |
| "loss": 0.275, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 3.049718093285495, | |
| "grad_norm": 27.42232322692871, | |
| "learning_rate": 1.9502819067145055e-05, | |
| "loss": 0.2727, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 3.062532034853921, | |
| "grad_norm": 1.675907015800476, | |
| "learning_rate": 1.937467965146079e-05, | |
| "loss": 0.2916, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 3.0753459764223474, | |
| "grad_norm": 9.602179527282715, | |
| "learning_rate": 1.9246540235776526e-05, | |
| "loss": 0.2645, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.088159917990774, | |
| "grad_norm": 16.757831573486328, | |
| "learning_rate": 1.911840082009226e-05, | |
| "loss": 0.2476, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 3.1009738595592005, | |
| "grad_norm": 5.842043876647949, | |
| "learning_rate": 1.8990261404407997e-05, | |
| "loss": 0.2829, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 3.1137878011276268, | |
| "grad_norm": 0.593449592590332, | |
| "learning_rate": 1.8862121988723732e-05, | |
| "loss": 0.289, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 3.1266017426960535, | |
| "grad_norm": 5.712982177734375, | |
| "learning_rate": 1.8733982573039467e-05, | |
| "loss": 0.2355, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 3.13941568426448, | |
| "grad_norm": 0.3152589201927185, | |
| "learning_rate": 1.8605843157355203e-05, | |
| "loss": 0.2491, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 3.152229625832906, | |
| "grad_norm": 19.951833724975586, | |
| "learning_rate": 1.8477703741670938e-05, | |
| "loss": 0.271, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 3.165043567401333, | |
| "grad_norm": 5.257028579711914, | |
| "learning_rate": 1.8349564325986674e-05, | |
| "loss": 0.277, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 3.177857508969759, | |
| "grad_norm": 3.6717381477355957, | |
| "learning_rate": 1.822142491030241e-05, | |
| "loss": 0.2736, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 3.1906714505381855, | |
| "grad_norm": 38.49631881713867, | |
| "learning_rate": 1.8093285494618144e-05, | |
| "loss": 0.2789, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 3.2034853921066118, | |
| "grad_norm": 5.944704055786133, | |
| "learning_rate": 1.796514607893388e-05, | |
| "loss": 0.3111, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 3.2162993336750385, | |
| "grad_norm": 3.278078079223633, | |
| "learning_rate": 1.7837006663249615e-05, | |
| "loss": 0.287, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 3.229113275243465, | |
| "grad_norm": 13.320869445800781, | |
| "learning_rate": 1.7708867247565354e-05, | |
| "loss": 0.2708, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 3.2419272168118916, | |
| "grad_norm": 9.01321029663086, | |
| "learning_rate": 1.758072783188109e-05, | |
| "loss": 0.2891, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 3.254741158380318, | |
| "grad_norm": 14.35201644897461, | |
| "learning_rate": 1.745258841619682e-05, | |
| "loss": 0.1523, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 3.267555099948744, | |
| "grad_norm": 5.268370628356934, | |
| "learning_rate": 1.7324449000512557e-05, | |
| "loss": 0.3608, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 3.2803690415171705, | |
| "grad_norm": 3.338168144226074, | |
| "learning_rate": 1.7196309584828292e-05, | |
| "loss": 0.2829, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 3.293182983085597, | |
| "grad_norm": 12.441572189331055, | |
| "learning_rate": 1.7068170169144028e-05, | |
| "loss": 0.2563, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 3.3059969246540235, | |
| "grad_norm": 2.870978832244873, | |
| "learning_rate": 1.6940030753459767e-05, | |
| "loss": 0.2957, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 3.3188108662224503, | |
| "grad_norm": 10.626642227172852, | |
| "learning_rate": 1.6811891337775502e-05, | |
| "loss": 0.3493, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 3.3316248077908766, | |
| "grad_norm": 1.1796225309371948, | |
| "learning_rate": 1.6683751922091237e-05, | |
| "loss": 0.293, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 3.344438749359303, | |
| "grad_norm": 46.64753341674805, | |
| "learning_rate": 1.6555612506406973e-05, | |
| "loss": 0.2739, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 3.357252690927729, | |
| "grad_norm": 17.778207778930664, | |
| "learning_rate": 1.6427473090722705e-05, | |
| "loss": 0.2897, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 3.370066632496156, | |
| "grad_norm": 1.6698403358459473, | |
| "learning_rate": 1.629933367503844e-05, | |
| "loss": 0.2661, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 3.382880574064582, | |
| "grad_norm": 0.18206116557121277, | |
| "learning_rate": 1.617119425935418e-05, | |
| "loss": 0.2847, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 3.395694515633009, | |
| "grad_norm": 6.839690208435059, | |
| "learning_rate": 1.6043054843669915e-05, | |
| "loss": 0.3044, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 3.4085084572014352, | |
| "grad_norm": 0.6313930749893188, | |
| "learning_rate": 1.591491542798565e-05, | |
| "loss": 0.2623, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 3.4213223987698616, | |
| "grad_norm": 70.23905181884766, | |
| "learning_rate": 1.5786776012301385e-05, | |
| "loss": 0.2573, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 3.434136340338288, | |
| "grad_norm": 16.72913360595703, | |
| "learning_rate": 1.565863659661712e-05, | |
| "loss": 0.2626, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 3.4469502819067146, | |
| "grad_norm": 43.662845611572266, | |
| "learning_rate": 1.5530497180932856e-05, | |
| "loss": 0.2679, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 3.459764223475141, | |
| "grad_norm": 20.96466064453125, | |
| "learning_rate": 1.540235776524859e-05, | |
| "loss": 0.3082, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 3.4725781650435676, | |
| "grad_norm": 45.02407455444336, | |
| "learning_rate": 1.5274218349564327e-05, | |
| "loss": 0.2492, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 3.485392106611994, | |
| "grad_norm": 14.404077529907227, | |
| "learning_rate": 1.5146078933880062e-05, | |
| "loss": 0.2704, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 3.4982060481804202, | |
| "grad_norm": 19.40283966064453, | |
| "learning_rate": 1.5017939518195798e-05, | |
| "loss": 0.3089, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 3.5110199897488465, | |
| "grad_norm": 13.016902923583984, | |
| "learning_rate": 1.4889800102511533e-05, | |
| "loss": 0.2953, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 3.5238339313172733, | |
| "grad_norm": 6.934922695159912, | |
| "learning_rate": 1.4761660686827269e-05, | |
| "loss": 0.2132, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 3.5366478728856996, | |
| "grad_norm": 49.58895492553711, | |
| "learning_rate": 1.4633521271143006e-05, | |
| "loss": 0.271, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 3.5494618144541263, | |
| "grad_norm": 4.814508438110352, | |
| "learning_rate": 1.4505381855458741e-05, | |
| "loss": 0.3195, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 3.5622757560225526, | |
| "grad_norm": 28.65342903137207, | |
| "learning_rate": 1.4377242439774475e-05, | |
| "loss": 0.2869, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 3.575089697590979, | |
| "grad_norm": 5.931487083435059, | |
| "learning_rate": 1.424910302409021e-05, | |
| "loss": 0.2982, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 3.5879036391594052, | |
| "grad_norm": 0.22432470321655273, | |
| "learning_rate": 1.4120963608405946e-05, | |
| "loss": 0.3167, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 3.600717580727832, | |
| "grad_norm": 27.89299964904785, | |
| "learning_rate": 1.3992824192721681e-05, | |
| "loss": 0.2831, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 3.6135315222962583, | |
| "grad_norm": 6.232203006744385, | |
| "learning_rate": 1.3864684777037418e-05, | |
| "loss": 0.2328, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 3.626345463864685, | |
| "grad_norm": 0.3798358738422394, | |
| "learning_rate": 1.3736545361353154e-05, | |
| "loss": 0.2565, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 3.6391594054331113, | |
| "grad_norm": 2.3177566528320312, | |
| "learning_rate": 1.3608405945668889e-05, | |
| "loss": 0.2822, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 3.6519733470015376, | |
| "grad_norm": 0.9287611246109009, | |
| "learning_rate": 1.3480266529984623e-05, | |
| "loss": 0.2206, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 3.664787288569964, | |
| "grad_norm": 19.89398765563965, | |
| "learning_rate": 1.3352127114300358e-05, | |
| "loss": 0.2934, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 3.6776012301383907, | |
| "grad_norm": 14.735712051391602, | |
| "learning_rate": 1.3223987698616094e-05, | |
| "loss": 0.2667, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 3.690415171706817, | |
| "grad_norm": 2.782954454421997, | |
| "learning_rate": 1.309584828293183e-05, | |
| "loss": 0.2565, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 3.7032291132752437, | |
| "grad_norm": 20.082395553588867, | |
| "learning_rate": 1.2967708867247566e-05, | |
| "loss": 0.3069, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 3.71604305484367, | |
| "grad_norm": 1.8632967472076416, | |
| "learning_rate": 1.2839569451563302e-05, | |
| "loss": 0.2484, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 3.7288569964120963, | |
| "grad_norm": 6.2880330085754395, | |
| "learning_rate": 1.2711430035879037e-05, | |
| "loss": 0.2769, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 3.7416709379805226, | |
| "grad_norm": 18.328922271728516, | |
| "learning_rate": 1.2583290620194774e-05, | |
| "loss": 0.284, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 3.7544848795489494, | |
| "grad_norm": 0.2658964991569519, | |
| "learning_rate": 1.2455151204510508e-05, | |
| "loss": 0.2725, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 3.7672988211173757, | |
| "grad_norm": 7.819123268127441, | |
| "learning_rate": 1.2327011788826243e-05, | |
| "loss": 0.2513, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 3.7801127626858024, | |
| "grad_norm": 4.6279144287109375, | |
| "learning_rate": 1.2198872373141979e-05, | |
| "loss": 0.2573, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 3.7929267042542287, | |
| "grad_norm": 24.996662139892578, | |
| "learning_rate": 1.2070732957457714e-05, | |
| "loss": 0.2621, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 3.805740645822655, | |
| "grad_norm": 20.87746810913086, | |
| "learning_rate": 1.194259354177345e-05, | |
| "loss": 0.2499, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 3.8185545873910813, | |
| "grad_norm": 1.5061414241790771, | |
| "learning_rate": 1.1814454126089187e-05, | |
| "loss": 0.265, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 3.831368528959508, | |
| "grad_norm": 2.7230064868927, | |
| "learning_rate": 1.168631471040492e-05, | |
| "loss": 0.2469, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 3.8441824705279344, | |
| "grad_norm": 0.6768075823783875, | |
| "learning_rate": 1.1558175294720656e-05, | |
| "loss": 0.2686, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 3.8569964120963607, | |
| "grad_norm": 0.08343211561441422, | |
| "learning_rate": 1.1430035879036393e-05, | |
| "loss": 0.2565, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 3.8698103536647874, | |
| "grad_norm": 25.58348274230957, | |
| "learning_rate": 1.1301896463352128e-05, | |
| "loss": 0.2967, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 3.8826242952332137, | |
| "grad_norm": 1.0459709167480469, | |
| "learning_rate": 1.1173757047667862e-05, | |
| "loss": 0.3028, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 3.89543823680164, | |
| "grad_norm": 0.33878639340400696, | |
| "learning_rate": 1.1045617631983599e-05, | |
| "loss": 0.2243, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 3.9082521783700668, | |
| "grad_norm": 2.021047592163086, | |
| "learning_rate": 1.0917478216299335e-05, | |
| "loss": 0.3656, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 3.921066119938493, | |
| "grad_norm": 1.6855653524398804, | |
| "learning_rate": 1.078933880061507e-05, | |
| "loss": 0.2323, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 3.9338800615069194, | |
| "grad_norm": 21.66104507446289, | |
| "learning_rate": 1.0661199384930805e-05, | |
| "loss": 0.2205, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 3.946694003075346, | |
| "grad_norm": 2.4428458213806152, | |
| "learning_rate": 1.053305996924654e-05, | |
| "loss": 0.2436, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 3.9595079446437724, | |
| "grad_norm": 39.37623596191406, | |
| "learning_rate": 1.0404920553562276e-05, | |
| "loss": 0.2831, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 3.9723218862121987, | |
| "grad_norm": 44.4313850402832, | |
| "learning_rate": 1.0276781137878012e-05, | |
| "loss": 0.2522, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 3.9851358277806255, | |
| "grad_norm": 2.6004929542541504, | |
| "learning_rate": 1.0148641722193747e-05, | |
| "loss": 0.3209, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 3.9979497693490518, | |
| "grad_norm": 2.536029815673828, | |
| "learning_rate": 1.0020502306509482e-05, | |
| "loss": 0.2807, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_f1": 0.6869426704202687, | |
| "eval_loss": 2.03011155128479, | |
| "eval_runtime": 825.2142, | |
| "eval_samples_per_second": 9.457, | |
| "eval_steps_per_second": 2.364, | |
| "step": 31216 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 39020, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.105948110057636e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
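
The state above is what the Hugging Face Trainer writes out as trainer_state.json: log_history carries one training-loss record every 100 optimizer steps (logging_steps) plus one evaluation record per epoch, and best_metric / best_model_checkpoint point at the epoch-4 checkpoint (step 31216) with eval F1 of about 0.687. Below is a minimal sketch, not part of the original log, for pulling the loss curve and per-epoch F1 out of this file; the filename trainer_state.json and the plain json/print usage are illustrative assumptions.

# sketch: read the Trainer state and separate training-loss records from eval records
import json

with open("trainer_state.json") as f:          # assumed path; adjust to the checkpoint dir
    state = json.load(f)

train_steps, train_loss = [], []               # one entry per logging_steps (here, every 100 steps)
eval_epochs, eval_f1 = [], []                  # one entry per epoch-end evaluation

for entry in state["log_history"]:
    if "loss" in entry:                        # training log record
        train_steps.append(entry["step"])
        train_loss.append(entry["loss"])
    elif "eval_f1" in entry:                   # evaluation record
        eval_epochs.append(entry["epoch"])
        eval_f1.append(entry["eval_f1"])

print("best F1:", state["best_metric"], "at", state["best_model_checkpoint"])
print("final train loss:", train_loss[-1], "at step", train_steps[-1])
print("eval F1 by epoch:", list(zip(eval_epochs, eval_f1)))

The lists this produces can be fed to any plotting tool to visualize the loss curve against the per-epoch F1; note that eval_loss rises after epoch 2 while eval_f1 still improves slightly, which is why best_model_checkpoint tracks the metric rather than the loss.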