{ "best_metric": 0.6869426704202687, "best_model_checkpoint": "/userstorage/modernbert-llm-grader/checkpoint-31216", "epoch": 4.0, "eval_steps": 500, "global_step": 31216, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012813941568426449, "grad_norm": 5.321617603302002, "learning_rate": 4.987186058431574e-05, "loss": 1.4033, "step": 100 }, { "epoch": 0.025627883136852898, "grad_norm": 3.621730089187622, "learning_rate": 4.974372116863147e-05, "loss": 1.3035, "step": 200 }, { "epoch": 0.03844182470527934, "grad_norm": 7.95962381362915, "learning_rate": 4.961558175294721e-05, "loss": 1.2506, "step": 300 }, { "epoch": 0.051255766273705795, "grad_norm": 3.631398916244507, "learning_rate": 4.9487442337262944e-05, "loss": 1.2354, "step": 400 }, { "epoch": 0.06406970784213224, "grad_norm": 2.6680333614349365, "learning_rate": 4.935930292157868e-05, "loss": 1.2397, "step": 500 }, { "epoch": 0.07688364941055868, "grad_norm": 6.042360305786133, "learning_rate": 4.9231163505894415e-05, "loss": 1.1811, "step": 600 }, { "epoch": 0.08969759097898514, "grad_norm": 6.7501959800720215, "learning_rate": 4.9103024090210154e-05, "loss": 1.1947, "step": 700 }, { "epoch": 0.10251153254741159, "grad_norm": 3.4089736938476562, "learning_rate": 4.8974884674525886e-05, "loss": 1.1812, "step": 800 }, { "epoch": 0.11532547411583803, "grad_norm": 5.0775604248046875, "learning_rate": 4.884674525884162e-05, "loss": 1.1752, "step": 900 }, { "epoch": 0.12813941568426448, "grad_norm": 4.529630184173584, "learning_rate": 4.8718605843157357e-05, "loss": 1.1867, "step": 1000 }, { "epoch": 0.14095335725269093, "grad_norm": 4.961220741271973, "learning_rate": 4.859046642747309e-05, "loss": 1.2129, "step": 1100 }, { "epoch": 0.15376729882111737, "grad_norm": 4.113813400268555, "learning_rate": 4.846232701178883e-05, "loss": 1.1293, "step": 1200 }, { "epoch": 0.16658124038954383, "grad_norm": 7.25917387008667, "learning_rate": 4.8334187596104566e-05, "loss": 1.1008, "step": 1300 }, { "epoch": 0.17939518195797027, "grad_norm": 5.579372882843018, "learning_rate": 4.82060481804203e-05, "loss": 1.1327, "step": 1400 }, { "epoch": 0.1922091235263967, "grad_norm": 9.794898986816406, "learning_rate": 4.807790876473604e-05, "loss": 1.1355, "step": 1500 }, { "epoch": 0.20502306509482318, "grad_norm": 9.875951766967773, "learning_rate": 4.794976934905177e-05, "loss": 1.0057, "step": 1600 }, { "epoch": 0.21783700666324962, "grad_norm": 7.271333694458008, "learning_rate": 4.782162993336751e-05, "loss": 1.1101, "step": 1700 }, { "epoch": 0.23065094823167606, "grad_norm": 6.730026721954346, "learning_rate": 4.769349051768324e-05, "loss": 1.0762, "step": 1800 }, { "epoch": 0.2434648898001025, "grad_norm": 5.596224784851074, "learning_rate": 4.756535110199898e-05, "loss": 1.0413, "step": 1900 }, { "epoch": 0.25627883136852897, "grad_norm": 4.591865539550781, "learning_rate": 4.743721168631472e-05, "loss": 1.0593, "step": 2000 }, { "epoch": 0.2690927729369554, "grad_norm": 6.357232570648193, "learning_rate": 4.730907227063045e-05, "loss": 1.0434, "step": 2100 }, { "epoch": 0.28190671450538185, "grad_norm": 5.185873508453369, "learning_rate": 4.718093285494619e-05, "loss": 1.021, "step": 2200 }, { "epoch": 0.2947206560738083, "grad_norm": 8.19482135772705, "learning_rate": 4.705279343926192e-05, "loss": 1.1102, "step": 2300 }, { "epoch": 0.30753459764223473, "grad_norm": 6.1499176025390625, "learning_rate": 4.692465402357765e-05, "loss": 1.0115, "step": 2400 }, { "epoch": 0.3203485392106612, "grad_norm": 11.092570304870605, "learning_rate": 4.679651460789339e-05, "loss": 0.9576, "step": 2500 }, { "epoch": 0.33316248077908767, "grad_norm": 5.10243034362793, "learning_rate": 4.666837519220912e-05, "loss": 1.0674, "step": 2600 }, { "epoch": 0.3459764223475141, "grad_norm": 4.633431434631348, "learning_rate": 4.654023577652486e-05, "loss": 1.0004, "step": 2700 }, { "epoch": 0.35879036391594055, "grad_norm": 5.507874488830566, "learning_rate": 4.6412096360840594e-05, "loss": 1.0439, "step": 2800 }, { "epoch": 0.371604305484367, "grad_norm": 5.591798305511475, "learning_rate": 4.628395694515633e-05, "loss": 1.0509, "step": 2900 }, { "epoch": 0.3844182470527934, "grad_norm": 4.341959476470947, "learning_rate": 4.6155817529472065e-05, "loss": 0.9864, "step": 3000 }, { "epoch": 0.3972321886212199, "grad_norm": 3.6542482376098633, "learning_rate": 4.6027678113787804e-05, "loss": 0.9897, "step": 3100 }, { "epoch": 0.41004613018964636, "grad_norm": 6.769758701324463, "learning_rate": 4.589953869810354e-05, "loss": 1.0528, "step": 3200 }, { "epoch": 0.4228600717580728, "grad_norm": 5.762277603149414, "learning_rate": 4.5771399282419274e-05, "loss": 1.0036, "step": 3300 }, { "epoch": 0.43567401332649924, "grad_norm": 7.389179229736328, "learning_rate": 4.564325986673501e-05, "loss": 1.0304, "step": 3400 }, { "epoch": 0.44848795489492566, "grad_norm": 3.9039294719696045, "learning_rate": 4.5515120451050745e-05, "loss": 0.9962, "step": 3500 }, { "epoch": 0.4613018964633521, "grad_norm": 3.0561447143554688, "learning_rate": 4.5386981035366484e-05, "loss": 0.9777, "step": 3600 }, { "epoch": 0.4741158380317786, "grad_norm": 6.340303897857666, "learning_rate": 4.5258841619682216e-05, "loss": 0.9603, "step": 3700 }, { "epoch": 0.486929779600205, "grad_norm": 9.058144569396973, "learning_rate": 4.5130702203997955e-05, "loss": 0.9658, "step": 3800 }, { "epoch": 0.49974372116863147, "grad_norm": 8.219672203063965, "learning_rate": 4.500256278831369e-05, "loss": 0.9856, "step": 3900 }, { "epoch": 0.5125576627370579, "grad_norm": 3.6466543674468994, "learning_rate": 4.487442337262942e-05, "loss": 0.9734, "step": 4000 }, { "epoch": 0.5253716043054844, "grad_norm": 7.289781093597412, "learning_rate": 4.474628395694516e-05, "loss": 0.9045, "step": 4100 }, { "epoch": 0.5381855458739108, "grad_norm": 6.18227481842041, "learning_rate": 4.461814454126089e-05, "loss": 0.9679, "step": 4200 }, { "epoch": 0.5509994874423373, "grad_norm": 3.994476318359375, "learning_rate": 4.449000512557663e-05, "loss": 0.8958, "step": 4300 }, { "epoch": 0.5638134290107637, "grad_norm": 3.913896322250366, "learning_rate": 4.436186570989236e-05, "loss": 0.9453, "step": 4400 }, { "epoch": 0.5766273705791901, "grad_norm": 4.39192008972168, "learning_rate": 4.42337262942081e-05, "loss": 0.9132, "step": 4500 }, { "epoch": 0.5894413121476166, "grad_norm": 5.574671745300293, "learning_rate": 4.410558687852384e-05, "loss": 0.9069, "step": 4600 }, { "epoch": 0.602255253716043, "grad_norm": 4.218778610229492, "learning_rate": 4.397744746283957e-05, "loss": 0.9631, "step": 4700 }, { "epoch": 0.6150691952844695, "grad_norm": 7.804980754852295, "learning_rate": 4.384930804715531e-05, "loss": 0.9121, "step": 4800 }, { "epoch": 0.627883136852896, "grad_norm": 7.064172744750977, "learning_rate": 4.372116863147104e-05, "loss": 0.9387, "step": 4900 }, { "epoch": 0.6406970784213224, "grad_norm": 5.293111324310303, "learning_rate": 4.359302921578678e-05, "loss": 0.9264, "step": 5000 }, { "epoch": 0.6535110199897488, "grad_norm": 7.019448757171631, "learning_rate": 4.346488980010251e-05, "loss": 0.9452, "step": 5100 }, { "epoch": 0.6663249615581753, "grad_norm": 6.714709758758545, "learning_rate": 4.333675038441825e-05, "loss": 0.8648, "step": 5200 }, { "epoch": 0.6791389031266017, "grad_norm": 8.232748031616211, "learning_rate": 4.320861096873399e-05, "loss": 0.904, "step": 5300 }, { "epoch": 0.6919528446950282, "grad_norm": 9.853933334350586, "learning_rate": 4.308047155304972e-05, "loss": 0.8895, "step": 5400 }, { "epoch": 0.7047667862634547, "grad_norm": 6.8710455894470215, "learning_rate": 4.2952332137365454e-05, "loss": 0.86, "step": 5500 }, { "epoch": 0.7175807278318811, "grad_norm": 6.45287561416626, "learning_rate": 4.2824192721681186e-05, "loss": 0.8718, "step": 5600 }, { "epoch": 0.7303946694003075, "grad_norm": 5.772899627685547, "learning_rate": 4.2696053305996924e-05, "loss": 0.8477, "step": 5700 }, { "epoch": 0.743208610968734, "grad_norm": 6.193540573120117, "learning_rate": 4.256791389031266e-05, "loss": 0.9184, "step": 5800 }, { "epoch": 0.7560225525371604, "grad_norm": 2.5397393703460693, "learning_rate": 4.2439774474628395e-05, "loss": 0.9537, "step": 5900 }, { "epoch": 0.7688364941055869, "grad_norm": 8.280569076538086, "learning_rate": 4.2311635058944134e-05, "loss": 0.8988, "step": 6000 }, { "epoch": 0.7816504356740134, "grad_norm": 10.563502311706543, "learning_rate": 4.2183495643259866e-05, "loss": 0.8518, "step": 6100 }, { "epoch": 0.7944643772424398, "grad_norm": 3.090008497238159, "learning_rate": 4.2055356227575605e-05, "loss": 0.8731, "step": 6200 }, { "epoch": 0.8072783188108662, "grad_norm": 4.051167011260986, "learning_rate": 4.192721681189134e-05, "loss": 0.8713, "step": 6300 }, { "epoch": 0.8200922603792927, "grad_norm": 7.207763671875, "learning_rate": 4.1799077396207076e-05, "loss": 0.8781, "step": 6400 }, { "epoch": 0.8329062019477191, "grad_norm": 6.396823883056641, "learning_rate": 4.1670937980522815e-05, "loss": 0.8231, "step": 6500 }, { "epoch": 0.8457201435161456, "grad_norm": 6.260582447052002, "learning_rate": 4.1542798564838547e-05, "loss": 0.8658, "step": 6600 }, { "epoch": 0.858534085084572, "grad_norm": 8.35356616973877, "learning_rate": 4.1414659149154285e-05, "loss": 0.8637, "step": 6700 }, { "epoch": 0.8713480266529985, "grad_norm": 7.236725330352783, "learning_rate": 4.128651973347002e-05, "loss": 0.8525, "step": 6800 }, { "epoch": 0.8841619682214249, "grad_norm": 14.001522064208984, "learning_rate": 4.1158380317785756e-05, "loss": 0.8628, "step": 6900 }, { "epoch": 0.8969759097898513, "grad_norm": 4.257541179656982, "learning_rate": 4.103024090210149e-05, "loss": 0.8443, "step": 7000 }, { "epoch": 0.9097898513582778, "grad_norm": 5.065970420837402, "learning_rate": 4.090210148641722e-05, "loss": 0.8329, "step": 7100 }, { "epoch": 0.9226037929267042, "grad_norm": 6.647068977355957, "learning_rate": 4.077396207073296e-05, "loss": 0.8585, "step": 7200 }, { "epoch": 0.9354177344951307, "grad_norm": 8.440242767333984, "learning_rate": 4.064582265504869e-05, "loss": 0.8749, "step": 7300 }, { "epoch": 0.9482316760635572, "grad_norm": 7.684078216552734, "learning_rate": 4.051768323936443e-05, "loss": 0.7771, "step": 7400 }, { "epoch": 0.9610456176319836, "grad_norm": 6.4709577560424805, "learning_rate": 4.038954382368016e-05, "loss": 0.8597, "step": 7500 }, { "epoch": 0.97385955920041, "grad_norm": 4.3970489501953125, "learning_rate": 4.02614044079959e-05, "loss": 0.7852, "step": 7600 }, { "epoch": 0.9866735007688365, "grad_norm": 9.167794227600098, "learning_rate": 4.013326499231164e-05, "loss": 0.8563, "step": 7700 }, { "epoch": 0.9994874423372629, "grad_norm": 6.251096248626709, "learning_rate": 4.000512557662737e-05, "loss": 0.8243, "step": 7800 }, { "epoch": 1.0, "eval_f1": 0.640692076906927, "eval_loss": 0.8794865608215332, "eval_runtime": 744.6214, "eval_samples_per_second": 10.48, "eval_steps_per_second": 2.62, "step": 7804 }, { "epoch": 1.0123013839056894, "grad_norm": 5.928829669952393, "learning_rate": 3.987698616094311e-05, "loss": 0.7046, "step": 7900 }, { "epoch": 1.0251153254741159, "grad_norm": 2.885106086730957, "learning_rate": 3.974884674525884e-05, "loss": 0.7663, "step": 8000 }, { "epoch": 1.0379292670425422, "grad_norm": 5.951350212097168, "learning_rate": 3.962070732957458e-05, "loss": 0.7374, "step": 8100 }, { "epoch": 1.0507432086109687, "grad_norm": 2.5160486698150635, "learning_rate": 3.949256791389031e-05, "loss": 0.7126, "step": 8200 }, { "epoch": 1.0635571501793952, "grad_norm": 6.847401142120361, "learning_rate": 3.936442849820605e-05, "loss": 0.6785, "step": 8300 }, { "epoch": 1.0763710917478215, "grad_norm": 4.729136943817139, "learning_rate": 3.923628908252179e-05, "loss": 0.7085, "step": 8400 }, { "epoch": 1.089185033316248, "grad_norm": 5.535890102386475, "learning_rate": 3.910814966683752e-05, "loss": 0.7548, "step": 8500 }, { "epoch": 1.1019989748846746, "grad_norm": 6.188892364501953, "learning_rate": 3.8980010251153255e-05, "loss": 0.7193, "step": 8600 }, { "epoch": 1.1148129164531009, "grad_norm": 5.806282997131348, "learning_rate": 3.885187083546899e-05, "loss": 0.7143, "step": 8700 }, { "epoch": 1.1276268580215274, "grad_norm": 10.726571083068848, "learning_rate": 3.8723731419784726e-05, "loss": 0.6892, "step": 8800 }, { "epoch": 1.140440799589954, "grad_norm": 7.0307512283325195, "learning_rate": 3.8595592004100465e-05, "loss": 0.7264, "step": 8900 }, { "epoch": 1.1532547411583802, "grad_norm": 20.715412139892578, "learning_rate": 3.8467452588416197e-05, "loss": 0.6987, "step": 9000 }, { "epoch": 1.1660686827268067, "grad_norm": 6.620629787445068, "learning_rate": 3.8339313172731935e-05, "loss": 0.7041, "step": 9100 }, { "epoch": 1.1788826242952333, "grad_norm": 5.27125883102417, "learning_rate": 3.821117375704767e-05, "loss": 0.67, "step": 9200 }, { "epoch": 1.1916965658636596, "grad_norm": 6.010765552520752, "learning_rate": 3.8083034341363406e-05, "loss": 0.6737, "step": 9300 }, { "epoch": 1.204510507432086, "grad_norm": 14.393863677978516, "learning_rate": 3.795489492567914e-05, "loss": 0.7097, "step": 9400 }, { "epoch": 1.2173244490005126, "grad_norm": 6.37823486328125, "learning_rate": 3.782675550999488e-05, "loss": 0.7157, "step": 9500 }, { "epoch": 1.230138390568939, "grad_norm": 11.626152992248535, "learning_rate": 3.7698616094310616e-05, "loss": 0.7066, "step": 9600 }, { "epoch": 1.2429523321373654, "grad_norm": 5.520190238952637, "learning_rate": 3.757047667862635e-05, "loss": 0.7303, "step": 9700 }, { "epoch": 1.255766273705792, "grad_norm": 9.865089416503906, "learning_rate": 3.744233726294209e-05, "loss": 0.7559, "step": 9800 }, { "epoch": 1.2685802152742183, "grad_norm": 7.075952529907227, "learning_rate": 3.731419784725782e-05, "loss": 0.6941, "step": 9900 }, { "epoch": 1.2813941568426448, "grad_norm": 3.4892656803131104, "learning_rate": 3.718605843157356e-05, "loss": 0.7164, "step": 10000 }, { "epoch": 1.2942080984110713, "grad_norm": 9.843413352966309, "learning_rate": 3.705791901588929e-05, "loss": 0.695, "step": 10100 }, { "epoch": 1.3070220399794976, "grad_norm": 12.128110885620117, "learning_rate": 3.692977960020502e-05, "loss": 0.6563, "step": 10200 }, { "epoch": 1.3198359815479241, "grad_norm": 11.26876163482666, "learning_rate": 3.680164018452076e-05, "loss": 0.6803, "step": 10300 }, { "epoch": 1.3326499231163507, "grad_norm": 12.95758056640625, "learning_rate": 3.667350076883649e-05, "loss": 0.6864, "step": 10400 }, { "epoch": 1.345463864684777, "grad_norm": 4.91602897644043, "learning_rate": 3.654536135315223e-05, "loss": 0.7184, "step": 10500 }, { "epoch": 1.3582778062532035, "grad_norm": 4.799069881439209, "learning_rate": 3.641722193746796e-05, "loss": 0.7558, "step": 10600 }, { "epoch": 1.37109174782163, "grad_norm": 64.9485855102539, "learning_rate": 3.62890825217837e-05, "loss": 0.7292, "step": 10700 }, { "epoch": 1.3839056893900563, "grad_norm": 6.147428512573242, "learning_rate": 3.616094310609944e-05, "loss": 0.6623, "step": 10800 }, { "epoch": 1.3967196309584828, "grad_norm": 7.638481140136719, "learning_rate": 3.603280369041517e-05, "loss": 0.6981, "step": 10900 }, { "epoch": 1.4095335725269091, "grad_norm": 4.798500061035156, "learning_rate": 3.590466427473091e-05, "loss": 0.7569, "step": 11000 }, { "epoch": 1.4223475140953357, "grad_norm": 4.413691520690918, "learning_rate": 3.5776524859046644e-05, "loss": 0.6391, "step": 11100 }, { "epoch": 1.4351614556637622, "grad_norm": 6.2526421546936035, "learning_rate": 3.564838544336238e-05, "loss": 0.7045, "step": 11200 }, { "epoch": 1.4479753972321885, "grad_norm": 6.3732805252075195, "learning_rate": 3.5520246027678114e-05, "loss": 0.6916, "step": 11300 }, { "epoch": 1.460789338800615, "grad_norm": 25.24698829650879, "learning_rate": 3.539210661199385e-05, "loss": 0.7652, "step": 11400 }, { "epoch": 1.4736032803690415, "grad_norm": 4.716599941253662, "learning_rate": 3.5263967196309585e-05, "loss": 0.7199, "step": 11500 }, { "epoch": 1.4864172219374678, "grad_norm": 13.750917434692383, "learning_rate": 3.5135827780625324e-05, "loss": 0.7032, "step": 11600 }, { "epoch": 1.4992311635058944, "grad_norm": 3.6678273677825928, "learning_rate": 3.500768836494106e-05, "loss": 0.6821, "step": 11700 }, { "epoch": 1.5120451050743209, "grad_norm": 7.891080856323242, "learning_rate": 3.487954894925679e-05, "loss": 0.7301, "step": 11800 }, { "epoch": 1.5248590466427472, "grad_norm": 3.25317645072937, "learning_rate": 3.475140953357253e-05, "loss": 0.6665, "step": 11900 }, { "epoch": 1.5376729882111737, "grad_norm": 12.75395679473877, "learning_rate": 3.462327011788826e-05, "loss": 0.733, "step": 12000 }, { "epoch": 1.5504869297796002, "grad_norm": 10.9820556640625, "learning_rate": 3.4495130702204e-05, "loss": 0.7064, "step": 12100 }, { "epoch": 1.5633008713480265, "grad_norm": 6.558383941650391, "learning_rate": 3.4366991286519737e-05, "loss": 0.7105, "step": 12200 }, { "epoch": 1.576114812916453, "grad_norm": 8.5501070022583, "learning_rate": 3.423885187083547e-05, "loss": 0.706, "step": 12300 }, { "epoch": 1.5889287544848796, "grad_norm": 5.319694995880127, "learning_rate": 3.411071245515121e-05, "loss": 0.7239, "step": 12400 }, { "epoch": 1.6017426960533059, "grad_norm": 5.92519474029541, "learning_rate": 3.398257303946694e-05, "loss": 0.7043, "step": 12500 }, { "epoch": 1.6145566376217324, "grad_norm": 8.853275299072266, "learning_rate": 3.385443362378268e-05, "loss": 0.6831, "step": 12600 }, { "epoch": 1.627370579190159, "grad_norm": 9.30588150024414, "learning_rate": 3.372629420809841e-05, "loss": 0.6756, "step": 12700 }, { "epoch": 1.6401845207585852, "grad_norm": 5.903197288513184, "learning_rate": 3.359815479241415e-05, "loss": 0.725, "step": 12800 }, { "epoch": 1.6529984623270118, "grad_norm": 5.500326156616211, "learning_rate": 3.347001537672989e-05, "loss": 0.6801, "step": 12900 }, { "epoch": 1.6658124038954383, "grad_norm": 7.896096229553223, "learning_rate": 3.334187596104562e-05, "loss": 0.6975, "step": 13000 }, { "epoch": 1.6786263454638646, "grad_norm": 6.674001216888428, "learning_rate": 3.321373654536136e-05, "loss": 0.6681, "step": 13100 }, { "epoch": 1.691440287032291, "grad_norm": 21.74435806274414, "learning_rate": 3.308559712967709e-05, "loss": 0.7045, "step": 13200 }, { "epoch": 1.7042542286007176, "grad_norm": 6.329532146453857, "learning_rate": 3.295745771399282e-05, "loss": 0.6885, "step": 13300 }, { "epoch": 1.717068170169144, "grad_norm": 24.047470092773438, "learning_rate": 3.282931829830856e-05, "loss": 0.7003, "step": 13400 }, { "epoch": 1.7298821117375704, "grad_norm": 7.407759666442871, "learning_rate": 3.2701178882624294e-05, "loss": 0.6856, "step": 13500 }, { "epoch": 1.742696053305997, "grad_norm": 5.755215167999268, "learning_rate": 3.257303946694003e-05, "loss": 0.7005, "step": 13600 }, { "epoch": 1.7555099948744233, "grad_norm": 11.444562911987305, "learning_rate": 3.2444900051255764e-05, "loss": 0.7136, "step": 13700 }, { "epoch": 1.7683239364428498, "grad_norm": 8.267853736877441, "learning_rate": 3.23167606355715e-05, "loss": 0.7029, "step": 13800 }, { "epoch": 1.7811378780112763, "grad_norm": 6.73785924911499, "learning_rate": 3.2188621219887235e-05, "loss": 0.6572, "step": 13900 }, { "epoch": 1.7939518195797026, "grad_norm": 5.369395732879639, "learning_rate": 3.2060481804202974e-05, "loss": 0.6617, "step": 14000 }, { "epoch": 1.8067657611481291, "grad_norm": 2.288243293762207, "learning_rate": 3.193234238851871e-05, "loss": 0.6688, "step": 14100 }, { "epoch": 1.8195797027165557, "grad_norm": 14.942804336547852, "learning_rate": 3.1804202972834445e-05, "loss": 0.6792, "step": 14200 }, { "epoch": 1.832393644284982, "grad_norm": 8.988631248474121, "learning_rate": 3.1676063557150184e-05, "loss": 0.6518, "step": 14300 }, { "epoch": 1.8452075858534085, "grad_norm": 7.9590630531311035, "learning_rate": 3.1547924141465916e-05, "loss": 0.6503, "step": 14400 }, { "epoch": 1.858021527421835, "grad_norm": 9.33973503112793, "learning_rate": 3.1419784725781655e-05, "loss": 0.6647, "step": 14500 }, { "epoch": 1.8708354689902613, "grad_norm": 9.39842700958252, "learning_rate": 3.1291645310097387e-05, "loss": 0.6515, "step": 14600 }, { "epoch": 1.8836494105586878, "grad_norm": 10.142439842224121, "learning_rate": 3.1163505894413125e-05, "loss": 0.6794, "step": 14700 }, { "epoch": 1.8964633521271144, "grad_norm": 11.658042907714844, "learning_rate": 3.1035366478728864e-05, "loss": 0.6931, "step": 14800 }, { "epoch": 1.9092772936955407, "grad_norm": 8.672663688659668, "learning_rate": 3.090722706304459e-05, "loss": 0.6377, "step": 14900 }, { "epoch": 1.9220912352639672, "grad_norm": 6.620725631713867, "learning_rate": 3.077908764736033e-05, "loss": 0.7044, "step": 15000 }, { "epoch": 1.9349051768323937, "grad_norm": 8.3103609085083, "learning_rate": 3.065094823167606e-05, "loss": 0.641, "step": 15100 }, { "epoch": 1.94771911840082, "grad_norm": 8.163315773010254, "learning_rate": 3.05228088159918e-05, "loss": 0.7094, "step": 15200 }, { "epoch": 1.9605330599692465, "grad_norm": 3.6365621089935303, "learning_rate": 3.0394669400307534e-05, "loss": 0.7022, "step": 15300 }, { "epoch": 1.973347001537673, "grad_norm": 4.264801502227783, "learning_rate": 3.026652998462327e-05, "loss": 0.6833, "step": 15400 }, { "epoch": 1.9861609431060994, "grad_norm": 6.547428131103516, "learning_rate": 3.0138390568939005e-05, "loss": 0.6126, "step": 15500 }, { "epoch": 1.9989748846745259, "grad_norm": 6.155936241149902, "learning_rate": 3.0010251153254744e-05, "loss": 0.6851, "step": 15600 }, { "epoch": 2.0, "eval_f1": 0.6772664805551888, "eval_loss": 0.781230092048645, "eval_runtime": 778.3436, "eval_samples_per_second": 10.026, "eval_steps_per_second": 2.507, "step": 15608 }, { "epoch": 2.0117888262429524, "grad_norm": 8.777030944824219, "learning_rate": 2.988211173757048e-05, "loss": 0.4707, "step": 15700 }, { "epoch": 2.0246027678113787, "grad_norm": 4.798321723937988, "learning_rate": 2.9753972321886215e-05, "loss": 0.4366, "step": 15800 }, { "epoch": 2.037416709379805, "grad_norm": 2.5244762897491455, "learning_rate": 2.962583290620195e-05, "loss": 0.504, "step": 15900 }, { "epoch": 2.0502306509482318, "grad_norm": 15.636524200439453, "learning_rate": 2.9497693490517686e-05, "loss": 0.4234, "step": 16000 }, { "epoch": 2.063044592516658, "grad_norm": 8.811060905456543, "learning_rate": 2.936955407483342e-05, "loss": 0.3911, "step": 16100 }, { "epoch": 2.0758585340850844, "grad_norm": 4.1310930252075195, "learning_rate": 2.9241414659149157e-05, "loss": 0.4538, "step": 16200 }, { "epoch": 2.088672475653511, "grad_norm": 9.516937255859375, "learning_rate": 2.9113275243464892e-05, "loss": 0.4461, "step": 16300 }, { "epoch": 2.1014864172219374, "grad_norm": 4.6523756980896, "learning_rate": 2.8985135827780624e-05, "loss": 0.4808, "step": 16400 }, { "epoch": 2.1143003587903637, "grad_norm": 4.160647392272949, "learning_rate": 2.885699641209636e-05, "loss": 0.4879, "step": 16500 }, { "epoch": 2.1271143003587905, "grad_norm": 11.32701587677002, "learning_rate": 2.8728856996412095e-05, "loss": 0.4544, "step": 16600 }, { "epoch": 2.1399282419272168, "grad_norm": 4.703444004058838, "learning_rate": 2.860071758072783e-05, "loss": 0.466, "step": 16700 }, { "epoch": 2.152742183495643, "grad_norm": 8.985660552978516, "learning_rate": 2.847257816504357e-05, "loss": 0.4734, "step": 16800 }, { "epoch": 2.16555612506407, "grad_norm": 12.306890487670898, "learning_rate": 2.8344438749359304e-05, "loss": 0.4287, "step": 16900 }, { "epoch": 2.178370066632496, "grad_norm": 5.025609016418457, "learning_rate": 2.821629933367504e-05, "loss": 0.4657, "step": 17000 }, { "epoch": 2.1911840082009224, "grad_norm": 31.554025650024414, "learning_rate": 2.8088159917990775e-05, "loss": 0.4378, "step": 17100 }, { "epoch": 2.203997949769349, "grad_norm": 9.015434265136719, "learning_rate": 2.796002050230651e-05, "loss": 0.4538, "step": 17200 }, { "epoch": 2.2168118913377755, "grad_norm": 15.61099624633789, "learning_rate": 2.7831881086622246e-05, "loss": 0.4134, "step": 17300 }, { "epoch": 2.2296258329062018, "grad_norm": 10.191957473754883, "learning_rate": 2.770374167093798e-05, "loss": 0.5188, "step": 17400 }, { "epoch": 2.2424397744746285, "grad_norm": 2.2506730556488037, "learning_rate": 2.7575602255253717e-05, "loss": 0.4028, "step": 17500 }, { "epoch": 2.255253716043055, "grad_norm": 23.088764190673828, "learning_rate": 2.7447462839569456e-05, "loss": 0.4814, "step": 17600 }, { "epoch": 2.268067657611481, "grad_norm": 4.473659515380859, "learning_rate": 2.731932342388519e-05, "loss": 0.4822, "step": 17700 }, { "epoch": 2.280881599179908, "grad_norm": 2.1489970684051514, "learning_rate": 2.7191184008200927e-05, "loss": 0.4934, "step": 17800 }, { "epoch": 2.293695540748334, "grad_norm": 1.4255170822143555, "learning_rate": 2.7063044592516662e-05, "loss": 0.4314, "step": 17900 }, { "epoch": 2.3065094823167605, "grad_norm": 4.612204074859619, "learning_rate": 2.693490517683239e-05, "loss": 0.4322, "step": 18000 }, { "epoch": 2.319323423885187, "grad_norm": 3.1022679805755615, "learning_rate": 2.680676576114813e-05, "loss": 0.424, "step": 18100 }, { "epoch": 2.3321373654536135, "grad_norm": 3.745171070098877, "learning_rate": 2.6678626345463865e-05, "loss": 0.4269, "step": 18200 }, { "epoch": 2.34495130702204, "grad_norm": 4.0442328453063965, "learning_rate": 2.65504869297796e-05, "loss": 0.4698, "step": 18300 }, { "epoch": 2.3577652485904665, "grad_norm": 21.303607940673828, "learning_rate": 2.6422347514095336e-05, "loss": 0.4909, "step": 18400 }, { "epoch": 2.370579190158893, "grad_norm": 9.175422668457031, "learning_rate": 2.629420809841107e-05, "loss": 0.4598, "step": 18500 }, { "epoch": 2.383393131727319, "grad_norm": 5.787283420562744, "learning_rate": 2.6166068682726807e-05, "loss": 0.4409, "step": 18600 }, { "epoch": 2.396207073295746, "grad_norm": 7.338250637054443, "learning_rate": 2.6037929267042542e-05, "loss": 0.4157, "step": 18700 }, { "epoch": 2.409021014864172, "grad_norm": 13.879666328430176, "learning_rate": 2.590978985135828e-05, "loss": 0.4584, "step": 18800 }, { "epoch": 2.4218349564325985, "grad_norm": 9.484577178955078, "learning_rate": 2.5781650435674016e-05, "loss": 0.4914, "step": 18900 }, { "epoch": 2.4346488980010252, "grad_norm": 10.865300178527832, "learning_rate": 2.565351101998975e-05, "loss": 0.4259, "step": 19000 }, { "epoch": 2.4474628395694515, "grad_norm": 16.69988441467285, "learning_rate": 2.5525371604305487e-05, "loss": 0.4563, "step": 19100 }, { "epoch": 2.460276781137878, "grad_norm": 19.711631774902344, "learning_rate": 2.5397232188621222e-05, "loss": 0.4159, "step": 19200 }, { "epoch": 2.4730907227063046, "grad_norm": 13.3755521774292, "learning_rate": 2.5269092772936958e-05, "loss": 0.537, "step": 19300 }, { "epoch": 2.485904664274731, "grad_norm": 6.953076362609863, "learning_rate": 2.5140953357252693e-05, "loss": 0.4288, "step": 19400 }, { "epoch": 2.498718605843157, "grad_norm": 47.91322708129883, "learning_rate": 2.5012813941568432e-05, "loss": 0.5049, "step": 19500 }, { "epoch": 2.511532547411584, "grad_norm": 1.6553832292556763, "learning_rate": 2.4884674525884164e-05, "loss": 0.4779, "step": 19600 }, { "epoch": 2.5243464889800102, "grad_norm": 12.199808120727539, "learning_rate": 2.47565351101999e-05, "loss": 0.4246, "step": 19700 }, { "epoch": 2.5371604305484365, "grad_norm": 11.326825141906738, "learning_rate": 2.4628395694515635e-05, "loss": 0.4482, "step": 19800 }, { "epoch": 2.5499743721168633, "grad_norm": 9.247246742248535, "learning_rate": 2.450025627883137e-05, "loss": 0.4656, "step": 19900 }, { "epoch": 2.5627883136852896, "grad_norm": 1.773540735244751, "learning_rate": 2.4372116863147106e-05, "loss": 0.4776, "step": 20000 }, { "epoch": 2.575602255253716, "grad_norm": 7.454749584197998, "learning_rate": 2.424397744746284e-05, "loss": 0.4161, "step": 20100 }, { "epoch": 2.5884161968221426, "grad_norm": 19.77891731262207, "learning_rate": 2.4115838031778577e-05, "loss": 0.4609, "step": 20200 }, { "epoch": 2.601230138390569, "grad_norm": 12.208200454711914, "learning_rate": 2.3987698616094312e-05, "loss": 0.453, "step": 20300 }, { "epoch": 2.6140440799589952, "grad_norm": 11.438812255859375, "learning_rate": 2.3859559200410047e-05, "loss": 0.4439, "step": 20400 }, { "epoch": 2.626858021527422, "grad_norm": 1.6863147020339966, "learning_rate": 2.3731419784725783e-05, "loss": 0.3987, "step": 20500 }, { "epoch": 2.6396719630958483, "grad_norm": 1.3637946844100952, "learning_rate": 2.3603280369041518e-05, "loss": 0.4523, "step": 20600 }, { "epoch": 2.6524859046642746, "grad_norm": 21.555208206176758, "learning_rate": 2.3475140953357254e-05, "loss": 0.4624, "step": 20700 }, { "epoch": 2.6652998462327013, "grad_norm": 8.768684387207031, "learning_rate": 2.334700153767299e-05, "loss": 0.4585, "step": 20800 }, { "epoch": 2.6781137878011276, "grad_norm": 3.2959704399108887, "learning_rate": 2.3218862121988724e-05, "loss": 0.4579, "step": 20900 }, { "epoch": 2.690927729369554, "grad_norm": 16.97565269470215, "learning_rate": 2.309072270630446e-05, "loss": 0.4132, "step": 21000 }, { "epoch": 2.7037416709379807, "grad_norm": 14.613641738891602, "learning_rate": 2.2962583290620195e-05, "loss": 0.4297, "step": 21100 }, { "epoch": 2.716555612506407, "grad_norm": 28.61090087890625, "learning_rate": 2.283444387493593e-05, "loss": 0.4479, "step": 21200 }, { "epoch": 2.7293695540748333, "grad_norm": 9.84257984161377, "learning_rate": 2.2706304459251666e-05, "loss": 0.4428, "step": 21300 }, { "epoch": 2.74218349564326, "grad_norm": 8.199345588684082, "learning_rate": 2.2578165043567405e-05, "loss": 0.3999, "step": 21400 }, { "epoch": 2.7549974372116863, "grad_norm": 15.411248207092285, "learning_rate": 2.2450025627883137e-05, "loss": 0.4423, "step": 21500 }, { "epoch": 2.7678113787801126, "grad_norm": 7.122200012207031, "learning_rate": 2.2321886212198872e-05, "loss": 0.4675, "step": 21600 }, { "epoch": 2.7806253203485394, "grad_norm": 11.358266830444336, "learning_rate": 2.2193746796514608e-05, "loss": 0.4885, "step": 21700 }, { "epoch": 2.7934392619169657, "grad_norm": 9.456644058227539, "learning_rate": 2.2065607380830343e-05, "loss": 0.4973, "step": 21800 }, { "epoch": 2.806253203485392, "grad_norm": 28.7235164642334, "learning_rate": 2.193746796514608e-05, "loss": 0.429, "step": 21900 }, { "epoch": 2.8190671450538183, "grad_norm": 14.859136581420898, "learning_rate": 2.1809328549461817e-05, "loss": 0.4867, "step": 22000 }, { "epoch": 2.831881086622245, "grad_norm": 3.089897394180298, "learning_rate": 2.1681189133777553e-05, "loss": 0.4249, "step": 22100 }, { "epoch": 2.8446950281906713, "grad_norm": 14.606719970703125, "learning_rate": 2.1553049718093288e-05, "loss": 0.4429, "step": 22200 }, { "epoch": 2.857508969759098, "grad_norm": 7.761451244354248, "learning_rate": 2.142491030240902e-05, "loss": 0.4639, "step": 22300 }, { "epoch": 2.8703229113275244, "grad_norm": 6.9101362228393555, "learning_rate": 2.1296770886724756e-05, "loss": 0.4606, "step": 22400 }, { "epoch": 2.8831368528959507, "grad_norm": 6.754969120025635, "learning_rate": 2.116863147104049e-05, "loss": 0.4784, "step": 22500 }, { "epoch": 2.895950794464377, "grad_norm": 20.884119033813477, "learning_rate": 2.104049205535623e-05, "loss": 0.4625, "step": 22600 }, { "epoch": 2.9087647360328037, "grad_norm": 18.428529739379883, "learning_rate": 2.0912352639671965e-05, "loss": 0.4121, "step": 22700 }, { "epoch": 2.92157867760123, "grad_norm": 9.211915969848633, "learning_rate": 2.07842132239877e-05, "loss": 0.457, "step": 22800 }, { "epoch": 2.9343926191696568, "grad_norm": 5.744906425476074, "learning_rate": 2.0656073808303436e-05, "loss": 0.4169, "step": 22900 }, { "epoch": 2.947206560738083, "grad_norm": 10.679366111755371, "learning_rate": 2.052793439261917e-05, "loss": 0.4719, "step": 23000 }, { "epoch": 2.9600205023065094, "grad_norm": 8.72630500793457, "learning_rate": 2.0399794976934904e-05, "loss": 0.4743, "step": 23100 }, { "epoch": 2.9728344438749357, "grad_norm": 5.53284215927124, "learning_rate": 2.0271655561250642e-05, "loss": 0.4592, "step": 23200 }, { "epoch": 2.9856483854433624, "grad_norm": 10.75283432006836, "learning_rate": 2.0143516145566378e-05, "loss": 0.3971, "step": 23300 }, { "epoch": 2.9984623270117887, "grad_norm": 10.634764671325684, "learning_rate": 2.0015376729882113e-05, "loss": 0.4295, "step": 23400 }, { "epoch": 3.0, "eval_f1": 0.6853715205850849, "eval_loss": 1.0191140174865723, "eval_runtime": 837.6905, "eval_samples_per_second": 9.316, "eval_steps_per_second": 2.329, "step": 23412 }, { "epoch": 3.0112762685802155, "grad_norm": 3.43902587890625, "learning_rate": 1.988723731419785e-05, "loss": 0.2448, "step": 23500 }, { "epoch": 3.0240902101486418, "grad_norm": 0.5649552941322327, "learning_rate": 1.9759097898513584e-05, "loss": 0.1908, "step": 23600 }, { "epoch": 3.036904151717068, "grad_norm": 1.3035610914230347, "learning_rate": 1.963095848282932e-05, "loss": 0.275, "step": 23700 }, { "epoch": 3.049718093285495, "grad_norm": 27.42232322692871, "learning_rate": 1.9502819067145055e-05, "loss": 0.2727, "step": 23800 }, { "epoch": 3.062532034853921, "grad_norm": 1.675907015800476, "learning_rate": 1.937467965146079e-05, "loss": 0.2916, "step": 23900 }, { "epoch": 3.0753459764223474, "grad_norm": 9.602179527282715, "learning_rate": 1.9246540235776526e-05, "loss": 0.2645, "step": 24000 }, { "epoch": 3.088159917990774, "grad_norm": 16.757831573486328, "learning_rate": 1.911840082009226e-05, "loss": 0.2476, "step": 24100 }, { "epoch": 3.1009738595592005, "grad_norm": 5.842043876647949, "learning_rate": 1.8990261404407997e-05, "loss": 0.2829, "step": 24200 }, { "epoch": 3.1137878011276268, "grad_norm": 0.593449592590332, "learning_rate": 1.8862121988723732e-05, "loss": 0.289, "step": 24300 }, { "epoch": 3.1266017426960535, "grad_norm": 5.712982177734375, "learning_rate": 1.8733982573039467e-05, "loss": 0.2355, "step": 24400 }, { "epoch": 3.13941568426448, "grad_norm": 0.3152589201927185, "learning_rate": 1.8605843157355203e-05, "loss": 0.2491, "step": 24500 }, { "epoch": 3.152229625832906, "grad_norm": 19.951833724975586, "learning_rate": 1.8477703741670938e-05, "loss": 0.271, "step": 24600 }, { "epoch": 3.165043567401333, "grad_norm": 5.257028579711914, "learning_rate": 1.8349564325986674e-05, "loss": 0.277, "step": 24700 }, { "epoch": 3.177857508969759, "grad_norm": 3.6717381477355957, "learning_rate": 1.822142491030241e-05, "loss": 0.2736, "step": 24800 }, { "epoch": 3.1906714505381855, "grad_norm": 38.49631881713867, "learning_rate": 1.8093285494618144e-05, "loss": 0.2789, "step": 24900 }, { "epoch": 3.2034853921066118, "grad_norm": 5.944704055786133, "learning_rate": 1.796514607893388e-05, "loss": 0.3111, "step": 25000 }, { "epoch": 3.2162993336750385, "grad_norm": 3.278078079223633, "learning_rate": 1.7837006663249615e-05, "loss": 0.287, "step": 25100 }, { "epoch": 3.229113275243465, "grad_norm": 13.320869445800781, "learning_rate": 1.7708867247565354e-05, "loss": 0.2708, "step": 25200 }, { "epoch": 3.2419272168118916, "grad_norm": 9.01321029663086, "learning_rate": 1.758072783188109e-05, "loss": 0.2891, "step": 25300 }, { "epoch": 3.254741158380318, "grad_norm": 14.35201644897461, "learning_rate": 1.745258841619682e-05, "loss": 0.1523, "step": 25400 }, { "epoch": 3.267555099948744, "grad_norm": 5.268370628356934, "learning_rate": 1.7324449000512557e-05, "loss": 0.3608, "step": 25500 }, { "epoch": 3.2803690415171705, "grad_norm": 3.338168144226074, "learning_rate": 1.7196309584828292e-05, "loss": 0.2829, "step": 25600 }, { "epoch": 3.293182983085597, "grad_norm": 12.441572189331055, "learning_rate": 1.7068170169144028e-05, "loss": 0.2563, "step": 25700 }, { "epoch": 3.3059969246540235, "grad_norm": 2.870978832244873, "learning_rate": 1.6940030753459767e-05, "loss": 0.2957, "step": 25800 }, { "epoch": 3.3188108662224503, "grad_norm": 10.626642227172852, "learning_rate": 1.6811891337775502e-05, "loss": 0.3493, "step": 25900 }, { "epoch": 3.3316248077908766, "grad_norm": 1.1796225309371948, "learning_rate": 1.6683751922091237e-05, "loss": 0.293, "step": 26000 }, { "epoch": 3.344438749359303, "grad_norm": 46.64753341674805, "learning_rate": 1.6555612506406973e-05, "loss": 0.2739, "step": 26100 }, { "epoch": 3.357252690927729, "grad_norm": 17.778207778930664, "learning_rate": 1.6427473090722705e-05, "loss": 0.2897, "step": 26200 }, { "epoch": 3.370066632496156, "grad_norm": 1.6698403358459473, "learning_rate": 1.629933367503844e-05, "loss": 0.2661, "step": 26300 }, { "epoch": 3.382880574064582, "grad_norm": 0.18206116557121277, "learning_rate": 1.617119425935418e-05, "loss": 0.2847, "step": 26400 }, { "epoch": 3.395694515633009, "grad_norm": 6.839690208435059, "learning_rate": 1.6043054843669915e-05, "loss": 0.3044, "step": 26500 }, { "epoch": 3.4085084572014352, "grad_norm": 0.6313930749893188, "learning_rate": 1.591491542798565e-05, "loss": 0.2623, "step": 26600 }, { "epoch": 3.4213223987698616, "grad_norm": 70.23905181884766, "learning_rate": 1.5786776012301385e-05, "loss": 0.2573, "step": 26700 }, { "epoch": 3.434136340338288, "grad_norm": 16.72913360595703, "learning_rate": 1.565863659661712e-05, "loss": 0.2626, "step": 26800 }, { "epoch": 3.4469502819067146, "grad_norm": 43.662845611572266, "learning_rate": 1.5530497180932856e-05, "loss": 0.2679, "step": 26900 }, { "epoch": 3.459764223475141, "grad_norm": 20.96466064453125, "learning_rate": 1.540235776524859e-05, "loss": 0.3082, "step": 27000 }, { "epoch": 3.4725781650435676, "grad_norm": 45.02407455444336, "learning_rate": 1.5274218349564327e-05, "loss": 0.2492, "step": 27100 }, { "epoch": 3.485392106611994, "grad_norm": 14.404077529907227, "learning_rate": 1.5146078933880062e-05, "loss": 0.2704, "step": 27200 }, { "epoch": 3.4982060481804202, "grad_norm": 19.40283966064453, "learning_rate": 1.5017939518195798e-05, "loss": 0.3089, "step": 27300 }, { "epoch": 3.5110199897488465, "grad_norm": 13.016902923583984, "learning_rate": 1.4889800102511533e-05, "loss": 0.2953, "step": 27400 }, { "epoch": 3.5238339313172733, "grad_norm": 6.934922695159912, "learning_rate": 1.4761660686827269e-05, "loss": 0.2132, "step": 27500 }, { "epoch": 3.5366478728856996, "grad_norm": 49.58895492553711, "learning_rate": 1.4633521271143006e-05, "loss": 0.271, "step": 27600 }, { "epoch": 3.5494618144541263, "grad_norm": 4.814508438110352, "learning_rate": 1.4505381855458741e-05, "loss": 0.3195, "step": 27700 }, { "epoch": 3.5622757560225526, "grad_norm": 28.65342903137207, "learning_rate": 1.4377242439774475e-05, "loss": 0.2869, "step": 27800 }, { "epoch": 3.575089697590979, "grad_norm": 5.931487083435059, "learning_rate": 1.424910302409021e-05, "loss": 0.2982, "step": 27900 }, { "epoch": 3.5879036391594052, "grad_norm": 0.22432470321655273, "learning_rate": 1.4120963608405946e-05, "loss": 0.3167, "step": 28000 }, { "epoch": 3.600717580727832, "grad_norm": 27.89299964904785, "learning_rate": 1.3992824192721681e-05, "loss": 0.2831, "step": 28100 }, { "epoch": 3.6135315222962583, "grad_norm": 6.232203006744385, "learning_rate": 1.3864684777037418e-05, "loss": 0.2328, "step": 28200 }, { "epoch": 3.626345463864685, "grad_norm": 0.3798358738422394, "learning_rate": 1.3736545361353154e-05, "loss": 0.2565, "step": 28300 }, { "epoch": 3.6391594054331113, "grad_norm": 2.3177566528320312, "learning_rate": 1.3608405945668889e-05, "loss": 0.2822, "step": 28400 }, { "epoch": 3.6519733470015376, "grad_norm": 0.9287611246109009, "learning_rate": 1.3480266529984623e-05, "loss": 0.2206, "step": 28500 }, { "epoch": 3.664787288569964, "grad_norm": 19.89398765563965, "learning_rate": 1.3352127114300358e-05, "loss": 0.2934, "step": 28600 }, { "epoch": 3.6776012301383907, "grad_norm": 14.735712051391602, "learning_rate": 1.3223987698616094e-05, "loss": 0.2667, "step": 28700 }, { "epoch": 3.690415171706817, "grad_norm": 2.782954454421997, "learning_rate": 1.309584828293183e-05, "loss": 0.2565, "step": 28800 }, { "epoch": 3.7032291132752437, "grad_norm": 20.082395553588867, "learning_rate": 1.2967708867247566e-05, "loss": 0.3069, "step": 28900 }, { "epoch": 3.71604305484367, "grad_norm": 1.8632967472076416, "learning_rate": 1.2839569451563302e-05, "loss": 0.2484, "step": 29000 }, { "epoch": 3.7288569964120963, "grad_norm": 6.2880330085754395, "learning_rate": 1.2711430035879037e-05, "loss": 0.2769, "step": 29100 }, { "epoch": 3.7416709379805226, "grad_norm": 18.328922271728516, "learning_rate": 1.2583290620194774e-05, "loss": 0.284, "step": 29200 }, { "epoch": 3.7544848795489494, "grad_norm": 0.2658964991569519, "learning_rate": 1.2455151204510508e-05, "loss": 0.2725, "step": 29300 }, { "epoch": 3.7672988211173757, "grad_norm": 7.819123268127441, "learning_rate": 1.2327011788826243e-05, "loss": 0.2513, "step": 29400 }, { "epoch": 3.7801127626858024, "grad_norm": 4.6279144287109375, "learning_rate": 1.2198872373141979e-05, "loss": 0.2573, "step": 29500 }, { "epoch": 3.7929267042542287, "grad_norm": 24.996662139892578, "learning_rate": 1.2070732957457714e-05, "loss": 0.2621, "step": 29600 }, { "epoch": 3.805740645822655, "grad_norm": 20.87746810913086, "learning_rate": 1.194259354177345e-05, "loss": 0.2499, "step": 29700 }, { "epoch": 3.8185545873910813, "grad_norm": 1.5061414241790771, "learning_rate": 1.1814454126089187e-05, "loss": 0.265, "step": 29800 }, { "epoch": 3.831368528959508, "grad_norm": 2.7230064868927, "learning_rate": 1.168631471040492e-05, "loss": 0.2469, "step": 29900 }, { "epoch": 3.8441824705279344, "grad_norm": 0.6768075823783875, "learning_rate": 1.1558175294720656e-05, "loss": 0.2686, "step": 30000 }, { "epoch": 3.8569964120963607, "grad_norm": 0.08343211561441422, "learning_rate": 1.1430035879036393e-05, "loss": 0.2565, "step": 30100 }, { "epoch": 3.8698103536647874, "grad_norm": 25.58348274230957, "learning_rate": 1.1301896463352128e-05, "loss": 0.2967, "step": 30200 }, { "epoch": 3.8826242952332137, "grad_norm": 1.0459709167480469, "learning_rate": 1.1173757047667862e-05, "loss": 0.3028, "step": 30300 }, { "epoch": 3.89543823680164, "grad_norm": 0.33878639340400696, "learning_rate": 1.1045617631983599e-05, "loss": 0.2243, "step": 30400 }, { "epoch": 3.9082521783700668, "grad_norm": 2.021047592163086, "learning_rate": 1.0917478216299335e-05, "loss": 0.3656, "step": 30500 }, { "epoch": 3.921066119938493, "grad_norm": 1.6855653524398804, "learning_rate": 1.078933880061507e-05, "loss": 0.2323, "step": 30600 }, { "epoch": 3.9338800615069194, "grad_norm": 21.66104507446289, "learning_rate": 1.0661199384930805e-05, "loss": 0.2205, "step": 30700 }, { "epoch": 3.946694003075346, "grad_norm": 2.4428458213806152, "learning_rate": 1.053305996924654e-05, "loss": 0.2436, "step": 30800 }, { "epoch": 3.9595079446437724, "grad_norm": 39.37623596191406, "learning_rate": 1.0404920553562276e-05, "loss": 0.2831, "step": 30900 }, { "epoch": 3.9723218862121987, "grad_norm": 44.4313850402832, "learning_rate": 1.0276781137878012e-05, "loss": 0.2522, "step": 31000 }, { "epoch": 3.9851358277806255, "grad_norm": 2.6004929542541504, "learning_rate": 1.0148641722193747e-05, "loss": 0.3209, "step": 31100 }, { "epoch": 3.9979497693490518, "grad_norm": 2.536029815673828, "learning_rate": 1.0020502306509482e-05, "loss": 0.2807, "step": 31200 }, { "epoch": 4.0, "eval_f1": 0.6869426704202687, "eval_loss": 2.03011155128479, "eval_runtime": 825.2142, "eval_samples_per_second": 9.457, "eval_steps_per_second": 2.364, "step": 31216 } ], "logging_steps": 100, "max_steps": 39020, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.105948110057636e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }