{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 730, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013717421124828532, "grad_norm": 1.2567085027694702, "learning_rate": 1.3043478260869566e-06, "loss": 1.2424, "step": 5 }, { "epoch": 0.027434842249657063, "grad_norm": 1.181019902229309, "learning_rate": 2.9347826086956523e-06, "loss": 1.2733, "step": 10 }, { "epoch": 0.0411522633744856, "grad_norm": 0.6036953926086426, "learning_rate": 4.565217391304348e-06, "loss": 1.2413, "step": 15 }, { "epoch": 0.05486968449931413, "grad_norm": 0.8439048528671265, "learning_rate": 6.195652173913044e-06, "loss": 1.2214, "step": 20 }, { "epoch": 0.06858710562414266, "grad_norm": 0.5841037631034851, "learning_rate": 7.826086956521738e-06, "loss": 1.2187, "step": 25 }, { "epoch": 0.0823045267489712, "grad_norm": 0.6085624694824219, "learning_rate": 9.456521739130436e-06, "loss": 1.1609, "step": 30 }, { "epoch": 0.09602194787379972, "grad_norm": 0.6230579018592834, "learning_rate": 1.108695652173913e-05, "loss": 1.1653, "step": 35 }, { "epoch": 0.10973936899862825, "grad_norm": 0.5125408172607422, "learning_rate": 1.2717391304347827e-05, "loss": 1.1442, "step": 40 }, { "epoch": 0.12345679012345678, "grad_norm": 0.4418923258781433, "learning_rate": 1.4347826086956522e-05, "loss": 1.135, "step": 45 }, { "epoch": 0.13717421124828533, "grad_norm": 0.39918941259384155, "learning_rate": 1.597826086956522e-05, "loss": 1.15, "step": 50 }, { "epoch": 0.15089163237311384, "grad_norm": 0.4432643949985504, "learning_rate": 1.7608695652173915e-05, "loss": 1.1159, "step": 55 }, { "epoch": 0.1646090534979424, "grad_norm": 0.7146240472793579, "learning_rate": 1.9239130434782607e-05, "loss": 1.1433, "step": 60 }, { "epoch": 0.17832647462277093, "grad_norm": 0.6695220470428467, "learning_rate": 2.0869565217391306e-05, "loss": 1.1032, "step": 65 }, { "epoch": 0.19204389574759945, "grad_norm": 0.489704966545105, "learning_rate": 2.25e-05, "loss": 1.0569, "step": 70 }, { "epoch": 0.205761316872428, "grad_norm": 0.4658520817756653, "learning_rate": 2.4130434782608697e-05, "loss": 1.0715, "step": 75 }, { "epoch": 0.2194787379972565, "grad_norm": 0.5942860245704651, "learning_rate": 2.5760869565217392e-05, "loss": 1.0534, "step": 80 }, { "epoch": 0.23319615912208505, "grad_norm": 0.48524120450019836, "learning_rate": 2.7391304347826085e-05, "loss": 1.1362, "step": 85 }, { "epoch": 0.24691358024691357, "grad_norm": 0.41874566674232483, "learning_rate": 2.9021739130434783e-05, "loss": 1.0455, "step": 90 }, { "epoch": 0.2606310013717421, "grad_norm": 0.49967625737190247, "learning_rate": 2.999990141214925e-05, "loss": 1.0053, "step": 95 }, { "epoch": 0.27434842249657065, "grad_norm": 0.529108464717865, "learning_rate": 2.999879231371134e-05, "loss": 1.0228, "step": 100 }, { "epoch": 0.2880658436213992, "grad_norm": 0.5769373774528503, "learning_rate": 2.9996450973444988e-05, "loss": 1.0495, "step": 105 }, { "epoch": 0.3017832647462277, "grad_norm": 0.6768115162849426, "learning_rate": 2.999287758370551e-05, "loss": 0.9451, "step": 110 }, { "epoch": 0.31550068587105623, "grad_norm": 0.6151679754257202, "learning_rate": 2.998807243806856e-05, "loss": 1.0238, "step": 115 }, { "epoch": 0.3292181069958848, "grad_norm": 0.6618012189865112, "learning_rate": 2.998203593130602e-05, "loss": 1.0144, "step": 120 }, { "epoch": 0.3429355281207133, "grad_norm": 0.5754362344741821, "learning_rate": 2.9974768559353564e-05, "loss": 0.9812, "step": 125 }, { "epoch": 0.35665294924554186, "grad_norm": 0.614632785320282, "learning_rate": 2.99662709192699e-05, "loss": 0.9152, "step": 130 }, { "epoch": 0.37037037037037035, "grad_norm": 0.5445948839187622, "learning_rate": 2.995654370918775e-05, "loss": 1.0159, "step": 135 }, { "epoch": 0.3840877914951989, "grad_norm": 0.592310905456543, "learning_rate": 2.9945587728256456e-05, "loss": 0.9158, "step": 140 }, { "epoch": 0.39780521262002744, "grad_norm": 0.6413975954055786, "learning_rate": 2.9933403876576364e-05, "loss": 0.938, "step": 145 }, { "epoch": 0.411522633744856, "grad_norm": 0.5976924300193787, "learning_rate": 2.9919993155124834e-05, "loss": 0.8984, "step": 150 }, { "epoch": 0.4252400548696845, "grad_norm": 0.6372507810592651, "learning_rate": 2.990535666567403e-05, "loss": 0.8719, "step": 155 }, { "epoch": 0.438957475994513, "grad_norm": 0.5930208563804626, "learning_rate": 2.9889495610700416e-05, "loss": 0.9289, "step": 160 }, { "epoch": 0.45267489711934156, "grad_norm": 0.6256594061851501, "learning_rate": 2.9872411293285916e-05, "loss": 0.852, "step": 165 }, { "epoch": 0.4663923182441701, "grad_norm": 0.6492271423339844, "learning_rate": 2.985410511701092e-05, "loss": 0.8581, "step": 170 }, { "epoch": 0.48010973936899864, "grad_norm": 0.7370251417160034, "learning_rate": 2.9834578585838907e-05, "loss": 0.9052, "step": 175 }, { "epoch": 0.49382716049382713, "grad_norm": 0.7324991226196289, "learning_rate": 2.9813833303992948e-05, "loss": 0.8242, "step": 180 }, { "epoch": 0.5075445816186557, "grad_norm": 0.7410762906074524, "learning_rate": 2.979187097582386e-05, "loss": 0.8182, "step": 185 }, { "epoch": 0.5212620027434842, "grad_norm": 0.6918640732765198, "learning_rate": 2.976869340567021e-05, "loss": 0.838, "step": 190 }, { "epoch": 0.5349794238683128, "grad_norm": 2.690075159072876, "learning_rate": 2.9744302497710076e-05, "loss": 0.8393, "step": 195 }, { "epoch": 0.5486968449931413, "grad_norm": 0.7638638019561768, "learning_rate": 2.9718700255804588e-05, "loss": 0.8171, "step": 200 }, { "epoch": 0.5624142661179699, "grad_norm": 0.8737165331840515, "learning_rate": 2.969188878333332e-05, "loss": 0.8264, "step": 205 }, { "epoch": 0.5761316872427984, "grad_norm": 0.7183387875556946, "learning_rate": 2.9663870283021477e-05, "loss": 0.8421, "step": 210 }, { "epoch": 0.5898491083676269, "grad_norm": 0.8239722847938538, "learning_rate": 2.9634647056758927e-05, "loss": 0.7993, "step": 215 }, { "epoch": 0.6035665294924554, "grad_norm": 0.8498063683509827, "learning_rate": 2.960422150541109e-05, "loss": 0.8239, "step": 220 }, { "epoch": 0.6172839506172839, "grad_norm": 1.0519356727600098, "learning_rate": 2.9572596128621683e-05, "loss": 0.7706, "step": 225 }, { "epoch": 0.6310013717421125, "grad_norm": 0.844680666923523, "learning_rate": 2.9539773524607373e-05, "loss": 0.7471, "step": 230 }, { "epoch": 0.644718792866941, "grad_norm": 0.7600374221801758, "learning_rate": 2.95057563899443e-05, "loss": 0.7894, "step": 235 }, { "epoch": 0.6584362139917695, "grad_norm": 0.8541322946548462, "learning_rate": 2.947054751934656e-05, "loss": 0.7903, "step": 240 }, { "epoch": 0.6721536351165981, "grad_norm": 0.7854955196380615, "learning_rate": 2.9434149805436586e-05, "loss": 0.7754, "step": 245 }, { "epoch": 0.6858710562414266, "grad_norm": 0.7242818474769592, "learning_rate": 2.9396566238507496e-05, "loss": 0.7455, "step": 250 }, { "epoch": 0.6995884773662552, "grad_norm": 0.9281851053237915, "learning_rate": 2.935779990627744e-05, "loss": 0.7562, "step": 255 }, { "epoch": 0.7133058984910837, "grad_norm": 0.8336049318313599, "learning_rate": 2.931785399363592e-05, "loss": 0.7557, "step": 260 }, { "epoch": 0.7270233196159122, "grad_norm": 0.967589795589447, "learning_rate": 2.9276731782382123e-05, "loss": 0.7062, "step": 265 }, { "epoch": 0.7407407407407407, "grad_norm": 0.8542262315750122, "learning_rate": 2.9234436650955297e-05, "loss": 0.7184, "step": 270 }, { "epoch": 0.7544581618655692, "grad_norm": 0.9164705276489258, "learning_rate": 2.9190972074157232e-05, "loss": 0.6814, "step": 275 }, { "epoch": 0.7681755829903978, "grad_norm": 0.8561894297599792, "learning_rate": 2.9146341622866716e-05, "loss": 0.6944, "step": 280 }, { "epoch": 0.7818930041152263, "grad_norm": 0.9264963865280151, "learning_rate": 2.910054896374623e-05, "loss": 0.6571, "step": 285 }, { "epoch": 0.7956104252400549, "grad_norm": 0.8652317523956299, "learning_rate": 2.9053597858940666e-05, "loss": 0.7355, "step": 290 }, { "epoch": 0.8093278463648834, "grad_norm": 0.9738150835037231, "learning_rate": 2.9005492165768278e-05, "loss": 0.6453, "step": 295 }, { "epoch": 0.823045267489712, "grad_norm": 1.012303352355957, "learning_rate": 2.895623583640375e-05, "loss": 0.6413, "step": 300 }, { "epoch": 0.8367626886145405, "grad_norm": 0.8940839767456055, "learning_rate": 2.890583291755351e-05, "loss": 0.7145, "step": 305 }, { "epoch": 0.850480109739369, "grad_norm": 0.8137922883033752, "learning_rate": 2.8854287550123278e-05, "loss": 0.6835, "step": 310 }, { "epoch": 0.8641975308641975, "grad_norm": 0.8530572056770325, "learning_rate": 2.880160396887787e-05, "loss": 0.6702, "step": 315 }, { "epoch": 0.877914951989026, "grad_norm": 0.9513605237007141, "learning_rate": 2.8747786502093258e-05, "loss": 0.7024, "step": 320 }, { "epoch": 0.8916323731138546, "grad_norm": 0.899515688419342, "learning_rate": 2.8692839571201e-05, "loss": 0.6845, "step": 325 }, { "epoch": 0.9053497942386831, "grad_norm": 0.9002428650856018, "learning_rate": 2.863676769042498e-05, "loss": 0.6447, "step": 330 }, { "epoch": 0.9190672153635117, "grad_norm": 0.8433024883270264, "learning_rate": 2.8579575466410566e-05, "loss": 0.6325, "step": 335 }, { "epoch": 0.9327846364883402, "grad_norm": 0.9360433220863342, "learning_rate": 2.8521267597846094e-05, "loss": 0.5826, "step": 340 }, { "epoch": 0.9465020576131687, "grad_norm": 0.9888073205947876, "learning_rate": 2.8461848875076884e-05, "loss": 0.6349, "step": 345 }, { "epoch": 0.9602194787379973, "grad_norm": 0.914045512676239, "learning_rate": 2.8401324179711678e-05, "loss": 0.5755, "step": 350 }, { "epoch": 0.9739368998628258, "grad_norm": 0.9859684705734253, "learning_rate": 2.8339698484221574e-05, "loss": 0.6325, "step": 355 }, { "epoch": 0.9876543209876543, "grad_norm": 0.9974327683448792, "learning_rate": 2.827697685153151e-05, "loss": 0.5818, "step": 360 }, { "epoch": 1.0, "grad_norm": 1.362316370010376, "learning_rate": 2.8213164434604316e-05, "loss": 0.5783, "step": 365 }, { "epoch": 1.0137174211248285, "grad_norm": 1.4332367181777954, "learning_rate": 2.814826647601738e-05, "loss": 0.5402, "step": 370 }, { "epoch": 1.027434842249657, "grad_norm": 1.2112072706222534, "learning_rate": 2.8082288307531914e-05, "loss": 0.5368, "step": 375 }, { "epoch": 1.0411522633744856, "grad_norm": 1.2344884872436523, "learning_rate": 2.8015235349654938e-05, "loss": 0.5097, "step": 380 }, { "epoch": 1.0548696844993142, "grad_norm": 0.9244922995567322, "learning_rate": 2.7947113111193936e-05, "loss": 0.5583, "step": 385 }, { "epoch": 1.0685871056241427, "grad_norm": 0.980315089225769, "learning_rate": 2.7877927188804288e-05, "loss": 0.51, "step": 390 }, { "epoch": 1.0823045267489713, "grad_norm": 0.9603582620620728, "learning_rate": 2.7807683266529466e-05, "loss": 0.5517, "step": 395 }, { "epoch": 1.0960219478737998, "grad_norm": 1.0928096771240234, "learning_rate": 2.773638711533405e-05, "loss": 0.5018, "step": 400 }, { "epoch": 1.1097393689986284, "grad_norm": 1.0439014434814453, "learning_rate": 2.7664044592629615e-05, "loss": 0.5139, "step": 405 }, { "epoch": 1.123456790123457, "grad_norm": 0.9458956718444824, "learning_rate": 2.7590661641793513e-05, "loss": 0.5583, "step": 410 }, { "epoch": 1.1371742112482854, "grad_norm": 0.9447291493415833, "learning_rate": 2.7516244291680565e-05, "loss": 0.4616, "step": 415 }, { "epoch": 1.1508916323731138, "grad_norm": 1.076687216758728, "learning_rate": 2.7440798656127792e-05, "loss": 0.4695, "step": 420 }, { "epoch": 1.1646090534979423, "grad_norm": 1.0236420631408691, "learning_rate": 2.7364330933452094e-05, "loss": 0.5455, "step": 425 }, { "epoch": 1.1783264746227708, "grad_norm": 1.0733507871627808, "learning_rate": 2.7286847405941024e-05, "loss": 0.4956, "step": 430 }, { "epoch": 1.1920438957475994, "grad_norm": 0.9793803095817566, "learning_rate": 2.720835443933669e-05, "loss": 0.5202, "step": 435 }, { "epoch": 1.205761316872428, "grad_norm": 1.0080183744430542, "learning_rate": 2.712885848231273e-05, "loss": 0.4865, "step": 440 }, { "epoch": 1.2194787379972565, "grad_norm": 0.9846312999725342, "learning_rate": 2.7048366065944538e-05, "loss": 0.4843, "step": 445 }, { "epoch": 1.233196159122085, "grad_norm": 1.0748393535614014, "learning_rate": 2.6966883803172698e-05, "loss": 0.4714, "step": 450 }, { "epoch": 1.2469135802469136, "grad_norm": 1.0638792514801025, "learning_rate": 2.6884418388259675e-05, "loss": 0.5295, "step": 455 }, { "epoch": 1.260631001371742, "grad_norm": 1.0708755254745483, "learning_rate": 2.6800976596239855e-05, "loss": 0.4713, "step": 460 }, { "epoch": 1.2743484224965707, "grad_norm": 0.9797709584236145, "learning_rate": 2.6716565282362928e-05, "loss": 0.4489, "step": 465 }, { "epoch": 1.2880658436213992, "grad_norm": 1.017024278640747, "learning_rate": 2.663119138153069e-05, "loss": 0.4699, "step": 470 }, { "epoch": 1.3017832647462277, "grad_norm": 1.1177973747253418, "learning_rate": 2.654486190772729e-05, "loss": 0.4354, "step": 475 }, { "epoch": 1.3155006858710563, "grad_norm": 0.9993549585342407, "learning_rate": 2.6457583953443022e-05, "loss": 0.4882, "step": 480 }, { "epoch": 1.3292181069958848, "grad_norm": 1.1262377500534058, "learning_rate": 2.636936468909158e-05, "loss": 0.41, "step": 485 }, { "epoch": 1.3429355281207134, "grad_norm": 0.964245080947876, "learning_rate": 2.628021136242101e-05, "loss": 0.4364, "step": 490 }, { "epoch": 1.356652949245542, "grad_norm": 1.0607110261917114, "learning_rate": 2.619013129791823e-05, "loss": 0.4891, "step": 495 }, { "epoch": 1.3703703703703702, "grad_norm": 1.0203349590301514, "learning_rate": 2.6099131896207327e-05, "loss": 0.4341, "step": 500 }, { "epoch": 1.3840877914951988, "grad_norm": 0.9529250860214233, "learning_rate": 2.6007220633441486e-05, "loss": 0.4266, "step": 505 }, { "epoch": 1.3978052126200273, "grad_norm": 0.9978516697883606, "learning_rate": 2.591440506068883e-05, "loss": 0.434, "step": 510 }, { "epoch": 1.4115226337448559, "grad_norm": 1.03233003616333, "learning_rate": 2.582069280331204e-05, "loss": 0.3978, "step": 515 }, { "epoch": 1.4252400548696844, "grad_norm": 1.2834852933883667, "learning_rate": 2.5726091560341873e-05, "loss": 0.4496, "step": 520 }, { "epoch": 1.438957475994513, "grad_norm": 1.0178697109222412, "learning_rate": 2.5630609103844646e-05, "loss": 0.433, "step": 525 }, { "epoch": 1.4526748971193415, "grad_norm": 1.0839297771453857, "learning_rate": 2.5534253278283725e-05, "loss": 0.4494, "step": 530 }, { "epoch": 1.46639231824417, "grad_norm": 0.9672828316688538, "learning_rate": 2.5437031999875047e-05, "loss": 0.4438, "step": 535 }, { "epoch": 1.4801097393689986, "grad_norm": 0.9954227805137634, "learning_rate": 2.533895325593674e-05, "loss": 0.4253, "step": 540 }, { "epoch": 1.4938271604938271, "grad_norm": 0.9949136972427368, "learning_rate": 2.5240025104232938e-05, "loss": 0.4565, "step": 545 }, { "epoch": 1.5075445816186557, "grad_norm": 1.2475636005401611, "learning_rate": 2.514025567231178e-05, "loss": 0.397, "step": 550 }, { "epoch": 1.5212620027434842, "grad_norm": 1.0037999153137207, "learning_rate": 2.5039653156837686e-05, "loss": 0.3955, "step": 555 }, { "epoch": 1.5349794238683128, "grad_norm": 1.2002379894256592, "learning_rate": 2.4938225822917932e-05, "loss": 0.3541, "step": 560 }, { "epoch": 1.5486968449931413, "grad_norm": 1.0180314779281616, "learning_rate": 2.4835982003423654e-05, "loss": 0.403, "step": 565 }, { "epoch": 1.5624142661179699, "grad_norm": 1.017683982849121, "learning_rate": 2.473293009830522e-05, "loss": 0.4082, "step": 570 }, { "epoch": 1.5761316872427984, "grad_norm": 1.055148720741272, "learning_rate": 2.4629078573902136e-05, "loss": 0.4118, "step": 575 }, { "epoch": 1.589849108367627, "grad_norm": 1.082885980606079, "learning_rate": 2.45244359622475e-05, "loss": 0.3646, "step": 580 }, { "epoch": 1.6035665294924555, "grad_norm": 0.9864196181297302, "learning_rate": 2.4419010860367013e-05, "loss": 0.3726, "step": 585 }, { "epoch": 1.617283950617284, "grad_norm": 1.0728785991668701, "learning_rate": 2.431281192957271e-05, "loss": 0.3665, "step": 590 }, { "epoch": 1.6310013717421126, "grad_norm": 1.0340756177902222, "learning_rate": 2.4205847894751358e-05, "loss": 0.402, "step": 595 }, { "epoch": 1.6447187928669411, "grad_norm": 0.9732591509819031, "learning_rate": 2.409812754364768e-05, "loss": 0.3913, "step": 600 }, { "epoch": 1.6584362139917697, "grad_norm": 0.9975690245628357, "learning_rate": 2.398965972614235e-05, "loss": 0.356, "step": 605 }, { "epoch": 1.6721536351165982, "grad_norm": 1.0766946077346802, "learning_rate": 2.3880453353524963e-05, "loss": 0.3921, "step": 610 }, { "epoch": 1.6858710562414267, "grad_norm": 1.0128309726715088, "learning_rate": 2.377051739776189e-05, "loss": 0.4264, "step": 615 }, { "epoch": 1.6995884773662553, "grad_norm": 1.0055245161056519, "learning_rate": 2.3659860890759184e-05, "loss": 0.3296, "step": 620 }, { "epoch": 1.7133058984910838, "grad_norm": 1.058605432510376, "learning_rate": 2.3548492923620567e-05, "loss": 0.3617, "step": 625 }, { "epoch": 1.7270233196159122, "grad_norm": 1.0829980373382568, "learning_rate": 2.343642264590051e-05, "loss": 0.3801, "step": 630 }, { "epoch": 1.7407407407407407, "grad_norm": 1.2426400184631348, "learning_rate": 2.3323659264852586e-05, "loss": 0.3491, "step": 635 }, { "epoch": 1.7544581618655692, "grad_norm": 1.0566037893295288, "learning_rate": 2.3210212044672995e-05, "loss": 0.3547, "step": 640 }, { "epoch": 1.7681755829903978, "grad_norm": 0.9220610857009888, "learning_rate": 2.3096090305739476e-05, "loss": 0.3682, "step": 645 }, { "epoch": 1.7818930041152263, "grad_norm": 1.124056100845337, "learning_rate": 2.298130342384559e-05, "loss": 0.3236, "step": 650 }, { "epoch": 1.7956104252400549, "grad_norm": 1.10678231716156, "learning_rate": 2.2865860829430405e-05, "loss": 0.359, "step": 655 }, { "epoch": 1.8093278463648834, "grad_norm": 1.1028835773468018, "learning_rate": 2.2749772006803782e-05, "loss": 0.3276, "step": 660 }, { "epoch": 1.823045267489712, "grad_norm": 0.9391087889671326, "learning_rate": 2.2633046493367128e-05, "loss": 0.3589, "step": 665 }, { "epoch": 1.8367626886145405, "grad_norm": 1.010254979133606, "learning_rate": 2.2515693878829872e-05, "loss": 0.3517, "step": 670 }, { "epoch": 1.850480109739369, "grad_norm": 1.0423787832260132, "learning_rate": 2.2397723804421613e-05, "loss": 0.32, "step": 675 }, { "epoch": 1.8641975308641974, "grad_norm": 1.1941534280776978, "learning_rate": 2.227914596210002e-05, "loss": 0.3128, "step": 680 }, { "epoch": 1.877914951989026, "grad_norm": 0.9325462579727173, "learning_rate": 2.2159970093754583e-05, "loss": 0.3595, "step": 685 }, { "epoch": 1.8916323731138545, "grad_norm": 1.2271301746368408, "learning_rate": 2.2040205990406257e-05, "loss": 0.3118, "step": 690 }, { "epoch": 1.905349794238683, "grad_norm": 0.9768636226654053, "learning_rate": 2.1919863491403083e-05, "loss": 0.3858, "step": 695 }, { "epoch": 1.9190672153635115, "grad_norm": 1.3226031064987183, "learning_rate": 2.1798952483611812e-05, "loss": 0.3268, "step": 700 }, { "epoch": 1.93278463648834, "grad_norm": 0.8924455046653748, "learning_rate": 2.167748290060564e-05, "loss": 0.2887, "step": 705 }, { "epoch": 1.9465020576131686, "grad_norm": 1.220363974571228, "learning_rate": 2.1555464721848107e-05, "loss": 0.3174, "step": 710 }, { "epoch": 1.9602194787379972, "grad_norm": 1.1989498138427734, "learning_rate": 2.1432907971873225e-05, "loss": 0.3026, "step": 715 }, { "epoch": 1.9739368998628257, "grad_norm": 1.065783977508545, "learning_rate": 2.1309822719461905e-05, "loss": 0.3121, "step": 720 }, { "epoch": 1.9876543209876543, "grad_norm": 1.019674301147461, "learning_rate": 2.118621907681474e-05, "loss": 0.2984, "step": 725 }, { "epoch": 2.0, "grad_norm": 1.3716942071914673, "learning_rate": 2.106210719872121e-05, "loss": 0.2699, "step": 730 } ], "logging_steps": 5, "max_steps": 1825, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1214531076208722e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }