{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998997292690264, "eval_steps": 500, "global_step": 831, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012032487716835455, "grad_norm": 1.0745393392299216, "learning_rate": 1.1904761904761906e-07, "loss": 0.7396, "step": 1 }, { "epoch": 0.006016243858417728, "grad_norm": 1.0096667298005468, "learning_rate": 5.952380952380953e-07, "loss": 0.7345, "step": 5 }, { "epoch": 0.012032487716835455, "grad_norm": 0.5432840285599178, "learning_rate": 1.1904761904761906e-06, "loss": 0.6995, "step": 10 }, { "epoch": 0.018048731575253184, "grad_norm": 0.5379885594965839, "learning_rate": 1.7857142857142859e-06, "loss": 0.5364, "step": 15 }, { "epoch": 0.02406497543367091, "grad_norm": 0.28279328593216746, "learning_rate": 2.380952380952381e-06, "loss": 0.3438, "step": 20 }, { "epoch": 0.03008121929208864, "grad_norm": 0.11670701576343243, "learning_rate": 2.9761904761904763e-06, "loss": 0.2194, "step": 25 }, { "epoch": 0.03609746315050637, "grad_norm": 0.09598554828698612, "learning_rate": 3.5714285714285718e-06, "loss": 0.1655, "step": 30 }, { "epoch": 0.0421137070089241, "grad_norm": 0.07370159012965125, "learning_rate": 4.166666666666667e-06, "loss": 0.1433, "step": 35 }, { "epoch": 0.04812995086734182, "grad_norm": 0.05410564768989329, "learning_rate": 4.761904761904762e-06, "loss": 0.1227, "step": 40 }, { "epoch": 0.05414619472575955, "grad_norm": 0.049586846542590546, "learning_rate": 5.357142857142857e-06, "loss": 0.1097, "step": 45 }, { "epoch": 0.06016243858417728, "grad_norm": 0.048103082558905504, "learning_rate": 5.9523809523809525e-06, "loss": 0.0978, "step": 50 }, { "epoch": 0.066178682442595, "grad_norm": 0.05669271340624084, "learning_rate": 6.547619047619048e-06, "loss": 0.0817, "step": 55 }, { "epoch": 0.07219492630101274, "grad_norm": 0.054915884901381114, "learning_rate": 7.1428571428571436e-06, "loss": 0.0801, "step": 60 }, { "epoch": 0.07821117015943047, "grad_norm": 0.06021281263850561, "learning_rate": 7.738095238095238e-06, "loss": 0.0702, "step": 65 }, { "epoch": 0.0842274140178482, "grad_norm": 0.039388545955733205, "learning_rate": 8.333333333333334e-06, "loss": 0.0619, "step": 70 }, { "epoch": 0.09024365787626591, "grad_norm": 0.044736289577753, "learning_rate": 8.92857142857143e-06, "loss": 0.0556, "step": 75 }, { "epoch": 0.09625990173468364, "grad_norm": 0.04237369775091706, "learning_rate": 9.523809523809525e-06, "loss": 0.0523, "step": 80 }, { "epoch": 0.10227614559310137, "grad_norm": 0.04699046038689833, "learning_rate": 9.999955782120656e-06, "loss": 0.0497, "step": 85 }, { "epoch": 0.1082923894515191, "grad_norm": 0.03964650441405405, "learning_rate": 9.99840823846134e-06, "loss": 0.0458, "step": 90 }, { "epoch": 0.11430863330993683, "grad_norm": 0.03621563631742996, "learning_rate": 9.994650582860978e-06, "loss": 0.0477, "step": 95 }, { "epoch": 0.12032487716835456, "grad_norm": 0.0268237826645417, "learning_rate": 9.98868447681642e-06, "loss": 0.0439, "step": 100 }, { "epoch": 0.12634112102677228, "grad_norm": 0.028555915124659773, "learning_rate": 9.980512558319915e-06, "loss": 0.0409, "step": 105 }, { "epoch": 0.13235736488519, "grad_norm": 0.026049496996790083, "learning_rate": 9.970138440692706e-06, "loss": 0.0414, "step": 110 }, { "epoch": 0.13837360874360774, "grad_norm": 0.03147552348817025, "learning_rate": 9.957566710987338e-06, "loss": 0.0406, "step": 115 }, { "epoch": 0.14438985260202547, "grad_norm": 0.027902049215008537, "learning_rate": 9.942802927959444e-06, "loss": 0.0392, "step": 120 }, { "epoch": 0.1504060964604432, "grad_norm": 0.028174099496676555, "learning_rate": 9.925853619609858e-06, "loss": 0.0339, "step": 125 }, { "epoch": 0.15642234031886093, "grad_norm": 0.025957912802832783, "learning_rate": 9.906726280298185e-06, "loss": 0.0365, "step": 130 }, { "epoch": 0.16243858417727866, "grad_norm": 0.026511579347961597, "learning_rate": 9.885429367429062e-06, "loss": 0.0365, "step": 135 }, { "epoch": 0.1684548280356964, "grad_norm": 0.02801670507212045, "learning_rate": 9.861972297712606e-06, "loss": 0.0343, "step": 140 }, { "epoch": 0.17447107189411412, "grad_norm": 0.023525196780612226, "learning_rate": 9.836365443000697e-06, "loss": 0.0331, "step": 145 }, { "epoch": 0.18048731575253182, "grad_norm": 0.02456235083949215, "learning_rate": 9.808620125700925e-06, "loss": 0.0335, "step": 150 }, { "epoch": 0.18650355961094955, "grad_norm": 0.021953324848861307, "learning_rate": 9.778748613770234e-06, "loss": 0.0313, "step": 155 }, { "epoch": 0.19251980346936728, "grad_norm": 0.02914752771577575, "learning_rate": 9.746764115290496e-06, "loss": 0.0354, "step": 160 }, { "epoch": 0.19853604732778501, "grad_norm": 0.0241041372740356, "learning_rate": 9.712680772628365e-06, "loss": 0.0338, "step": 165 }, { "epoch": 0.20455229118620274, "grad_norm": 0.02385726337646924, "learning_rate": 9.676513656182059e-06, "loss": 0.0343, "step": 170 }, { "epoch": 0.21056853504462048, "grad_norm": 0.021532109435525824, "learning_rate": 9.63827875771778e-06, "loss": 0.0317, "step": 175 }, { "epoch": 0.2165847789030382, "grad_norm": 0.02462556259506109, "learning_rate": 9.597992983298748e-06, "loss": 0.0299, "step": 180 }, { "epoch": 0.22260102276145594, "grad_norm": 0.022316517368094545, "learning_rate": 9.55567414580995e-06, "loss": 0.0323, "step": 185 }, { "epoch": 0.22861726661987367, "grad_norm": 0.02288293026305173, "learning_rate": 9.511340957081957e-06, "loss": 0.0307, "step": 190 }, { "epoch": 0.2346335104782914, "grad_norm": 0.022260065504874377, "learning_rate": 9.46501301961723e-06, "loss": 0.0345, "step": 195 }, { "epoch": 0.24064975433670913, "grad_norm": 0.021736470284370146, "learning_rate": 9.416710817922615e-06, "loss": 0.0311, "step": 200 }, { "epoch": 0.24666599819512683, "grad_norm": 0.02550413340783063, "learning_rate": 9.366455709451857e-06, "loss": 0.0329, "step": 205 }, { "epoch": 0.25268224205354456, "grad_norm": 0.02201788645460498, "learning_rate": 9.314269915162115e-06, "loss": 0.0309, "step": 210 }, { "epoch": 0.2586984859119623, "grad_norm": 0.02936137915292986, "learning_rate": 9.260176509688673e-06, "loss": 0.0285, "step": 215 }, { "epoch": 0.26471472977038, "grad_norm": 0.023538798060004095, "learning_rate": 9.204199411142196e-06, "loss": 0.0293, "step": 220 }, { "epoch": 0.2707309736287978, "grad_norm": 0.018492280335574894, "learning_rate": 9.146363370533004e-06, "loss": 0.0281, "step": 225 }, { "epoch": 0.2767472174872155, "grad_norm": 0.023852820186103463, "learning_rate": 9.086693960827106e-06, "loss": 0.028, "step": 230 }, { "epoch": 0.2827634613456332, "grad_norm": 0.021207055501016953, "learning_rate": 9.025217565638766e-06, "loss": 0.0291, "step": 235 }, { "epoch": 0.28877970520405094, "grad_norm": 0.01877956343623577, "learning_rate": 8.961961367564652e-06, "loss": 0.0282, "step": 240 }, { "epoch": 0.29479594906246864, "grad_norm": 0.019669983451369902, "learning_rate": 8.89695333616467e-06, "loss": 0.0259, "step": 245 }, { "epoch": 0.3008121929208864, "grad_norm": 0.026680075575420455, "learning_rate": 8.83022221559489e-06, "loss": 0.0289, "step": 250 }, { "epoch": 0.3068284367793041, "grad_norm": 0.01856152788906122, "learning_rate": 8.761797511897907e-06, "loss": 0.0241, "step": 255 }, { "epoch": 0.31284468063772186, "grad_norm": 0.01860271376372244, "learning_rate": 8.691709479956373e-06, "loss": 0.0272, "step": 260 }, { "epoch": 0.31886092449613956, "grad_norm": 0.019865593703402643, "learning_rate": 8.619989110115398e-06, "loss": 0.0257, "step": 265 }, { "epoch": 0.3248771683545573, "grad_norm": 0.02048991665947615, "learning_rate": 8.546668114479769e-06, "loss": 0.029, "step": 270 }, { "epoch": 0.330893412212975, "grad_norm": 0.017217185351734676, "learning_rate": 8.471778912892008e-06, "loss": 0.0252, "step": 275 }, { "epoch": 0.3369096560713928, "grad_norm": 0.01813603892384963, "learning_rate": 8.395354618597533e-06, "loss": 0.0268, "step": 280 }, { "epoch": 0.3429258999298105, "grad_norm": 0.0238318384115611, "learning_rate": 8.31742902360319e-06, "loss": 0.0282, "step": 285 }, { "epoch": 0.34894214378822824, "grad_norm": 0.019064179482916437, "learning_rate": 8.238036583735673e-06, "loss": 0.0271, "step": 290 }, { "epoch": 0.35495838764664595, "grad_norm": 0.0222590787343455, "learning_rate": 8.157212403406424e-06, "loss": 0.0257, "step": 295 }, { "epoch": 0.36097463150506365, "grad_norm": 0.01848676924848829, "learning_rate": 8.07499222008977e-06, "loss": 0.0251, "step": 300 }, { "epoch": 0.3669908753634814, "grad_norm": 0.01993669886796683, "learning_rate": 7.991412388521108e-06, "loss": 0.0261, "step": 305 }, { "epoch": 0.3730071192218991, "grad_norm": 0.019508670102946216, "learning_rate": 7.906509864622202e-06, "loss": 0.0258, "step": 310 }, { "epoch": 0.37902336308031687, "grad_norm": 0.02181687850379784, "learning_rate": 7.820322189160618e-06, "loss": 0.0219, "step": 315 }, { "epoch": 0.38503960693873457, "grad_norm": 0.020435112312527343, "learning_rate": 7.732887471150589e-06, "loss": 0.0258, "step": 320 }, { "epoch": 0.3910558507971523, "grad_norm": 0.016792623507338397, "learning_rate": 7.644244371002619e-06, "loss": 0.0259, "step": 325 }, { "epoch": 0.39707209465557003, "grad_norm": 0.025243263772643947, "learning_rate": 7.554432083429253e-06, "loss": 0.0239, "step": 330 }, { "epoch": 0.4030883385139878, "grad_norm": 0.017995141016588313, "learning_rate": 7.463490320114646e-06, "loss": 0.023, "step": 335 }, { "epoch": 0.4091045823724055, "grad_norm": 0.016947239802050783, "learning_rate": 7.371459292155501e-06, "loss": 0.0227, "step": 340 }, { "epoch": 0.41512082623082325, "grad_norm": 0.01945334381344522, "learning_rate": 7.278379692281209e-06, "loss": 0.0236, "step": 345 }, { "epoch": 0.42113707008924095, "grad_norm": 0.018112579661825487, "learning_rate": 7.184292676861024e-06, "loss": 0.0262, "step": 350 }, { "epoch": 0.42715331394765865, "grad_norm": 0.01698932526214814, "learning_rate": 7.0892398477062375e-06, "loss": 0.024, "step": 355 }, { "epoch": 0.4331695578060764, "grad_norm": 0.018750963386521588, "learning_rate": 6.99326323367538e-06, "loss": 0.0245, "step": 360 }, { "epoch": 0.4391858016644941, "grad_norm": 0.02081900306974978, "learning_rate": 6.8964052720906175e-06, "loss": 0.026, "step": 365 }, { "epoch": 0.44520204552291187, "grad_norm": 0.02082309821957481, "learning_rate": 6.798708789973527e-06, "loss": 0.0255, "step": 370 }, { "epoch": 0.4512182893813296, "grad_norm": 0.023227812045822693, "learning_rate": 6.700216985108568e-06, "loss": 0.0243, "step": 375 }, { "epoch": 0.45723453323974733, "grad_norm": 0.01688396438835524, "learning_rate": 6.600973406942617e-06, "loss": 0.0235, "step": 380 }, { "epoch": 0.46325077709816503, "grad_norm": 0.020269716613110212, "learning_rate": 6.501021937328992e-06, "loss": 0.0215, "step": 385 }, { "epoch": 0.4692670209565828, "grad_norm": 0.018025367141900818, "learning_rate": 6.4004067711245366e-06, "loss": 0.0221, "step": 390 }, { "epoch": 0.4752832648150005, "grad_norm": 0.020858778838367335, "learning_rate": 6.29917239664826e-06, "loss": 0.0232, "step": 395 }, { "epoch": 0.48129950867341825, "grad_norm": 0.022426616797026122, "learning_rate": 6.1973635760102645e-06, "loss": 0.0232, "step": 400 }, { "epoch": 0.48731575253183596, "grad_norm": 0.02208916714792084, "learning_rate": 6.0950253253195656e-06, "loss": 0.0277, "step": 405 }, { "epoch": 0.49333199639025366, "grad_norm": 0.020289821733336374, "learning_rate": 5.9922028947796495e-06, "loss": 0.0226, "step": 410 }, { "epoch": 0.4993482402486714, "grad_norm": 0.022132901095444316, "learning_rate": 5.888941748680484e-06, "loss": 0.023, "step": 415 }, { "epoch": 0.5053644841070891, "grad_norm": 0.02080443746491033, "learning_rate": 5.785287545295895e-06, "loss": 0.0214, "step": 420 }, { "epoch": 0.5113807279655068, "grad_norm": 0.018676657765971747, "learning_rate": 5.681286116695155e-06, "loss": 0.0225, "step": 425 }, { "epoch": 0.5173969718239246, "grad_norm": 0.020395399948370366, "learning_rate": 5.5769834484777344e-06, "loss": 0.0222, "step": 430 }, { "epoch": 0.5234132156823423, "grad_norm": 0.01859239650955675, "learning_rate": 5.472425659440157e-06, "loss": 0.0205, "step": 435 }, { "epoch": 0.52942945954076, "grad_norm": 0.01868449550665156, "learning_rate": 5.367658981183979e-06, "loss": 0.0232, "step": 440 }, { "epoch": 0.5354457033991777, "grad_norm": 0.015161045404289166, "learning_rate": 5.2627297376738674e-06, "loss": 0.0203, "step": 445 }, { "epoch": 0.5414619472575956, "grad_norm": 0.016845793697294737, "learning_rate": 5.157684324754858e-06, "loss": 0.0211, "step": 450 }, { "epoch": 0.5474781911160133, "grad_norm": 0.0194074719508428, "learning_rate": 5.052569189637813e-06, "loss": 0.0238, "step": 455 }, { "epoch": 0.553494434974431, "grad_norm": 0.01936893905137615, "learning_rate": 4.947430810362188e-06, "loss": 0.0216, "step": 460 }, { "epoch": 0.5595106788328487, "grad_norm": 0.01859222507454881, "learning_rate": 4.842315675245144e-06, "loss": 0.0195, "step": 465 }, { "epoch": 0.5655269226912664, "grad_norm": 0.022318020720072922, "learning_rate": 4.737270262326134e-06, "loss": 0.0214, "step": 470 }, { "epoch": 0.5715431665496842, "grad_norm": 0.01657420129142331, "learning_rate": 4.632341018816023e-06, "loss": 0.0213, "step": 475 }, { "epoch": 0.5775594104081019, "grad_norm": 0.02035658875684946, "learning_rate": 4.527574340559844e-06, "loss": 0.0226, "step": 480 }, { "epoch": 0.5835756542665196, "grad_norm": 0.016236086655299523, "learning_rate": 4.423016551522268e-06, "loss": 0.0203, "step": 485 }, { "epoch": 0.5895918981249373, "grad_norm": 0.017728340188220976, "learning_rate": 4.318713883304846e-06, "loss": 0.0227, "step": 490 }, { "epoch": 0.5956081419833551, "grad_norm": 0.019076149038680715, "learning_rate": 4.214712454704107e-06, "loss": 0.0212, "step": 495 }, { "epoch": 0.6016243858417728, "grad_norm": 0.015355569764402307, "learning_rate": 4.111058251319517e-06, "loss": 0.0201, "step": 500 }, { "epoch": 0.6076406297001905, "grad_norm": 0.018415895096337096, "learning_rate": 4.007797105220352e-06, "loss": 0.0191, "step": 505 }, { "epoch": 0.6136568735586082, "grad_norm": 0.017521505257648343, "learning_rate": 3.904974674680436e-06, "loss": 0.0206, "step": 510 }, { "epoch": 0.619673117417026, "grad_norm": 0.017605011135453034, "learning_rate": 3.802636423989738e-06, "loss": 0.0212, "step": 515 }, { "epoch": 0.6256893612754437, "grad_norm": 0.01785825295985011, "learning_rate": 3.70082760335174e-06, "loss": 0.019, "step": 520 }, { "epoch": 0.6317056051338614, "grad_norm": 0.01652138449465044, "learning_rate": 3.5995932288754655e-06, "loss": 0.0181, "step": 525 }, { "epoch": 0.6377218489922791, "grad_norm": 0.018648556241997245, "learning_rate": 3.4989780626710103e-06, "loss": 0.0205, "step": 530 }, { "epoch": 0.6437380928506968, "grad_norm": 0.01827077862731449, "learning_rate": 3.3990265930573863e-06, "loss": 0.0169, "step": 535 }, { "epoch": 0.6497543367091146, "grad_norm": 0.015591039180393907, "learning_rate": 3.2997830148914316e-06, "loss": 0.0206, "step": 540 }, { "epoch": 0.6557705805675323, "grad_norm": 0.019792404271656995, "learning_rate": 3.2012912100264743e-06, "loss": 0.0202, "step": 545 }, { "epoch": 0.66178682442595, "grad_norm": 0.018533196717200034, "learning_rate": 3.1035947279093846e-06, "loss": 0.021, "step": 550 }, { "epoch": 0.6678030682843678, "grad_norm": 0.019636694754151293, "learning_rate": 3.006736766324623e-06, "loss": 0.0221, "step": 555 }, { "epoch": 0.6738193121427856, "grad_norm": 0.014256034059804634, "learning_rate": 2.9107601522937638e-06, "loss": 0.0202, "step": 560 }, { "epoch": 0.6798355560012033, "grad_norm": 0.016107622432284356, "learning_rate": 2.8157073231389752e-06, "loss": 0.0187, "step": 565 }, { "epoch": 0.685851799859621, "grad_norm": 0.019189045539841333, "learning_rate": 2.721620307718793e-06, "loss": 0.0187, "step": 570 }, { "epoch": 0.6918680437180387, "grad_norm": 0.014474735470360907, "learning_rate": 2.6285407078445015e-06, "loss": 0.021, "step": 575 }, { "epoch": 0.6978842875764565, "grad_norm": 0.019262771622277575, "learning_rate": 2.536509679885355e-06, "loss": 0.0209, "step": 580 }, { "epoch": 0.7039005314348742, "grad_norm": 0.01673939208310001, "learning_rate": 2.4455679165707473e-06, "loss": 0.0205, "step": 585 }, { "epoch": 0.7099167752932919, "grad_norm": 0.014946535263918313, "learning_rate": 2.3557556289973838e-06, "loss": 0.0205, "step": 590 }, { "epoch": 0.7159330191517096, "grad_norm": 0.01596788205464162, "learning_rate": 2.2671125288494123e-06, "loss": 0.0189, "step": 595 }, { "epoch": 0.7219492630101273, "grad_norm": 0.016621734500822712, "learning_rate": 2.1796778108393824e-06, "loss": 0.0198, "step": 600 }, { "epoch": 0.7279655068685451, "grad_norm": 0.018148193058810227, "learning_rate": 2.0934901353777994e-06, "loss": 0.0203, "step": 605 }, { "epoch": 0.7339817507269628, "grad_norm": 0.016498667065988057, "learning_rate": 2.008587611478894e-06, "loss": 0.0193, "step": 610 }, { "epoch": 0.7399979945853805, "grad_norm": 0.017876917503246166, "learning_rate": 1.9250077799102323e-06, "loss": 0.0197, "step": 615 }, { "epoch": 0.7460142384437982, "grad_norm": 0.01737881401071936, "learning_rate": 1.842787596593576e-06, "loss": 0.02, "step": 620 }, { "epoch": 0.752030482302216, "grad_norm": 0.01669399114670848, "learning_rate": 1.761963416264329e-06, "loss": 0.0198, "step": 625 }, { "epoch": 0.7580467261606337, "grad_norm": 0.01963624059543007, "learning_rate": 1.6825709763968112e-06, "loss": 0.0193, "step": 630 }, { "epoch": 0.7640629700190514, "grad_norm": 0.016744844031857532, "learning_rate": 1.6046453814024671e-06, "loss": 0.0194, "step": 635 }, { "epoch": 0.7700792138774691, "grad_norm": 0.015338845138862633, "learning_rate": 1.5282210871079929e-06, "loss": 0.0192, "step": 640 }, { "epoch": 0.7760954577358868, "grad_norm": 0.014792090312693856, "learning_rate": 1.453331885520234e-06, "loss": 0.0164, "step": 645 }, { "epoch": 0.7821117015943047, "grad_norm": 0.01431108236231679, "learning_rate": 1.3800108898846022e-06, "loss": 0.0193, "step": 650 }, { "epoch": 0.7881279454527224, "grad_norm": 0.01517595387399891, "learning_rate": 1.3082905200436291e-06, "loss": 0.0208, "step": 655 }, { "epoch": 0.7941441893111401, "grad_norm": 0.01585759719142669, "learning_rate": 1.2382024881020937e-06, "loss": 0.019, "step": 660 }, { "epoch": 0.8001604331695578, "grad_norm": 0.01823159719823014, "learning_rate": 1.1697777844051105e-06, "loss": 0.0198, "step": 665 }, { "epoch": 0.8061766770279756, "grad_norm": 0.013515790000071682, "learning_rate": 1.1030466638353293e-06, "loss": 0.0187, "step": 670 }, { "epoch": 0.8121929208863933, "grad_norm": 0.016225422506154382, "learning_rate": 1.0380386324353508e-06, "loss": 0.0175, "step": 675 }, { "epoch": 0.818209164744811, "grad_norm": 0.018290959272687618, "learning_rate": 9.74782434361234e-07, "loss": 0.0191, "step": 680 }, { "epoch": 0.8242254086032287, "grad_norm": 0.014122550894005206, "learning_rate": 9.133060391728965e-07, "loss": 0.0186, "step": 685 }, { "epoch": 0.8302416524616465, "grad_norm": 0.01479861703335419, "learning_rate": 8.536366294669979e-07, "loss": 0.017, "step": 690 }, { "epoch": 0.8362578963200642, "grad_norm": 0.01704145841348508, "learning_rate": 7.958005888578063e-07, "loss": 0.0189, "step": 695 }, { "epoch": 0.8422741401784819, "grad_norm": 0.014330474670880548, "learning_rate": 7.398234903113266e-07, "loss": 0.0177, "step": 700 }, { "epoch": 0.8482903840368996, "grad_norm": 0.016122466793252455, "learning_rate": 6.857300848378857e-07, "loss": 0.0186, "step": 705 }, { "epoch": 0.8543066278953173, "grad_norm": 0.016531884356419155, "learning_rate": 6.335442905481442e-07, "loss": 0.0183, "step": 710 }, { "epoch": 0.8603228717537351, "grad_norm": 0.017990075667932776, "learning_rate": 5.832891820773868e-07, "loss": 0.0201, "step": 715 }, { "epoch": 0.8663391156121528, "grad_norm": 0.01787947586996767, "learning_rate": 5.349869803827717e-07, "loss": 0.0168, "step": 720 }, { "epoch": 0.8723553594705705, "grad_norm": 0.016104542603383502, "learning_rate": 4.886590429180426e-07, "loss": 0.0179, "step": 725 }, { "epoch": 0.8783716033289882, "grad_norm": 0.01865954701869269, "learning_rate": 4.443258541900508e-07, "loss": 0.0179, "step": 730 }, { "epoch": 0.884387847187406, "grad_norm": 0.014772814789547404, "learning_rate": 4.020070167012541e-07, "loss": 0.0166, "step": 735 }, { "epoch": 0.8904040910458237, "grad_norm": 0.0189883024943083, "learning_rate": 3.6172124228221914e-07, "loss": 0.0185, "step": 740 }, { "epoch": 0.8964203349042414, "grad_norm": 0.01755215014494978, "learning_rate": 3.23486343817942e-07, "loss": 0.0174, "step": 745 }, { "epoch": 0.9024365787626591, "grad_norm": 0.021255269979833603, "learning_rate": 2.873192273716369e-07, "loss": 0.0198, "step": 750 }, { "epoch": 0.908452822621077, "grad_norm": 0.018993801870159418, "learning_rate": 2.532358847095051e-07, "loss": 0.0181, "step": 755 }, { "epoch": 0.9144690664794947, "grad_norm": 0.01729608956020829, "learning_rate": 2.2125138622976494e-07, "loss": 0.019, "step": 760 }, { "epoch": 0.9204853103379124, "grad_norm": 0.015801337315760878, "learning_rate": 1.9137987429907635e-07, "loss": 0.0168, "step": 765 }, { "epoch": 0.9265015541963301, "grad_norm": 0.016592823438343365, "learning_rate": 1.636345569993042e-07, "loss": 0.0172, "step": 770 }, { "epoch": 0.9325177980547478, "grad_norm": 0.013326130117293207, "learning_rate": 1.3802770228739547e-07, "loss": 0.0162, "step": 775 }, { "epoch": 0.9385340419131656, "grad_norm": 0.017403164818775768, "learning_rate": 1.1457063257093892e-07, "loss": 0.0181, "step": 780 }, { "epoch": 0.9445502857715833, "grad_norm": 0.01717367430718781, "learning_rate": 9.32737197018152e-08, "loss": 0.0193, "step": 785 }, { "epoch": 0.950566529630001, "grad_norm": 0.01932159740421986, "learning_rate": 7.414638039014266e-08, "loss": 0.0189, "step": 790 }, { "epoch": 0.9565827734884187, "grad_norm": 0.014545477047193084, "learning_rate": 5.7197072040557356e-08, "loss": 0.0175, "step": 795 }, { "epoch": 0.9625990173468365, "grad_norm": 0.01471817818854388, "learning_rate": 4.243328901266219e-08, "loss": 0.0184, "step": 800 }, { "epoch": 0.9686152612052542, "grad_norm": 0.016067984603921493, "learning_rate": 2.986155930729484e-08, "loss": 0.0163, "step": 805 }, { "epoch": 0.9746315050636719, "grad_norm": 0.014988902741589531, "learning_rate": 1.9487441680084983e-08, "loss": 0.0174, "step": 810 }, { "epoch": 0.9806477489220896, "grad_norm": 0.016141274215466517, "learning_rate": 1.1315523183581534e-08, "loss": 0.0186, "step": 815 }, { "epoch": 0.9866639927805073, "grad_norm": 0.016494560923804934, "learning_rate": 5.349417139022816e-09, "loss": 0.0186, "step": 820 }, { "epoch": 0.9926802366389251, "grad_norm": 0.019570420674046143, "learning_rate": 1.591761538662362e-09, "loss": 0.0203, "step": 825 }, { "epoch": 0.9986964804973428, "grad_norm": 0.014921909514844484, "learning_rate": 4.4217879344166104e-11, "loss": 0.0162, "step": 830 }, { "epoch": 0.9998997292690264, "step": 831, "total_flos": 4.483822414295204e+18, "train_loss": 0.043213658636630875, "train_runtime": 13633.5754, "train_samples_per_second": 2.926, "train_steps_per_second": 0.061 } ], "logging_steps": 5, "max_steps": 831, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.483822414295204e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }