{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991254154276719, "eval_steps": 500, "global_step": 1071, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004664451052416769, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.8476, "step": 5 }, { "epoch": 0.009328902104833538, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.861, "step": 10 }, { "epoch": 0.013993353157250307, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.8574, "step": 15 }, { "epoch": 0.018657804209667075, "grad_norm": 1.7217303169710205, "learning_rate": 3.7037037037037037e-06, "loss": 0.8586, "step": 20 }, { "epoch": 0.023322255262083844, "grad_norm": 1.0811304401552388, "learning_rate": 8.333333333333334e-06, "loss": 0.8027, "step": 25 }, { "epoch": 0.027986706314500613, "grad_norm": 0.6678730431826253, "learning_rate": 1.2962962962962962e-05, "loss": 0.7386, "step": 30 }, { "epoch": 0.03265115736691738, "grad_norm": 0.49414971424324633, "learning_rate": 1.7592592592592595e-05, "loss": 0.6911, "step": 35 }, { "epoch": 0.03731560841933415, "grad_norm": 0.4956290038506038, "learning_rate": 2.2222222222222223e-05, "loss": 0.6738, "step": 40 }, { "epoch": 0.041980059471750916, "grad_norm": 0.45113968047451636, "learning_rate": 2.6851851851851855e-05, "loss": 0.6615, "step": 45 }, { "epoch": 0.04664451052416769, "grad_norm": 0.4151578102626258, "learning_rate": 3.148148148148148e-05, "loss": 0.637, "step": 50 }, { "epoch": 0.051308961576584454, "grad_norm": 0.41598749293660114, "learning_rate": 3.611111111111111e-05, "loss": 0.6521, "step": 55 }, { "epoch": 0.055973412629001226, "grad_norm": 0.44364992697109346, "learning_rate": 4.074074074074074e-05, "loss": 0.6416, "step": 60 }, { "epoch": 0.06063786368141799, "grad_norm": 0.4334859115222668, "learning_rate": 4.5370370370370374e-05, "loss": 0.6179, "step": 65 }, { "epoch": 0.06530231473383476, "grad_norm": 0.4168421218480663, "learning_rate": 5e-05, "loss": 0.6455, "step": 70 }, { "epoch": 0.06996676578625154, "grad_norm": 0.5598377478471296, "learning_rate": 4.999731625198103e-05, "loss": 0.6218, "step": 75 }, { "epoch": 0.0746312168386683, "grad_norm": 0.5562791478235322, "learning_rate": 4.998926564814665e-05, "loss": 0.6164, "step": 80 }, { "epoch": 0.07929566789108507, "grad_norm": 0.494836009809662, "learning_rate": 4.997585010901171e-05, "loss": 0.6083, "step": 85 }, { "epoch": 0.08396011894350183, "grad_norm": 0.47761536307054875, "learning_rate": 4.995707283492525e-05, "loss": 0.597, "step": 90 }, { "epoch": 0.08862456999591861, "grad_norm": 0.5012276873640602, "learning_rate": 4.9932938305306997e-05, "loss": 0.5986, "step": 95 }, { "epoch": 0.09328902104833538, "grad_norm": 0.558780335812584, "learning_rate": 4.990345227757883e-05, "loss": 0.6108, "step": 100 }, { "epoch": 0.09795347210075214, "grad_norm": 0.5027530843851625, "learning_rate": 4.986862178579129e-05, "loss": 0.616, "step": 105 }, { "epoch": 0.10261792315316891, "grad_norm": 0.5276174777450547, "learning_rate": 4.982845513894555e-05, "loss": 0.6028, "step": 110 }, { "epoch": 0.10728237420558567, "grad_norm": 0.48105561141886855, "learning_rate": 4.9782961919011294e-05, "loss": 0.5926, "step": 115 }, { "epoch": 0.11194682525800245, "grad_norm": 0.5884754531701646, "learning_rate": 4.973215297864088e-05, "loss": 0.5997, "step": 120 }, { "epoch": 0.11661127631041922, "grad_norm": 0.5038947888532336, "learning_rate": 4.967604043858034e-05, "loss": 0.5966, "step": 125 }, { "epoch": 0.12127572736283598, "grad_norm": 0.4628854625972218, "learning_rate": 4.9614637684777964e-05, "loss": 0.5889, "step": 130 }, { "epoch": 0.12594017841525276, "grad_norm": 0.369569834468325, "learning_rate": 4.954795936519099e-05, "loss": 0.5886, "step": 135 }, { "epoch": 0.13060462946766951, "grad_norm": 0.510224312267307, "learning_rate": 4.9476021386291255e-05, "loss": 0.5904, "step": 140 }, { "epoch": 0.1352690805200863, "grad_norm": 0.5165955737252187, "learning_rate": 4.93988409092706e-05, "loss": 0.5931, "step": 145 }, { "epoch": 0.13993353157250307, "grad_norm": 0.45509182070489973, "learning_rate": 4.931643634594701e-05, "loss": 0.5764, "step": 150 }, { "epoch": 0.14459798262491982, "grad_norm": 0.41968982538759186, "learning_rate": 4.9228827354372374e-05, "loss": 0.5809, "step": 155 }, { "epoch": 0.1492624336773366, "grad_norm": 0.41229365733365814, "learning_rate": 4.9136034834142906e-05, "loss": 0.5886, "step": 160 }, { "epoch": 0.15392688472975335, "grad_norm": 0.3826932795909075, "learning_rate": 4.9038080921413485e-05, "loss": 0.5842, "step": 165 }, { "epoch": 0.15859133578217013, "grad_norm": 0.41134892045101074, "learning_rate": 4.893498898361693e-05, "loss": 0.5946, "step": 170 }, { "epoch": 0.1632557868345869, "grad_norm": 0.38992267173979334, "learning_rate": 4.882678361388958e-05, "loss": 0.5885, "step": 175 }, { "epoch": 0.16792023788700366, "grad_norm": 0.40073952235248766, "learning_rate": 4.871349062520448e-05, "loss": 0.5841, "step": 180 }, { "epoch": 0.17258468893942044, "grad_norm": 0.4200184545679776, "learning_rate": 4.859513704421353e-05, "loss": 0.5771, "step": 185 }, { "epoch": 0.17724913999183722, "grad_norm": 0.4292182704616412, "learning_rate": 4.847175110480015e-05, "loss": 0.5708, "step": 190 }, { "epoch": 0.18191359104425397, "grad_norm": 0.450661348267434, "learning_rate": 4.8343362241343944e-05, "loss": 0.5708, "step": 195 }, { "epoch": 0.18657804209667075, "grad_norm": 0.3869287419256418, "learning_rate": 4.8210001081698954e-05, "loss": 0.5708, "step": 200 }, { "epoch": 0.1912424931490875, "grad_norm": 0.42553333511533853, "learning_rate": 4.8071699439887215e-05, "loss": 0.5845, "step": 205 }, { "epoch": 0.19590694420150429, "grad_norm": 0.4146813048215204, "learning_rate": 4.792849030850938e-05, "loss": 0.5762, "step": 210 }, { "epoch": 0.20057139525392106, "grad_norm": 0.5810537942195573, "learning_rate": 4.778040785087412e-05, "loss": 0.5736, "step": 215 }, { "epoch": 0.20523584630633782, "grad_norm": 0.4540901088901895, "learning_rate": 4.762748739284831e-05, "loss": 0.5695, "step": 220 }, { "epoch": 0.2099002973587546, "grad_norm": 0.39778098326448963, "learning_rate": 4.746976541442986e-05, "loss": 0.562, "step": 225 }, { "epoch": 0.21456474841117135, "grad_norm": 0.45060914716273687, "learning_rate": 4.730727954104515e-05, "loss": 0.5708, "step": 230 }, { "epoch": 0.21922919946358813, "grad_norm": 0.40075705306410814, "learning_rate": 4.714006853457339e-05, "loss": 0.5779, "step": 235 }, { "epoch": 0.2238936505160049, "grad_norm": 0.3988838422959827, "learning_rate": 4.6968172284099654e-05, "loss": 0.5769, "step": 240 }, { "epoch": 0.22855810156842166, "grad_norm": 0.40730912286641724, "learning_rate": 4.679163179639923e-05, "loss": 0.5622, "step": 245 }, { "epoch": 0.23322255262083844, "grad_norm": 0.41570250249981516, "learning_rate": 4.661048918615513e-05, "loss": 0.5626, "step": 250 }, { "epoch": 0.23788700367325522, "grad_norm": 0.37266061168544923, "learning_rate": 4.642478766591148e-05, "loss": 0.5603, "step": 255 }, { "epoch": 0.24255145472567197, "grad_norm": 0.41204417027814066, "learning_rate": 4.623457153576493e-05, "loss": 0.5692, "step": 260 }, { "epoch": 0.24721590577808875, "grad_norm": 0.3517500791021223, "learning_rate": 4.603988617279655e-05, "loss": 0.5575, "step": 265 }, { "epoch": 0.2518803568305055, "grad_norm": 0.3626203299511348, "learning_rate": 4.5840778020247025e-05, "loss": 0.5561, "step": 270 }, { "epoch": 0.2565448078829223, "grad_norm": 0.4323865340964901, "learning_rate": 4.56372945764372e-05, "loss": 0.5605, "step": 275 }, { "epoch": 0.26120925893533903, "grad_norm": 0.4558769125407336, "learning_rate": 4.542948438343725e-05, "loss": 0.564, "step": 280 }, { "epoch": 0.26587370998775584, "grad_norm": 0.3788810919577419, "learning_rate": 4.521739701548662e-05, "loss": 0.5689, "step": 285 }, { "epoch": 0.2705381610401726, "grad_norm": 0.3494299182914199, "learning_rate": 4.500108306716784e-05, "loss": 0.5532, "step": 290 }, { "epoch": 0.27520261209258934, "grad_norm": 0.3243682374762204, "learning_rate": 4.478059414133695e-05, "loss": 0.5472, "step": 295 }, { "epoch": 0.27986706314500615, "grad_norm": 0.3774629263697534, "learning_rate": 4.455598283681331e-05, "loss": 0.5631, "step": 300 }, { "epoch": 0.2845315141974229, "grad_norm": 0.3590842486979046, "learning_rate": 4.43273027358319e-05, "loss": 0.5473, "step": 305 }, { "epoch": 0.28919596524983965, "grad_norm": 0.33869526982866255, "learning_rate": 4.4094608391260996e-05, "loss": 0.5556, "step": 310 }, { "epoch": 0.29386041630225646, "grad_norm": 0.3740834437948184, "learning_rate": 4.3857955313588256e-05, "loss": 0.5586, "step": 315 }, { "epoch": 0.2985248673546732, "grad_norm": 0.4069053631763161, "learning_rate": 4.3617399957678426e-05, "loss": 0.5699, "step": 320 }, { "epoch": 0.30318931840708996, "grad_norm": 0.42775595832918634, "learning_rate": 4.3372999709305726e-05, "loss": 0.5541, "step": 325 }, { "epoch": 0.3078537694595067, "grad_norm": 0.4225047260621452, "learning_rate": 4.3124812871464146e-05, "loss": 0.5612, "step": 330 }, { "epoch": 0.3125182205119235, "grad_norm": 0.4064532119864525, "learning_rate": 4.287289865045895e-05, "loss": 0.5566, "step": 335 }, { "epoch": 0.31718267156434027, "grad_norm": 0.40813784397443764, "learning_rate": 4.261731714178274e-05, "loss": 0.5427, "step": 340 }, { "epoch": 0.321847122616757, "grad_norm": 0.37505518830201817, "learning_rate": 4.23581293157793e-05, "loss": 0.5573, "step": 345 }, { "epoch": 0.3265115736691738, "grad_norm": 0.40656917887904453, "learning_rate": 4.20953970030988e-05, "loss": 0.5531, "step": 350 }, { "epoch": 0.3311760247215906, "grad_norm": 0.39600365804673093, "learning_rate": 4.182918287994781e-05, "loss": 0.5475, "step": 355 }, { "epoch": 0.33584047577400733, "grad_norm": 0.3703296359847103, "learning_rate": 4.155955045313748e-05, "loss": 0.5651, "step": 360 }, { "epoch": 0.34050492682642414, "grad_norm": 0.3666872032980013, "learning_rate": 4.128656404493371e-05, "loss": 0.559, "step": 365 }, { "epoch": 0.3451693778788409, "grad_norm": 0.33550827998787563, "learning_rate": 4.10102887777127e-05, "loss": 0.5609, "step": 370 }, { "epoch": 0.34983382893125764, "grad_norm": 0.3730387291098042, "learning_rate": 4.073079055842566e-05, "loss": 0.5485, "step": 375 }, { "epoch": 0.35449827998367445, "grad_norm": 0.3970093889085142, "learning_rate": 4.044813606287634e-05, "loss": 0.5475, "step": 380 }, { "epoch": 0.3591627310360912, "grad_norm": 0.32485469708579506, "learning_rate": 4.016239271981519e-05, "loss": 0.5461, "step": 385 }, { "epoch": 0.36382718208850795, "grad_norm": 0.3382837402395568, "learning_rate": 3.987362869485384e-05, "loss": 0.5496, "step": 390 }, { "epoch": 0.3684916331409247, "grad_norm": 0.3609893182037058, "learning_rate": 3.9581912874203854e-05, "loss": 0.5561, "step": 395 }, { "epoch": 0.3731560841933415, "grad_norm": 0.35168703173333654, "learning_rate": 3.928731484824351e-05, "loss": 0.5479, "step": 400 }, { "epoch": 0.37782053524575826, "grad_norm": 0.3629018488370036, "learning_rate": 3.898990489491668e-05, "loss": 0.5501, "step": 405 }, { "epoch": 0.382484986298175, "grad_norm": 0.37390402781019416, "learning_rate": 3.8689753962967636e-05, "loss": 0.5354, "step": 410 }, { "epoch": 0.3871494373505918, "grad_norm": 0.335380363177113, "learning_rate": 3.838693365501586e-05, "loss": 0.5469, "step": 415 }, { "epoch": 0.39181388840300857, "grad_norm": 0.3424832891358978, "learning_rate": 3.80815162104748e-05, "loss": 0.552, "step": 420 }, { "epoch": 0.3964783394554253, "grad_norm": 0.37221668230659216, "learning_rate": 3.7773574488318854e-05, "loss": 0.5553, "step": 425 }, { "epoch": 0.40114279050784213, "grad_norm": 0.37074728239538524, "learning_rate": 3.746318194970239e-05, "loss": 0.5523, "step": 430 }, { "epoch": 0.4058072415602589, "grad_norm": 0.38839870040744245, "learning_rate": 3.715041264043525e-05, "loss": 0.556, "step": 435 }, { "epoch": 0.41047169261267563, "grad_norm": 0.3575823882886992, "learning_rate": 3.683534117331869e-05, "loss": 0.5485, "step": 440 }, { "epoch": 0.41513614366509244, "grad_norm": 0.3161641377206891, "learning_rate": 3.65180427103461e-05, "loss": 0.5467, "step": 445 }, { "epoch": 0.4198005947175092, "grad_norm": 0.3703781300095459, "learning_rate": 3.619859294477273e-05, "loss": 0.5492, "step": 450 }, { "epoch": 0.42446504576992594, "grad_norm": 0.3492760299630012, "learning_rate": 3.587706808305861e-05, "loss": 0.5421, "step": 455 }, { "epoch": 0.4291294968223427, "grad_norm": 0.3267136941264168, "learning_rate": 3.5553544826689145e-05, "loss": 0.538, "step": 460 }, { "epoch": 0.4337939478747595, "grad_norm": 0.29422503748430795, "learning_rate": 3.522810035387752e-05, "loss": 0.5288, "step": 465 }, { "epoch": 0.43845839892717625, "grad_norm": 0.34737447664291465, "learning_rate": 3.490081230115343e-05, "loss": 0.5525, "step": 470 }, { "epoch": 0.443122849979593, "grad_norm": 0.3404046261795078, "learning_rate": 3.4571758744842507e-05, "loss": 0.5522, "step": 475 }, { "epoch": 0.4477873010320098, "grad_norm": 0.36447616317715154, "learning_rate": 3.4241018182440735e-05, "loss": 0.5441, "step": 480 }, { "epoch": 0.45245175208442656, "grad_norm": 0.31157047733964094, "learning_rate": 3.390866951388847e-05, "loss": 0.5391, "step": 485 }, { "epoch": 0.4571162031368433, "grad_norm": 0.3116582428424639, "learning_rate": 3.3574792022748466e-05, "loss": 0.5535, "step": 490 }, { "epoch": 0.4617806541892601, "grad_norm": 0.3279027907069977, "learning_rate": 3.3239465357292304e-05, "loss": 0.5376, "step": 495 }, { "epoch": 0.46644510524167687, "grad_norm": 0.3484015010403029, "learning_rate": 3.290276951149992e-05, "loss": 0.5336, "step": 500 }, { "epoch": 0.4711095562940936, "grad_norm": 0.33469705746842926, "learning_rate": 3.256478480597656e-05, "loss": 0.546, "step": 505 }, { "epoch": 0.47577400734651043, "grad_norm": 0.30392678850247695, "learning_rate": 3.222559186879191e-05, "loss": 0.5407, "step": 510 }, { "epoch": 0.4804384583989272, "grad_norm": 0.29705201467826636, "learning_rate": 3.18852716162458e-05, "loss": 0.5395, "step": 515 }, { "epoch": 0.48510290945134393, "grad_norm": 0.31309471816872475, "learning_rate": 3.154390523356523e-05, "loss": 0.5309, "step": 520 }, { "epoch": 0.48976736050376074, "grad_norm": 0.3532094930450357, "learning_rate": 3.1201574155537155e-05, "loss": 0.5341, "step": 525 }, { "epoch": 0.4944318115561775, "grad_norm": 0.3130231914781531, "learning_rate": 3.085836004708179e-05, "loss": 0.5243, "step": 530 }, { "epoch": 0.49909626260859424, "grad_norm": 0.32746390804605524, "learning_rate": 3.0514344783771015e-05, "loss": 0.5458, "step": 535 }, { "epoch": 0.503760713661011, "grad_norm": 0.3140774516982654, "learning_rate": 3.0169610432296513e-05, "loss": 0.542, "step": 540 }, { "epoch": 0.5084251647134278, "grad_norm": 0.33623222677088005, "learning_rate": 2.9824239230892316e-05, "loss": 0.5449, "step": 545 }, { "epoch": 0.5130896157658446, "grad_norm": 0.34034229987032566, "learning_rate": 2.9478313569716425e-05, "loss": 0.5399, "step": 550 }, { "epoch": 0.5177540668182613, "grad_norm": 0.31316724089217596, "learning_rate": 2.9131915971196216e-05, "loss": 0.538, "step": 555 }, { "epoch": 0.5224185178706781, "grad_norm": 0.3088718959081153, "learning_rate": 2.8785129070342247e-05, "loss": 0.5415, "step": 560 }, { "epoch": 0.5270829689230948, "grad_norm": 0.34309616279067395, "learning_rate": 2.8438035595035235e-05, "loss": 0.5323, "step": 565 }, { "epoch": 0.5317474199755117, "grad_norm": 0.3160243240149518, "learning_rate": 2.8090718346290902e-05, "loss": 0.5227, "step": 570 }, { "epoch": 0.5364118710279284, "grad_norm": 0.30318761428213864, "learning_rate": 2.77432601785073e-05, "loss": 0.538, "step": 575 }, { "epoch": 0.5410763220803452, "grad_norm": 0.33554969124573514, "learning_rate": 2.7395743979699527e-05, "loss": 0.5423, "step": 580 }, { "epoch": 0.5457407731327619, "grad_norm": 0.29358246176347985, "learning_rate": 2.7048252651726237e-05, "loss": 0.5302, "step": 585 }, { "epoch": 0.5504052241851787, "grad_norm": 0.27227963891831986, "learning_rate": 2.6700869090513025e-05, "loss": 0.5276, "step": 590 }, { "epoch": 0.5550696752375954, "grad_norm": 0.3315788476523275, "learning_rate": 2.6353676166277175e-05, "loss": 0.5375, "step": 595 }, { "epoch": 0.5597341262900123, "grad_norm": 0.29167775719810907, "learning_rate": 2.6006756703758462e-05, "loss": 0.5287, "step": 600 }, { "epoch": 0.564398577342429, "grad_norm": 0.2836903485895201, "learning_rate": 2.5660193462460914e-05, "loss": 0.5209, "step": 605 }, { "epoch": 0.5690630283948458, "grad_norm": 0.312902944542489, "learning_rate": 2.5314069116910073e-05, "loss": 0.5352, "step": 610 }, { "epoch": 0.5737274794472625, "grad_norm": 0.3061911987604843, "learning_rate": 2.496846623693052e-05, "loss": 0.5418, "step": 615 }, { "epoch": 0.5783919304996793, "grad_norm": 0.30918289084780787, "learning_rate": 2.4623467267948453e-05, "loss": 0.5264, "step": 620 }, { "epoch": 0.583056381552096, "grad_norm": 0.29432202760005055, "learning_rate": 2.427915451132382e-05, "loss": 0.5303, "step": 625 }, { "epoch": 0.5877208326045129, "grad_norm": 0.30248260299992763, "learning_rate": 2.3935610104716934e-05, "loss": 0.5318, "step": 630 }, { "epoch": 0.5923852836569297, "grad_norm": 0.2921703613190239, "learning_rate": 2.359291600249407e-05, "loss": 0.5351, "step": 635 }, { "epoch": 0.5970497347093464, "grad_norm": 0.2861561331468405, "learning_rate": 2.325115395617683e-05, "loss": 0.5191, "step": 640 }, { "epoch": 0.6017141857617632, "grad_norm": 0.29199262406988685, "learning_rate": 2.291040549493985e-05, "loss": 0.5314, "step": 645 }, { "epoch": 0.6063786368141799, "grad_norm": 0.28022740610521435, "learning_rate": 2.2570751906161624e-05, "loss": 0.5189, "step": 650 }, { "epoch": 0.6110430878665967, "grad_norm": 0.32229685451288487, "learning_rate": 2.223227421603289e-05, "loss": 0.5277, "step": 655 }, { "epoch": 0.6157075389190134, "grad_norm": 0.2812720351153964, "learning_rate": 2.1895053170227464e-05, "loss": 0.5315, "step": 660 }, { "epoch": 0.6203719899714303, "grad_norm": 0.2888622086540738, "learning_rate": 2.1559169214639884e-05, "loss": 0.5277, "step": 665 }, { "epoch": 0.625036441023847, "grad_norm": 0.3120048352568711, "learning_rate": 2.122470247619464e-05, "loss": 0.525, "step": 670 }, { "epoch": 0.6297008920762638, "grad_norm": 0.2818693243090786, "learning_rate": 2.0891732743731434e-05, "loss": 0.5322, "step": 675 }, { "epoch": 0.6343653431286805, "grad_norm": 0.2926059214892839, "learning_rate": 2.0560339448971146e-05, "loss": 0.525, "step": 680 }, { "epoch": 0.6390297941810973, "grad_norm": 0.29268503655464984, "learning_rate": 2.0230601647566966e-05, "loss": 0.5247, "step": 685 }, { "epoch": 0.643694245233514, "grad_norm": 0.2728644227058383, "learning_rate": 1.9902598000245222e-05, "loss": 0.5185, "step": 690 }, { "epoch": 0.6483586962859309, "grad_norm": 0.25962809295866146, "learning_rate": 1.9576406754040467e-05, "loss": 0.5228, "step": 695 }, { "epoch": 0.6530231473383477, "grad_norm": 0.2939794591562482, "learning_rate": 1.925210572362922e-05, "loss": 0.5402, "step": 700 }, { "epoch": 0.6576875983907644, "grad_norm": 0.2675031379318846, "learning_rate": 1.892977227276685e-05, "loss": 0.5224, "step": 705 }, { "epoch": 0.6623520494431812, "grad_norm": 0.33077413707192227, "learning_rate": 1.8609483295832036e-05, "loss": 0.5145, "step": 710 }, { "epoch": 0.6670165004955979, "grad_norm": 0.28316621823190463, "learning_rate": 1.829131519948323e-05, "loss": 0.518, "step": 715 }, { "epoch": 0.6716809515480147, "grad_norm": 0.28091675149417483, "learning_rate": 1.7975343884431357e-05, "loss": 0.5204, "step": 720 }, { "epoch": 0.6763454026004314, "grad_norm": 0.30963978843403667, "learning_rate": 1.7661644727333403e-05, "loss": 0.524, "step": 725 }, { "epoch": 0.6810098536528483, "grad_norm": 0.27818187692784135, "learning_rate": 1.7350292562810832e-05, "loss": 0.5202, "step": 730 }, { "epoch": 0.685674304705265, "grad_norm": 0.28024391562302353, "learning_rate": 1.704136166559737e-05, "loss": 0.5216, "step": 735 }, { "epoch": 0.6903387557576818, "grad_norm": 0.25776425960390403, "learning_rate": 1.6734925732820454e-05, "loss": 0.5201, "step": 740 }, { "epoch": 0.6950032068100985, "grad_norm": 0.25912854344445496, "learning_rate": 1.6431057866420313e-05, "loss": 0.5154, "step": 745 }, { "epoch": 0.6996676578625153, "grad_norm": 0.288843246635576, "learning_rate": 1.61298305557111e-05, "loss": 0.5319, "step": 750 }, { "epoch": 0.704332108914932, "grad_norm": 0.27843803013272117, "learning_rate": 1.583131566008825e-05, "loss": 0.5322, "step": 755 }, { "epoch": 0.7089965599673489, "grad_norm": 0.2724850636300469, "learning_rate": 1.553558439188594e-05, "loss": 0.5188, "step": 760 }, { "epoch": 0.7136610110197656, "grad_norm": 0.27405031166570964, "learning_rate": 1.5242707299389086e-05, "loss": 0.5264, "step": 765 }, { "epoch": 0.7183254620721824, "grad_norm": 0.28279666997673636, "learning_rate": 1.4952754250003637e-05, "loss": 0.5296, "step": 770 }, { "epoch": 0.7229899131245991, "grad_norm": 0.276837438024271, "learning_rate": 1.4665794413589298e-05, "loss": 0.5364, "step": 775 }, { "epoch": 0.7276543641770159, "grad_norm": 0.2724777598286785, "learning_rate": 1.4381896245958752e-05, "loss": 0.5158, "step": 780 }, { "epoch": 0.7323188152294327, "grad_norm": 0.2649273346430747, "learning_rate": 1.4101127472547084e-05, "loss": 0.519, "step": 785 }, { "epoch": 0.7369832662818494, "grad_norm": 0.2699794671741458, "learning_rate": 1.3823555072255606e-05, "loss": 0.5276, "step": 790 }, { "epoch": 0.7416477173342663, "grad_norm": 0.2597781223574146, "learning_rate": 1.354924526147357e-05, "loss": 0.5158, "step": 795 }, { "epoch": 0.746312168386683, "grad_norm": 0.28723301584203204, "learning_rate": 1.3278263478281994e-05, "loss": 0.5224, "step": 800 }, { "epoch": 0.7509766194390998, "grad_norm": 0.2535306464938602, "learning_rate": 1.3010674366843001e-05, "loss": 0.5361, "step": 805 }, { "epoch": 0.7556410704915165, "grad_norm": 0.27848610430564497, "learning_rate": 1.2746541761978592e-05, "loss": 0.5148, "step": 810 }, { "epoch": 0.7603055215439333, "grad_norm": 0.26508085752140736, "learning_rate": 1.2485928673942568e-05, "loss": 0.5197, "step": 815 }, { "epoch": 0.76496997259635, "grad_norm": 0.2765576189082103, "learning_rate": 1.2228897273389022e-05, "loss": 0.5043, "step": 820 }, { "epoch": 0.7696344236487669, "grad_norm": 0.25179624599466677, "learning_rate": 1.1975508876541262e-05, "loss": 0.5252, "step": 825 }, { "epoch": 0.7742988747011836, "grad_norm": 0.2597455495271655, "learning_rate": 1.1725823930564436e-05, "loss": 0.5292, "step": 830 }, { "epoch": 0.7789633257536004, "grad_norm": 0.2525298953468885, "learning_rate": 1.1479901999145583e-05, "loss": 0.5194, "step": 835 }, { "epoch": 0.7836277768060171, "grad_norm": 0.26668419176346814, "learning_rate": 1.1237801748284375e-05, "loss": 0.5183, "step": 840 }, { "epoch": 0.7882922278584339, "grad_norm": 0.2576707752493972, "learning_rate": 1.099958093229802e-05, "loss": 0.5145, "step": 845 }, { "epoch": 0.7929566789108506, "grad_norm": 0.2519900873666899, "learning_rate": 1.0765296380043684e-05, "loss": 0.52, "step": 850 }, { "epoch": 0.7976211299632674, "grad_norm": 0.25029147728099826, "learning_rate": 1.0535003981361613e-05, "loss": 0.5085, "step": 855 }, { "epoch": 0.8022855810156843, "grad_norm": 0.25974689320051614, "learning_rate": 1.030875867374238e-05, "loss": 0.5276, "step": 860 }, { "epoch": 0.806950032068101, "grad_norm": 0.2750183412160561, "learning_rate": 1.008661442922118e-05, "loss": 0.5288, "step": 865 }, { "epoch": 0.8116144831205178, "grad_norm": 0.24684508884873066, "learning_rate": 9.868624241502573e-06, "loss": 0.5108, "step": 870 }, { "epoch": 0.8162789341729345, "grad_norm": 0.24917506535926326, "learning_rate": 9.654840113318506e-06, "loss": 0.5153, "step": 875 }, { "epoch": 0.8209433852253513, "grad_norm": 0.26588122762475325, "learning_rate": 9.445313044022797e-06, "loss": 0.5134, "step": 880 }, { "epoch": 0.825607836277768, "grad_norm": 0.25205075163043134, "learning_rate": 9.240093017424978e-06, "loss": 0.5234, "step": 885 }, { "epoch": 0.8302722873301849, "grad_norm": 0.24453056572204862, "learning_rate": 9.039228989866358e-06, "loss": 0.5256, "step": 890 }, { "epoch": 0.8349367383826016, "grad_norm": 0.25964646851376255, "learning_rate": 8.84276887854126e-06, "loss": 0.5197, "step": 895 }, { "epoch": 0.8396011894350184, "grad_norm": 0.2380789074605979, "learning_rate": 8.650759550066084e-06, "loss": 0.5226, "step": 900 }, { "epoch": 0.8442656404874351, "grad_norm": 0.25537829061619166, "learning_rate": 8.46324680929905e-06, "loss": 0.5109, "step": 905 }, { "epoch": 0.8489300915398519, "grad_norm": 0.24326625950869546, "learning_rate": 8.280275388413186e-06, "loss": 0.5171, "step": 910 }, { "epoch": 0.8535945425922686, "grad_norm": 0.283956175304338, "learning_rate": 8.10188893622523e-06, "loss": 0.5138, "step": 915 }, { "epoch": 0.8582589936446854, "grad_norm": 0.26712492993115533, "learning_rate": 7.928130007782977e-06, "loss": 0.5347, "step": 920 }, { "epoch": 0.8629234446971022, "grad_norm": 0.2770179693009444, "learning_rate": 7.759040054213531e-06, "loss": 0.5332, "step": 925 }, { "epoch": 0.867587895749519, "grad_norm": 0.25471359044347464, "learning_rate": 7.59465941283494e-06, "loss": 0.5245, "step": 930 }, { "epoch": 0.8722523468019358, "grad_norm": 0.2474269162576819, "learning_rate": 7.435027297533474e-06, "loss": 0.5122, "step": 935 }, { "epoch": 0.8769167978543525, "grad_norm": 0.25130162397546496, "learning_rate": 7.2801817894089756e-06, "loss": 0.5063, "step": 940 }, { "epoch": 0.8815812489067693, "grad_norm": 0.24881956227908078, "learning_rate": 7.130159827690404e-06, "loss": 0.5177, "step": 945 }, { "epoch": 0.886245699959186, "grad_norm": 0.25545295509666294, "learning_rate": 6.98499720092375e-06, "loss": 0.508, "step": 950 }, { "epoch": 0.8909101510116029, "grad_norm": 0.24814116495940802, "learning_rate": 6.844728538434536e-06, "loss": 0.5156, "step": 955 }, { "epoch": 0.8955746020640196, "grad_norm": 0.24345586049308032, "learning_rate": 6.709387302066758e-06, "loss": 0.5192, "step": 960 }, { "epoch": 0.9002390531164364, "grad_norm": 0.25035521575621933, "learning_rate": 6.579005778200434e-06, "loss": 0.5253, "step": 965 }, { "epoch": 0.9049035041688531, "grad_norm": 0.2580394858408089, "learning_rate": 6.453615070049447e-06, "loss": 0.5251, "step": 970 }, { "epoch": 0.9095679552212699, "grad_norm": 0.23956216998636262, "learning_rate": 6.3332450902417666e-06, "loss": 0.5107, "step": 975 }, { "epoch": 0.9142324062736866, "grad_norm": 0.23764348877193034, "learning_rate": 6.21792455368361e-06, "loss": 0.5129, "step": 980 }, { "epoch": 0.9188968573261035, "grad_norm": 0.2520198832874535, "learning_rate": 6.1076809707093225e-06, "loss": 0.5208, "step": 985 }, { "epoch": 0.9235613083785202, "grad_norm": 0.23664963056881916, "learning_rate": 6.002540640518684e-06, "loss": 0.5066, "step": 990 }, { "epoch": 0.928225759430937, "grad_norm": 0.3088138680489082, "learning_rate": 5.902528644903055e-06, "loss": 0.5153, "step": 995 }, { "epoch": 0.9328902104833537, "grad_norm": 0.24722065847341207, "learning_rate": 5.807668842262004e-06, "loss": 0.5169, "step": 1000 }, { "epoch": 0.9375546615357705, "grad_norm": 0.2351634623111495, "learning_rate": 5.7179838619117394e-06, "loss": 0.5097, "step": 1005 }, { "epoch": 0.9422191125881872, "grad_norm": 0.2422992129098815, "learning_rate": 5.633495098686789e-06, "loss": 0.5119, "step": 1010 }, { "epoch": 0.946883563640604, "grad_norm": 0.2550904076289393, "learning_rate": 5.554222707836121e-06, "loss": 0.5147, "step": 1015 }, { "epoch": 0.9515480146930209, "grad_norm": 0.25885529407119634, "learning_rate": 5.480185600215012e-06, "loss": 0.5079, "step": 1020 }, { "epoch": 0.9562124657454376, "grad_norm": 0.23792385142576641, "learning_rate": 5.411401437773773e-06, "loss": 0.5143, "step": 1025 }, { "epoch": 0.9608769167978544, "grad_norm": 0.24312465281652115, "learning_rate": 5.347886629344369e-06, "loss": 0.5107, "step": 1030 }, { "epoch": 0.9655413678502711, "grad_norm": 0.23595847074173298, "learning_rate": 5.28965632672603e-06, "loss": 0.5187, "step": 1035 }, { "epoch": 0.9702058189026879, "grad_norm": 0.2479378393066615, "learning_rate": 5.236724421070693e-06, "loss": 0.5144, "step": 1040 }, { "epoch": 0.9748702699551046, "grad_norm": 0.24722093987308796, "learning_rate": 5.189103539569195e-06, "loss": 0.5215, "step": 1045 }, { "epoch": 0.9795347210075215, "grad_norm": 0.23704918121696106, "learning_rate": 5.146805042438997e-06, "loss": 0.516, "step": 1050 }, { "epoch": 0.9841991720599382, "grad_norm": 0.2499713377733996, "learning_rate": 5.10983902021413e-06, "loss": 0.5086, "step": 1055 }, { "epoch": 0.988863623112355, "grad_norm": 0.24126794921726105, "learning_rate": 5.078214291338054e-06, "loss": 0.5263, "step": 1060 }, { "epoch": 0.9935280741647717, "grad_norm": 0.2575581210389819, "learning_rate": 5.051938400059965e-06, "loss": 0.5173, "step": 1065 }, { "epoch": 0.9981925252171885, "grad_norm": 0.24967221419990557, "learning_rate": 5.031017614635082e-06, "loss": 0.5177, "step": 1070 }, { "epoch": 0.9991254154276719, "step": 1071, "total_flos": 486995857768448.0, "train_loss": 0.5541811234532507, "train_runtime": 156721.9484, "train_samples_per_second": 0.219, "train_steps_per_second": 0.007 } ], "logging_steps": 5, "max_steps": 1071, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 486995857768448.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }